gallium: Remove every double semi-colon
[mesa.git] / src / gallium / drivers / swr / rasterizer / memory / StoreTile.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.cpp
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #include "common/os.h"
29 #include "common/formats.h"
30 #include "core/context.h"
31 #include "core/rdtsc_core.h"
32 #include "core/format_conversion.h"
33
34 #include "memory/TilingFunctions.h"
35 #include "memory/tilingtraits.h"
36 #include "memory/Convert.h"
37 #include "core/multisample.h"
38
39 #include <array>
40 #include <sstream>
41
42 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
43
44 //////////////////////////////////////////////////////////////////////////
45 /// Store Raster Tile Function Tables.
46 //////////////////////////////////////////////////////////////////////////
47 static PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
48 static PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
49 static PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
50
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for writing one SIMD raster-tile out to its
///        destination rows. Store is deleted here, so only the explicit
///        <PixelSize, NumDests> specializations can actually be called.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of destination pointers. Each pointer is
///        to a single row of at most 16B.
/// @tparam PixelSize - Size of one destination pixel, in bits.
/// @tparam NumDests - Number of destination pointers. Each pair of
///        pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    // Deleted on purpose: an unsupported <PixelSize, NumDests> combination
    // must fail at compile time rather than silently store nothing.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
65
66 //////////////////////////////////////////////////////////////////////////
67 /// StorePixels (32-bit pixel specialization)
68 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
69 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
70 /// @param ppDsts - Array of destination pointers. Each pointer is
71 /// to a single row of at most 16B.
72 /// @tparam NumDests - Number of destination pointers. Each pair of
73 /// pointers is for a 16-byte column of two rows.
74 //////////////////////////////////////////////////////////////////////////
75 template <>
76 struct StorePixels<8, 2>
77 {
78 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
79 {
80 // Each 4-pixel row is 4 bytes.
81 const uint16_t* pPixSrc = (const uint16_t*)pSrc;
82
83 // Unswizzle from SWR-Z order
84 uint16_t* pRow = (uint16_t*)ppDsts[0];
85 pRow[0] = pPixSrc[0];
86 pRow[1] = pPixSrc[2];
87
88 pRow = (uint16_t*)ppDsts[1];
89 pRow[0] = pPixSrc[1];
90 pRow[1] = pPixSrc[3];
91 }
92 };
93
94 //////////////////////////////////////////////////////////////////////////
95 /// StorePixels (32-bit pixel specialization)
96 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
97 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
98 /// @param ppDsts - Array of destination pointers. Each pointer is
99 /// to a single row of at most 16B.
100 /// @tparam NumDests - Number of destination pointers. Each pair of
101 /// pointers is for a 16-byte column of two rows.
102 //////////////////////////////////////////////////////////////////////////
103 template <>
104 struct StorePixels<16, 2>
105 {
106 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
107 {
108 // Each 4-pixel row is 8 bytes.
109 const uint32_t* pPixSrc = (const uint32_t*)pSrc;
110
111 // Unswizzle from SWR-Z order
112 uint32_t* pRow = (uint32_t*)ppDsts[0];
113 pRow[0] = pPixSrc[0];
114 pRow[1] = pPixSrc[2];
115
116 pRow = (uint32_t*)ppDsts[1];
117 pRow[0] = pPixSrc[1];
118 pRow[1] = pPixSrc[3];
119 }
120 };
121
122 //////////////////////////////////////////////////////////////////////////
123 /// StorePixels (32-bit pixel specialization)
124 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
125 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
126 /// @param ppDsts - Array of destination pointers. Each pointer is
127 /// to a single row of at most 16B.
128 /// @tparam NumDests - Number of destination pointers. Each pair of
129 /// pointers is for a 16-byte column of two rows.
130 //////////////////////////////////////////////////////////////////////////
131 template <>
132 struct StorePixels<32, 2>
133 {
134 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
135 {
136 // Each 4-pixel row is 16-bytes
137 __m128i *pZRow01 = (__m128i*)pSrc;
138 __m128i vQuad00 = _mm_load_si128(pZRow01);
139 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
140
141 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
142 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
143
144 _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
145 _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
146 }
147 };
148
149 //////////////////////////////////////////////////////////////////////////
150 /// StorePixels (32-bit pixel specialization)
151 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
152 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
153 /// @param ppDsts - Array of destination pointers. Each pointer is
154 /// to a single row of at most 16B.
155 /// @tparam NumDests - Number of destination pointers. Each pair of
156 /// pointers is for a 16-byte column of two rows.
157 //////////////////////////////////////////////////////////////////////////
158 template <>
159 struct StorePixels<64, 4>
160 {
161 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
162 {
163 // Each 4-pixel row is 32 bytes.
164 const __m128i* pPixSrc = (const __m128i*)pSrc;
165
166 // order of pointers match SWR-Z layout
167 __m128i** pvDsts = (__m128i**)&ppDsts[0];
168 *pvDsts[0] = pPixSrc[0];
169 *pvDsts[1] = pPixSrc[1];
170 *pvDsts[2] = pPixSrc[2];
171 *pvDsts[3] = pPixSrc[3];
172 }
173 };
174
175 //////////////////////////////////////////////////////////////////////////
176 /// StorePixels (32-bit pixel specialization)
177 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
178 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
179 /// @param ppDsts - Array of destination pointers. Each pointer is
180 /// to a single row of at most 16B.
181 /// @tparam NumDests - Number of destination pointers. Each pair of
182 /// pointers is for a 16-byte column of two rows.
183 //////////////////////////////////////////////////////////////////////////
184 template <>
185 struct StorePixels<128, 8>
186 {
187 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
188 {
189 // Each 4-pixel row is 64 bytes.
190 const __m128i* pPixSrc = (const __m128i*)pSrc;
191
192 // Unswizzle from SWR-Z order
193 __m128i** pvDsts = (__m128i**)&ppDsts[0];
194 *pvDsts[0] = pPixSrc[0];
195 *pvDsts[1] = pPixSrc[2];
196 *pvDsts[2] = pPixSrc[1];
197 *pvDsts[3] = pPixSrc[3];
198 *pvDsts[4] = pPixSrc[4];
199 *pvDsts[5] = pPixSrc[6];
200 *pvDsts[6] = pPixSrc[5];
201 *pvDsts[7] = pPixSrc[7];
202 }
203 };
204
205 //////////////////////////////////////////////////////////////////////////
206 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
207 //////////////////////////////////////////////////////////////////////////
208 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
209 struct ConvertPixelsSOAtoAOS
210 {
211 //////////////////////////////////////////////////////////////////////////
212 /// @brief Converts a SIMD from the Hot Tile to the destination format
213 /// and converts from SOA to AOS.
214 /// @param pSrc - Pointer to raster tile.
215 /// @param pDst - Pointer to destination surface or deswizzling buffer.
216 template <size_t NumDests>
217 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
218 {
219 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
220
221 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
222 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
223
224 // Convert from SrcFormat --> DstFormat
225 simdvector src;
226 LoadSOA<SrcFormat>(pSrc, src);
227 StoreSOA<DstFormat>(src, soaTile);
228
229 // Convert from SOA --> AOS
230 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
231
232 // Store data into destination
233 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
234 }
235 };
236
237 //////////////////////////////////////////////////////////////////////////
238 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
239 /// Specialization for no format conversion
240 //////////////////////////////////////////////////////////////////////////
241 template<SWR_FORMAT Format>
242 struct ConvertPixelsSOAtoAOS<Format, Format>
243 {
244 //////////////////////////////////////////////////////////////////////////
245 /// @brief Converts a SIMD from the Hot Tile to the destination format
246 /// and converts from SOA to AOS.
247 /// @param pSrc - Pointer to raster tile.
248 /// @param pDst - Pointer to destination surface or deswizzling buffer.
249 template <size_t NumDests>
250 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
251 {
252 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
253
254 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
255
256 // Convert from SOA --> AOS
257 FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
258
259 // Store data into destination
260 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
261 }
262 };
263
264 //////////////////////////////////////////////////////////////////////////
265 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
266 //////////////////////////////////////////////////////////////////////////
267 template<>
268 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
269 {
270 //////////////////////////////////////////////////////////////////////////
271 /// @brief Converts a SIMD from the Hot Tile to the destination format
272 /// and converts from SOA to AOS.
273 /// @param pSrc - Pointer to raster tile.
274 /// @param pDst - Pointer to destination surface or deswizzling buffer.
275 template <size_t NumDests>
276 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
277 {
278 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
279 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
280 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
281
282 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
283
284 // Load hot-tile
285 simdvector src, dst;
286 LoadSOA<SrcFormat>(pSrc, src);
287
288 // deswizzle
289 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
290 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
291 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
292
293 // clamp
294 dst.x = Clamp<DstFormat>(dst.x, 0);
295 dst.y = Clamp<DstFormat>(dst.y, 1);
296 dst.z = Clamp<DstFormat>(dst.z, 2);
297
298 // normalize
299 dst.x = Normalize<DstFormat>(dst.x, 0);
300 dst.y = Normalize<DstFormat>(dst.y, 1);
301 dst.z = Normalize<DstFormat>(dst.z, 2);
302
303 // pack
304 simdscalari packed = _simd_castps_si(dst.x);
305 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
306 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
307 FormatTraits<DstFormat>::GetBPC(1)));
308
309 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
310 uint32_t *pPacked = (uint32_t*)&packed;
311 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
312 for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
313 {
314 *pAosTile++ = *pPacked++;
315 }
316
317 // Store data into destination
318 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
319 }
320 };
321
322 //////////////////////////////////////////////////////////////////////////
323 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
324 //////////////////////////////////////////////////////////////////////////
325 template<>
326 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
327 {
328 static const SWR_FORMAT SrcFormat = R32_FLOAT;
329 static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
330
331 //////////////////////////////////////////////////////////////////////////
332 /// @brief Converts a SIMD from the Hot Tile to the destination format
333 /// and converts from SOA to AOS.
334 /// @param pSrc - Pointer to raster tile.
335 /// @param pDst - Pointer to destination surface or deswizzling buffer.
336 template <size_t NumDests>
337 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
338 {
339 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
340
341 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
342 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
343
344 // Convert from SrcFormat --> DstFormat
345 simdvector src;
346 LoadSOA<SrcFormat>(pSrc, src);
347 StoreSOA<DstFormat>(src, soaTile);
348
349 // Convert from SOA --> AOS
350 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
351
352 // Store data into destination but don't overwrite the X8 bits
353 // Each 4-pixel row is 16-bytes
354 __m128i *pZRow01 = (__m128i*)aosTile;
355 __m128i vQuad00 = _mm_load_si128(pZRow01);
356 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
357
358 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
359 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
360
361 __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
362 __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
363
364 __m128i vMask = _mm_set1_epi32(0xFFFFFF);
365
366 vDst0 = _mm_andnot_si128(vMask, vDst0);
367 vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
368 vDst1 = _mm_andnot_si128(vMask, vDst1);
369 vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
370
371 _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
372 _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
373 }
374 };
375
376 template<SWR_FORMAT DstFormat>
377 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
378 {
379 static const uint32_t offset = sizeof(simdscalar);
380
381 // swizzle rgba -> bgra while we load
382 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
383 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
384 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
385 simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
386
387 // clamp
388 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
389 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
390
391 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
392 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
393
394 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
395 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
396
397 vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
398 vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
399
400 if (FormatTraits<DstFormat>::isSRGB)
401 {
402 // Gamma-correct only rgb
403 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
404 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
405 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
406 }
407
408 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
409 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
410 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
411 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
412 vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
413
414 // moving to 8 wide integer vector types
415 __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
416 __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
417 __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
418 __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
419
420 #if KNOB_ARCH == KNOB_ARCH_AVX
421
422 // splitting into two sets of 4 wide integer vector types
423 // because AVX doesn't have instructions to support this operation at 8 wide
424 __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
425 __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
426 __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
427 __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
428
429 __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
430 __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
431 __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
432 __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
433
434 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
435 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
436 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
437 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
438 srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
439 srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
440
441 srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
442 srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
443
444 srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
445 srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
446
447 srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
448 srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
449
450 // unpack into rows that get the tiling order correct
451 __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
452 __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
453
454 __m256i final = _mm256_castsi128_si256(vRow00);
455 final = _mm256_insertf128_si256(final, vRow10, 1);
456
457 #elif KNOB_ARCH == KNOB_ARCH_AVX2
458
459 // logic is as above, only wider
460 src1 = _mm256_slli_si256(src1, 1);
461 src2 = _mm256_slli_si256(src2, 2);
462 src3 = _mm256_slli_si256(src3, 3);
463
464 src0 = _mm256_or_si256(src0, src1);
465 src2 = _mm256_or_si256(src2, src3);
466
467 __m256i final = _mm256_or_si256(src0, src2);
468
469 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
470 final = _mm256_permute4x64_epi64(final, 0xD8);
471
472 #endif
473
474 _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
475 }
476
477 template<SWR_FORMAT DstFormat>
478 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
479 {
480 static const uint32_t offset = sizeof(simdscalar);
481
482 // swizzle rgba -> bgra while we load
483 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
484 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
485 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
486 // clamp
487 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
488 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
489
490 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
491 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
492
493 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
494 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
495
496 if (FormatTraits<DstFormat>::isSRGB)
497 {
498 // Gamma-correct only rgb
499 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
500 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
501 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
502 }
503
504 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
505 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
506 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
507 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
508
509 // moving to 8 wide integer vector types
510 __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
511 __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
512 __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
513
514 #if KNOB_ARCH == KNOB_ARCH_AVX
515
516 // splitting into two sets of 4 wide integer vector types
517 // because AVX doesn't have instructions to support this operation at 8 wide
518 __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
519 __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
520 __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
521
522 __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
523 __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
524 __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
525
526 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
527 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
528 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
529 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
530
531 srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
532
533 srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
534
535 srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
536 srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
537
538 // unpack into rows that get the tiling order correct
539 __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
540 __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
541
542 __m256i final = _mm256_castsi128_si256(vRow00);
543 final = _mm256_insertf128_si256(final, vRow10, 1);
544
545 #elif KNOB_ARCH == KNOB_ARCH_AVX2
546
547 // logic is as above, only wider
548 src1 = _mm256_slli_si256(src1, 1);
549 src2 = _mm256_slli_si256(src2, 2);
550
551 src0 = _mm256_or_si256(src0, src1);
552
553 __m256i final = _mm256_or_si256(src0, src2);
554
555 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
556 final = _mm256_permute4x64_epi64(final, 0xD8);
557
558 #endif
559
560 _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
561 }
562
563 template<>
564 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
565 {
566 template <size_t NumDests>
567 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
568 {
569 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
570 }
571 };
572
573 template<>
574 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
575 {
576 template <size_t NumDests>
577 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
578 {
579 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
580 }
581 };
582
583 template<>
584 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
585 {
586 template <size_t NumDests>
587 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
588 {
589 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
590 }
591 };
592
593 template<>
594 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
595 {
596 template <size_t NumDests>
597 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
598 {
599 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
600 }
601 };
602
603 template<>
604 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
605 {
606 template <size_t NumDests>
607 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
608 {
609 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
610 }
611 };
612
613 template<>
614 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
615 {
616 template <size_t NumDests>
617 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
618 {
619 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
620 }
621 };
622
623 template<>
624 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
625 {
626 template <size_t NumDests>
627 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
628 {
629 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
630 }
631 };
632
633 template<>
634 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
635 {
636 template <size_t NumDests>
637 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
638 {
639 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
640 }
641 };
642
643 //////////////////////////////////////////////////////////////////////////
644 /// StoreRasterTile
645 //////////////////////////////////////////////////////////////////////////
646 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
647 struct StoreRasterTile
648 {
649 //////////////////////////////////////////////////////////////////////////
650 /// @brief Retrieve color from hot tile source which is always float.
651 /// @param pSrc - Pointer to raster tile.
652 /// @param x, y - Coordinates to raster tile.
653 /// @param output - output color
654 INLINE static void GetSwizzledSrcColor(
655 uint8_t* pSrc,
656 uint32_t x, uint32_t y,
657 float outputColor[4])
658 {
659 typedef SimdTile<SrcFormat, DstFormat> SimdT;
660
661 SimdT* pSrcSimdTiles = (SimdT*)pSrc;
662
663 // Compute which simd tile we're accessing within 8x8 tile.
664 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
665 uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
666
667 SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
668
669 uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
670
671 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
672 }
673
674 //////////////////////////////////////////////////////////////////////////
675 /// @brief Stores an 8x8 raster tile to the destination surface.
676 /// @param pSrc - Pointer to raster tile.
677 /// @param pDstSurface - Destination surface state
678 /// @param x, y - Coordinates to raster tile.
679 INLINE static void Store(
680 uint8_t *pSrc,
681 SWR_SURFACE_STATE* pDstSurface,
682 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
683 {
684 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
685 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
686
687 // For each raster tile pixel (rx, ry)
688 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
689 {
690 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
691 {
692 // Perform bounds checking.
693 if (((x + rx) < lodWidth) &&
694 ((y + ry) < lodHeight))
695 {
696 float srcColor[4];
697 GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
698
699 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false>((x + rx), (y + ry),
700 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
701 sampleNum, pDstSurface->lod, pDstSurface);
702 ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
703 }
704 }
705 }
706 }
707 };
708
709 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
710 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
711 {};
712
713 //////////////////////////////////////////////////////////////////////////
714 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
715 //////////////////////////////////////////////////////////////////////////
716 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
717 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat >
718 {
719 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
720 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
721 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
722
723 //////////////////////////////////////////////////////////////////////////
724 /// @brief Stores an 8x8 raster tile to the destination surface.
725 /// @param pSrc - Pointer to raster tile.
726 /// @param pDstSurface - Destination surface state
727 /// @param x, y - Coordinates to raster tile.
728 INLINE static void Store(
729 uint8_t *pSrc,
730 SWR_SURFACE_STATE* pDstSurface,
731 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
732 {
733 // Punt non-full tiles to generic store
734 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
735 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
736 if (x + KNOB_TILE_X_DIM > lodWidth ||
737 y + KNOB_TILE_Y_DIM > lodHeight)
738 {
739 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
740 }
741
742 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
743 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
744 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
745
746 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
747 {
748 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
749
750 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
751 {
752 // Format conversion and convert from SOA to AOS, and store the rows.
753 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
754
755 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
756 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
757 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
758 }
759
760 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
761 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
762 }
763 }
764 };
765
766 //////////////////////////////////////////////////////////////////////////
767 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
768 //////////////////////////////////////////////////////////////////////////
769 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
770 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat >
771 {
772 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
773 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
774 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
775
776 //////////////////////////////////////////////////////////////////////////
777 /// @brief Stores an 8x8 raster tile to the destination surface.
778 /// @param pSrc - Pointer to raster tile.
779 /// @param pDstSurface - Destination surface state
780 /// @param x, y - Coordinates to raster tile.
781 INLINE static void Store(
782 uint8_t *pSrc,
783 SWR_SURFACE_STATE* pDstSurface,
784 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
785 {
786 // Punt non-full tiles to generic store
787 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
788 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
789 if (x + KNOB_TILE_X_DIM > lodWidth ||
790 y + KNOB_TILE_Y_DIM > lodHeight)
791 {
792 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
793 }
794
795 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
796 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
797 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
798
799 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
800 {
801 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
802
803 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
804 {
805 // Format conversion and convert from SOA to AOS, and store the rows.
806 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
807
808 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
809 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
810 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
811 }
812
813 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
814 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
815 }
816 }
817 };
818
819 //////////////////////////////////////////////////////////////////////////
820 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
821 //////////////////////////////////////////////////////////////////////////
822 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
823 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat >
824 {
825 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
826 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
827 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
828
829 //////////////////////////////////////////////////////////////////////////
830 /// @brief Stores an 8x8 raster tile to the destination surface.
831 /// @param pSrc - Pointer to raster tile.
832 /// @param pDstSurface - Destination surface state
833 /// @param x, y - Coordinates to raster tile.
834 INLINE static void Store(
835 uint8_t *pSrc,
836 SWR_SURFACE_STATE* pDstSurface,
837 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
838 {
839 // Punt non-full tiles to generic store
840 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
841 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
842 if (x + KNOB_TILE_X_DIM > lodWidth ||
843 y + KNOB_TILE_Y_DIM > lodHeight)
844 {
845 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
846 }
847
848 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
849 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
850 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
851
852 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
853 {
854 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
855
856 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
857 {
858 // Format conversion and convert from SOA to AOS, and store the rows.
859 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
860
861 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
862 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
863 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
864 }
865
866 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
867 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
868 }
869 }
870 };
871
872 //////////////////////////////////////////////////////////////////////////
873 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
874 //////////////////////////////////////////////////////////////////////////
875 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
876 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat >
877 {
878 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
879 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
880 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
881 static const size_t MAX_DST_COLUMN_BYTES = 16;
882 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
883 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
884
885 //////////////////////////////////////////////////////////////////////////
886 /// @brief Stores an 8x8 raster tile to the destination surface.
887 /// @param pSrc - Pointer to raster tile.
888 /// @param pDstSurface - Destination surface state
889 /// @param x, y - Coordinates to raster tile.
890 INLINE static void Store(
891 uint8_t *pSrc,
892 SWR_SURFACE_STATE* pDstSurface,
893 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
894 {
895 // Punt non-full tiles to generic store
896 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
897 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
898 if (x + KNOB_TILE_X_DIM > lodWidth ||
899 y + KNOB_TILE_Y_DIM > lodHeight)
900 {
901 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
902 }
903
904 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
905 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
906 uint8_t* ppDsts[] =
907 {
908 pDst, // row 0, col 0
909 pDst + pDstSurface->pitch, // row 1, col 0
910 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
911 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
912 };
913
914 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
915 {
916 uint8_t* ppStartRows[] =
917 {
918 ppDsts[0],
919 ppDsts[1],
920 ppDsts[2],
921 ppDsts[3],
922 };
923
924 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
925 {
926 // Format conversion and convert from SOA to AOS, and store the rows.
927 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
928
929 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
930 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
931 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
932 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
933 pSrc += SRC_COLUMN_BYTES;
934 }
935
936 ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
937 ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
938 ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
939 ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
940 }
941 }
942 };
943
944 //////////////////////////////////////////////////////////////////////////
945 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
946 //////////////////////////////////////////////////////////////////////////
947 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
948 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat >
949 {
950 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
951 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
952 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
953 static const size_t MAX_DST_COLUMN_BYTES = 16;
954 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
955 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
956
957 //////////////////////////////////////////////////////////////////////////
958 /// @brief Stores an 8x8 raster tile to the destination surface.
959 /// @param pSrc - Pointer to raster tile.
960 /// @param pDstSurface - Destination surface state
961 /// @param x, y - Coordinates to raster tile.
962 INLINE static void Store(
963 uint8_t *pSrc,
964 SWR_SURFACE_STATE* pDstSurface,
965 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
966 {
967 // Punt non-full tiles to generic store
968 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
969 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
970 if (x + KNOB_TILE_X_DIM > lodWidth ||
971 y + KNOB_TILE_Y_DIM > lodHeight)
972 {
973 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
974 }
975
976 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
977 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
978 struct DstPtrs
979 {
980 uint8_t* ppDsts[8];
981 } ptrs;
982
983 // Need 8 pointers, 4 columns of 2 rows each
984 for (uint32_t y = 0; y < 2; ++y)
985 {
986 for (uint32_t x = 0; x < 4; ++x)
987 {
988 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
989 }
990 }
991
992 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
993 {
994 DstPtrs startPtrs = ptrs;
995
996 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
997 {
998 // Format conversion and convert from SOA to AOS, and store the rows.
999 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1000
1001 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1002 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1003 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1004 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1005 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1006 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1007 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1008 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1009 pSrc += SRC_COLUMN_BYTES;
1010 }
1011
1012 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
1013 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
1014 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
1015 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
1016 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
1017 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
1018 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
1019 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
1020 }
1021 }
1022 };
1023
1024 //////////////////////////////////////////////////////////////////////////
1025 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1026 //////////////////////////////////////////////////////////////////////////
1027 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1028 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat >
1029 {
1030 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1031
1032 //////////////////////////////////////////////////////////////////////////
1033 /// @brief Stores an 8x8 raster tile to the destination surface.
1034 /// @param pSrc - Pointer to raster tile.
1035 /// @param pDstSurface - Destination surface state
1036 /// @param x, y - Coordinates to raster tile.
1037 INLINE static void Store(
1038 uint8_t *pSrc,
1039 SWR_SURFACE_STATE* pDstSurface,
1040 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1041 {
1042 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1043
1044 // Punt non-full tiles to generic store
1045 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1046 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1047 if (x + KNOB_TILE_X_DIM > lodWidth ||
1048 y + KNOB_TILE_Y_DIM > lodHeight)
1049 {
1050 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1051 }
1052
1053 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1054 // We can compute the offsets to each column within the raster tile once and increment from these.
1055 // There will be 2 x 4-wide columns in an 8x8 raster tile.
1056 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1057 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1058
1059 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1060 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1061
1062 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1063 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1064 {
1065 uint32_t rowOffset = row * DestRowWidthBytes;
1066
1067 uint8_t* pRow = pCol0 + rowOffset;
1068 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1069
1070 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1071 pSrc += pSrcInc;
1072
1073 ppDsts[0] += DestRowWidthBytes / 4;
1074 ppDsts[1] += DestRowWidthBytes / 4;
1075
1076 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1077 pSrc += pSrcInc;
1078 }
1079 }
1080 };
1081
1082 //////////////////////////////////////////////////////////////////////////
1083 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1084 //////////////////////////////////////////////////////////////////////////
1085 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1086 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat >
1087 {
1088 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1089
1090 //////////////////////////////////////////////////////////////////////////
1091 /// @brief Stores an 8x8 raster tile to the destination surface.
1092 /// @param pSrc - Pointer to raster tile.
1093 /// @param pDstSurface - Destination surface state
1094 /// @param x, y - Coordinates to raster tile.
1095 INLINE static void Store(
1096 uint8_t *pSrc,
1097 SWR_SURFACE_STATE* pDstSurface,
1098 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1099 {
1100 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1101
1102 // Punt non-full tiles to generic store
1103 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1104 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1105 if (x + KNOB_TILE_X_DIM > lodWidth ||
1106 y + KNOB_TILE_Y_DIM > lodHeight)
1107 {
1108 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1109 }
1110
1111 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1112 // We can compute the offsets to each column within the raster tile once and increment from these.
1113 // There will be 2 x 4-wide columns in an 8x8 raster tile.
1114 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1115 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1116
1117 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1118 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1119
1120 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1121 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1122 {
1123 uint32_t rowOffset = row * DestRowWidthBytes;
1124
1125 uint8_t* pRow = pCol0 + rowOffset;
1126 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1127
1128 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1129 pSrc += pSrcInc;
1130
1131 ppDsts[0] += DestRowWidthBytes / 2;
1132 ppDsts[1] += DestRowWidthBytes / 2;
1133
1134 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1135 pSrc += pSrcInc;
1136 }
1137 }
1138 };
1139
1140 //////////////////////////////////////////////////////////////////////////
1141 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1142 //////////////////////////////////////////////////////////////////////////
1143 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1144 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat >
1145 {
1146 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1147
1148 //////////////////////////////////////////////////////////////////////////
1149 /// @brief Stores an 8x8 raster tile to the destination surface.
1150 /// @param pSrc - Pointer to raster tile.
1151 /// @param pDstSurface - Destination surface state
1152 /// @param x, y - Coordinates to raster tile.
1153 INLINE static void Store(
1154 uint8_t *pSrc,
1155 SWR_SURFACE_STATE* pDstSurface,
1156 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1157 {
1158 static const uint32_t DestRowWidthBytes = 512; // 512B rows
1159
1160 // Punt non-full tiles to generic store
1161 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1162 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1163 if (x + KNOB_TILE_X_DIM > lodWidth ||
1164 y + KNOB_TILE_Y_DIM > lodHeight)
1165 {
1166 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1167 }
1168
1169 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1170 // We can compute the offsets to each column within the raster tile once and increment from these.
1171 uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1172 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1173 uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
1174
1175 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1176 {
1177 for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
1178 {
1179 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
1180
1181 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
1182 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1183
1184 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1185 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1186 }
1187
1188 pRow0 += (DestRowWidthBytes * 2);
1189 pRow1 += (DestRowWidthBytes * 2);
1190 }
1191 }
1192 };
1193
1194 //////////////////////////////////////////////////////////////////////////
1195 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1196 //////////////////////////////////////////////////////////////////////////
1197 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1198 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat >
1199 {
1200 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1201
1202 //////////////////////////////////////////////////////////////////////////
1203 /// @brief Stores an 8x8 raster tile to the destination surface.
1204 /// @param pSrc - Pointer to raster tile.
1205 /// @param pDstSurface - Destination surface state
1206 /// @param x, y - Coordinates to raster tile.
1207 INLINE static void Store(
1208 uint8_t *pSrc,
1209 SWR_SURFACE_STATE* pDstSurface,
1210 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1211 {
1212 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1213 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1214
1215 // Punt non-full tiles to generic store
1216 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1217 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1218 if (x + KNOB_TILE_X_DIM > lodWidth ||
1219 y + KNOB_TILE_Y_DIM > lodHeight)
1220 {
1221 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1222 }
1223
1224 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1225 // We can compute the offsets to each column within the raster tile once and increment from these.
1226 // There will be 2 x 4-wide columns in an 8x8 raster tile.
1227 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1228 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1229
1230 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1231 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1232
1233 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1234 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1235 {
1236 uint32_t rowOffset = row * DestRowWidthBytes;
1237
1238 uint8_t* pRow = pCol0 + rowOffset;
1239 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1240
1241 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1242 pSrc += pSrcInc;
1243
1244 ppDsts[0] += DestColumnBytes;
1245 ppDsts[1] += DestColumnBytes;
1246
1247 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1248 pSrc += pSrcInc;
1249 }
1250 }
1251 };
1252
1253 //////////////////////////////////////////////////////////////////////////
1254 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
1255 //////////////////////////////////////////////////////////////////////////
1256 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1257 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat >
1258 {
1259 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
1260
1261 //////////////////////////////////////////////////////////////////////////
1262 /// @brief Stores an 8x8 raster tile to the destination surface.
1263 /// @param pSrc - Pointer to raster tile.
1264 /// @param pDstSurface - Destination surface state
1265 /// @param x, y - Coordinates to raster tile.
1266 INLINE static void Store(
1267 uint8_t *pSrc,
1268 SWR_SURFACE_STATE* pDstSurface,
1269 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1270 {
1271 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1272 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1273
1274 // Punt non-full tiles to generic store
1275 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1276 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1277 if (x + KNOB_TILE_X_DIM > lodWidth ||
1278 y + KNOB_TILE_Y_DIM > lodHeight)
1279 {
1280 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1281 }
1282
1283 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1284 // We can compute the offsets to each column within the raster tile once and increment from these.
1285 // There will be 2 x 4-wide columns in an 8x8 raster tile.
1286 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1287 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1288 uint8_t* pCol1 = pCol0 + DestColumnBytes;
1289
1290 // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
1291 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1292 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1293
1294 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1295 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1296 {
1297 uint32_t rowOffset = row * DestRowWidthBytes;
1298 uint8_t* ppDsts[] =
1299 {
1300 pCol0 + rowOffset,
1301 pCol0 + rowOffset + DestRowWidthBytes,
1302 pCol1 + rowOffset,
1303 pCol1 + rowOffset + DestRowWidthBytes,
1304 };
1305
1306 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1307 pSrc += pSrcInc;
1308
1309 ppDsts[0] += DestColumnBytes * 2;
1310 ppDsts[1] += DestColumnBytes * 2;
1311 ppDsts[2] += DestColumnBytes * 2;
1312 ppDsts[3] += DestColumnBytes * 2;
1313
1314 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1315 pSrc += pSrcInc;
1316 }
1317 }
1318 };
1319
1320 //////////////////////////////////////////////////////////////////////////
1321 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
1322 //////////////////////////////////////////////////////////////////////////
1323 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1324 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat >
1325 {
1326 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1327
1328 static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
1329 static const size_t TILE_Y_ROWS = 32;
1330 static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
1331
1332 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1333 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1334 static const size_t MAX_DST_COLUMN_BYTES = 16;
1335
1336 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1337 static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
1338
1339 //////////////////////////////////////////////////////////////////////////
1340 /// @brief Stores an 8x8 raster tile to the destination surface.
1341 /// @param pSrc - Pointer to raster tile.
1342 /// @param pDstSurface - Destination surface state
1343 /// @param x, y - Coordinates to raster tile.
1344 INLINE static void Store(
1345 uint8_t *pSrc,
1346 SWR_SURFACE_STATE* pDstSurface,
1347 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1348 {
1349 // Punt non-full tiles to generic store
1350 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1351 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1352 if (x + KNOB_TILE_X_DIM > lodWidth ||
1353 y + KNOB_TILE_Y_DIM > lodHeight)
1354 {
1355 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1356 }
1357
1358 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1359 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1360 struct DstPtrs
1361 {
1362 uint8_t* ppDsts[8];
1363 } ptrs;
1364
1365 // Need 8 pointers, 4 columns of 2 rows each
1366 for (uint32_t y = 0; y < 2; ++y)
1367 {
1368 for (uint32_t x = 0; x < 4; ++x)
1369 {
1370 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
1371 }
1372 }
1373
1374 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1375 {
1376 DstPtrs startPtrs = ptrs;
1377
1378 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1379 {
1380 // Format conversion and convert from SOA to AOS, and store the rows.
1381 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1382
1383 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1384 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1385 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1386 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1387 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1388 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1389 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1390 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1391 pSrc += SRC_COLUMN_BYTES;
1392 }
1393
1394 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
1395 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
1396 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
1397 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
1398 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
1399 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
1400 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
1401 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
1402 }
1403 }
1404 };
1405
1406 //////////////////////////////////////////////////////////////////////////
1407 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
1408 //////////////////////////////////////////////////////////////////////////
1409 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1410 struct StoreMacroTile
1411 {
1412 //////////////////////////////////////////////////////////////////////////
1413 /// @brief Stores a macrotile to the destination surface using safe implementation.
1414 /// @param pSrc - Pointer to macro tile.
1415 /// @param pDstSurface - Destination surface state
1416 /// @param x, y - Coordinates to macro tile
1417 static void StoreGeneric(
1418 uint8_t *pSrcHotTile,
1419 SWR_SURFACE_STATE* pDstSurface,
1420 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1421 {
1422 // Store each raster tile from the hot tile to the destination surface.
1423 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1424 {
1425 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1426 {
1427 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1428 {
1429 StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store (pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum,
1430 renderTargetArrayIndex);
1431 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1432 }
1433 }
1434 }
1435 }
1436
1437 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
1438 //////////////////////////////////////////////////////////////////////////
1439 /// @brief Stores a macrotile to the destination surface.
1440 /// @param pSrc - Pointer to macro tile.
1441 /// @param pDstSurface - Destination surface state
1442 /// @param x, y - Coordinates to macro tile
1443 static void Store(
1444 uint8_t *pSrcHotTile,
1445 SWR_SURFACE_STATE* pDstSurface,
1446 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1447 {
1448 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
1449 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1450 {
1451 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false>(
1452 0,
1453 0,
1454 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
1455 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
1456 sampleNum,
1457 pDstSurface->lod,
1458 pDstSurface);
1459
1460 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
1461 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || (pDstSurface->bInterleavedSamples);
1462
1463 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
1464 }
1465
1466 // Store each raster tile from the hot tile to the destination surface.
1467 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1468 {
1469 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1470 {
1471 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1472 {
1473 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
1474 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1475 }
1476 }
1477 }
1478 }
1479 };
1480
1481 static void BUCKETS_START(UINT id)
1482 {
1483 #ifdef KNOB_ENABLE_RDTSC
1484 gBucketMgr.StartBucket(id);
1485 #endif
1486 }
1487
1488 static void BUCKETS_STOP(UINT id)
1489 {
1490 #ifdef KNOB_ENABLE_RDTSC
1491 gBucketMgr.StopBucket(id);
1492 #endif
1493 }
1494
1495 // on demand buckets for store tiles
1496 static std::mutex sBucketMutex;
1497 static std::vector<int32_t> sBuckets(NUM_SWR_FORMATS, -1);
1498
1499 //////////////////////////////////////////////////////////////////////////
1500 /// @brief Deswizzles and stores a full hottile to a render surface
1501 /// @param hPrivateContext - Handle to private DC
1502 /// @param srcFormat - Format for hot tile.
1503 /// @param renderTargetIndex - Index to destination render target
1504 /// @param x, y - Coordinates to raster tile.
1505 /// @param pSrcHotTile - Pointer to Hot Tile
1506 void StoreHotTile(
1507 SWR_SURFACE_STATE *pDstSurface,
1508 SWR_FORMAT srcFormat,
1509 SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
1510 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
1511 uint8_t *pSrcHotTile)
1512 {
1513 if (pDstSurface->type == SURFACE_NULL)
1514 {
1515 return;
1516 }
1517
1518 // force 0 if requested renderTargetArrayIndex is OOB
1519 if (renderTargetArrayIndex >= pDstSurface->depth)
1520 {
1521 renderTargetArrayIndex = 0;
1522 }
1523
1524 PFN_STORE_TILES pfnStoreTiles = nullptr;
1525
1526 if ((renderTargetIndex <= SWR_ATTACHMENT_COLOR7) && (pDstSurface->tileMode != SWR_TILE_MODE_WMAJOR))
1527 {
1528 pfnStoreTiles = sStoreTilesTableColor[pDstSurface->tileMode][pDstSurface->format];
1529 }
1530 else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH)
1531 {
1532 pfnStoreTiles = sStoreTilesTableDepth[pDstSurface->tileMode][pDstSurface->format];
1533 }
1534 else
1535 {
1536 pfnStoreTiles = sStoreTilesTableStencil[pDstSurface->tileMode][pDstSurface->format];
1537 }
1538
1539 if(nullptr == pfnStoreTiles)
1540 {
1541 SWR_ASSERT(false, "Invalid pixel format / tile mode for store tiles");
1542 return;
1543 }
1544
1545 // Store a macro tile
1546 #ifdef KNOB_ENABLE_RDTSC
1547 if (sBuckets[pDstSurface->format] == -1)
1548 {
1549 // guard sBuckets update since storetiles is called by multiple threads
1550 sBucketMutex.lock();
1551 if (sBuckets[pDstSurface->format] == -1)
1552 {
1553 const SWR_FORMAT_INFO& info = GetFormatInfo(pDstSurface->format);
1554 BUCKET_DESC desc{info.name, "", false, 0xffffffff};
1555 sBuckets[pDstSurface->format] = gBucketMgr.RegisterBucket(desc);
1556 }
1557 sBucketMutex.unlock();
1558 }
1559 #endif
1560
1561 BUCKETS_START(sBuckets[pDstSurface->format]);
1562 pfnStoreTiles(pSrcHotTile, pDstSurface, x, y, renderTargetArrayIndex);
1563 BUCKETS_STOP(sBuckets[pDstSurface->format]);
1564 }
1565
1566 //////////////////////////////////////////////////////////////////////////
1567 /// InitStoreTilesTable - Helper for setting up the tables.
1568 template <SWR_TILE_MODE TileModeT, size_t NumTileModesT, size_t ArraySizeT>
1569 void InitStoreTilesTableColor(
1570 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
1571 {
1572 table[TileModeT][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
1573 table[TileModeT][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
1574 table[TileModeT][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
1575 table[TileModeT][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
1576 table[TileModeT][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
1577 table[TileModeT][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
1578 table[TileModeT][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
1579 table[TileModeT][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
1580 table[TileModeT][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
1581 table[TileModeT][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
1582 table[TileModeT][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
1583 table[TileModeT][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
1584 table[TileModeT][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
1585 table[TileModeT][R32G32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
1586 table[TileModeT][R32G32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
1587 table[TileModeT][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
1588 table[TileModeT][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
1589 table[TileModeT][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
1590 table[TileModeT][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
1591
1592 // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
1593 table[TileModeT][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
1594 table[TileModeT][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
1595 table[TileModeT][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
1596
1597 table[TileModeT][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
1598 table[TileModeT][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
1599 table[TileModeT][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
1600 table[TileModeT][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
1601 table[TileModeT][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
1602 table[TileModeT][R16G16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
1603 table[TileModeT][R16G16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
1604 table[TileModeT][R16G16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
1605 table[TileModeT][R16G16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
1606 table[TileModeT][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
1607
1608 // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
1609 table[TileModeT][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
1610 table[TileModeT][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
1611 table[TileModeT][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
1612
1613 table[TileModeT][R32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
1614 table[TileModeT][R32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
1615 table[TileModeT][R32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
1616 table[TileModeT][A32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
1617 table[TileModeT][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
1618 table[TileModeT][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
1619 table[TileModeT][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
1620 table[TileModeT][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
1621
1622 // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
1623 table[TileModeT][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
1624 table[TileModeT][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
1625 table[TileModeT][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
1626 table[TileModeT][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
1627 table[TileModeT][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
1628 table[TileModeT][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
1629 table[TileModeT][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
1630
1631 table[TileModeT][R8G8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
1632 table[TileModeT][R8G8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
1633 table[TileModeT][R8G8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
1634 table[TileModeT][R8G8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
1635 table[TileModeT][R16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
1636 table[TileModeT][R16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
1637 table[TileModeT][R16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
1638 table[TileModeT][R16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
1639 table[TileModeT][R16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
1640 table[TileModeT][A16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
1641 table[TileModeT][A16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
1642
1643 // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
1644 table[TileModeT][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
1645 table[TileModeT][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
1646
1647 table[TileModeT][R8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
1648 table[TileModeT][R8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
1649 table[TileModeT][R8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
1650 table[TileModeT][R8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
1651 table[TileModeT][A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
1652 table[TileModeT][BC1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC1_UNORM>::Store;
1653 table[TileModeT][BC2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC2_UNORM>::Store;
1654 table[TileModeT][BC3_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC3_UNORM>::Store;
1655 table[TileModeT][BC4_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC4_UNORM>::Store;
1656 table[TileModeT][BC5_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC5_UNORM>::Store;
1657 table[TileModeT][BC1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::Store;
1658 table[TileModeT][BC2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::Store;
1659 table[TileModeT][BC3_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::Store;
1660 table[TileModeT][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
1661 table[TileModeT][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
1662 table[TileModeT][BC4_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC4_SNORM>::Store;
1663 table[TileModeT][BC5_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC5_SNORM>::Store;
1664 table[TileModeT][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
1665 table[TileModeT][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
1666 table[TileModeT][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
1667 table[TileModeT][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
1668 table[TileModeT][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
1669 table[TileModeT][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
1670
1671 // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
1672 table[TileModeT][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
1673 table[TileModeT][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
1674 table[TileModeT][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
1675 table[TileModeT][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
1676 table[TileModeT][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
1677
1678 table[TileModeT][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
1679 table[TileModeT][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
1680 }
1681
1682 //////////////////////////////////////////////////////////////////////////
1683 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
1684 template <SWR_TILE_MODE TileModeT, size_t NumTileModes, size_t ArraySizeT>
1685 void InitStoreTilesTableDepth(
1686 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
1687 {
1688 table[TileModeT][R32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32_FLOAT, R32_FLOAT>::Store;
1689 table[TileModeT][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
1690 table[TileModeT][R16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32_FLOAT, R16_UNORM>::Store;
1691 }
1692
1693 template <SWR_TILE_MODE TileModeT, size_t NumTileModes, size_t ArraySizeT>
1694 void InitStoreTilesTableStencil(
1695 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
1696 {
1697 table[TileModeT][R8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R8_UINT, R8_UINT>::Store;
1698 }
1699
1700 //////////////////////////////////////////////////////////////////////////
1701 /// @brief Sets up tables for StoreTile
1702 void InitSimStoreTilesTable()
1703 {
1704 memset(sStoreTilesTableColor, 0, sizeof(sStoreTilesTableColor));
1705 memset(sStoreTilesTableDepth, 0, sizeof(sStoreTilesTableDepth));
1706
1707 InitStoreTilesTableColor<SWR_TILE_NONE>(sStoreTilesTableColor);
1708 InitStoreTilesTableDepth<SWR_TILE_NONE>(sStoreTilesTableDepth);
1709 InitStoreTilesTableStencil<SWR_TILE_NONE>(sStoreTilesTableStencil);
1710
1711 InitStoreTilesTableColor<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableColor);
1712 InitStoreTilesTableColor<SWR_TILE_MODE_XMAJOR>(sStoreTilesTableColor);
1713
1714 InitStoreTilesTableDepth<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableDepth);
1715 InitStoreTilesTableStencil<SWR_TILE_MODE_WMAJOR>(sStoreTilesTableStencil);
1716 }