swr/rast: AVX512 support compiled in by default
[mesa.git] / src / gallium / drivers / swr / rasterizer / memory / StoreTile.h
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.h
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
35
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "core/multisample.h"
39
40 #include <array>
41 #include <sstream>
42
43 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
44
45 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
46 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
47
48 //////////////////////////////////////////////////////////////////////////
49 /// Store Raster Tile Function Tables.
50 //////////////////////////////////////////////////////////////////////////
51 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
52 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
53 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
54
55 void InitStoreTilesTable_Linear_1();
56 void InitStoreTilesTable_Linear_2();
57 void InitStoreTilesTable_TileX_1();
58 void InitStoreTilesTable_TileX_2();
59 void InitStoreTilesTable_TileY_1();
60 void InitStoreTilesTable_TileY_2();
61 void InitStoreTilesTable_TileW();
62 void InitStoreTilesTable();
63
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for writing one AOS raster tile, given in SWRZ
///        pixel order, to a set of destination pointers. Store() is
///        deleted here, so only the explicit specializations can be
///        called; any other (PixelSize, NumDests) pair fails to compile.
/// @tparam PixelSize - Bits per pixel of the destination format.
/// @tparam NumDests  - Number of destination pointers.
/// @param pSrc   - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of destination pointers.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
78
79 //////////////////////////////////////////////////////////////////////////
80 /// StorePixels (32-bit pixel specialization)
81 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
82 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
83 /// @param ppDsts - Array of destination pointers. Each pointer is
84 /// to a single row of at most 16B.
85 /// @tparam NumDests - Number of destination pointers. Each pair of
86 /// pointers is for a 16-byte column of two rows.
87 //////////////////////////////////////////////////////////////////////////
88 template <>
89 struct StorePixels<8, 2>
90 {
91 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
92 {
93 // Each 4-pixel row is 4 bytes.
94 const uint16_t* pPixSrc = (const uint16_t*)pSrc;
95
96 // Unswizzle from SWR-Z order
97 uint16_t* pRow = (uint16_t*)ppDsts[0];
98 pRow[0] = pPixSrc[0];
99 pRow[1] = pPixSrc[2];
100
101 pRow = (uint16_t*)ppDsts[1];
102 pRow[0] = pPixSrc[1];
103 pRow[1] = pPixSrc[3];
104 }
105 };
106
107 template <>
108 struct StorePixels<8, 4>
109 {
110 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
111 {
112 // 8 x 2 bytes = 16 bytes, 16 pixels
113 const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
114
115 uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
116
117 // Unswizzle from SWR-Z order
118 ppDsts16[0][0] = pSrc16[0]; // 0 1
119 ppDsts16[0][1] = pSrc16[2]; // 4 5
120
121 ppDsts16[1][0] = pSrc16[1]; // 2 3
122 ppDsts16[1][1] = pSrc16[3]; // 6 7
123
124 ppDsts16[2][0] = pSrc16[4]; // 8 9
125 ppDsts16[2][1] = pSrc16[6]; // C D
126
127 ppDsts16[3][0] = pSrc16[5]; // A B
128 ppDsts16[3][1] = pSrc16[7]; // E F
129 }
130 };
131
132 //////////////////////////////////////////////////////////////////////////
133 /// StorePixels (32-bit pixel specialization)
134 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
135 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
136 /// @param ppDsts - Array of destination pointers. Each pointer is
137 /// to a single row of at most 16B.
138 /// @tparam NumDests - Number of destination pointers. Each pair of
139 /// pointers is for a 16-byte column of two rows.
140 //////////////////////////////////////////////////////////////////////////
141 template <>
142 struct StorePixels<16, 2>
143 {
144 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
145 {
146 // Each 4-pixel row is 8 bytes.
147 const uint32_t* pPixSrc = (const uint32_t*)pSrc;
148
149 // Unswizzle from SWR-Z order
150 uint32_t* pRow = (uint32_t*)ppDsts[0];
151 pRow[0] = pPixSrc[0];
152 pRow[1] = pPixSrc[2];
153
154 pRow = (uint32_t*)ppDsts[1];
155 pRow[0] = pPixSrc[1];
156 pRow[1] = pPixSrc[3];
157 }
158 };
159
160 template <>
161 struct StorePixels<16, 4>
162 {
163 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
164 {
165 // 8 x 4 bytes = 32 bytes, 16 pixels
166 const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
167
168 uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
169
170 // Unswizzle from SWR-Z order
171 ppDsts32[0][0] = pSrc32[0]; // 0 1
172 ppDsts32[0][1] = pSrc32[2]; // 4 5
173
174 ppDsts32[1][0] = pSrc32[1]; // 2 3
175 ppDsts32[1][1] = pSrc32[3]; // 6 7
176
177 ppDsts32[2][0] = pSrc32[4]; // 8 9
178 ppDsts32[2][1] = pSrc32[6]; // C D
179
180 ppDsts32[3][0] = pSrc32[5]; // A B
181 ppDsts32[3][1] = pSrc32[7]; // E F
182 }
183 };
184
185 //////////////////////////////////////////////////////////////////////////
186 /// StorePixels (32-bit pixel specialization)
187 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
188 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
189 /// @param ppDsts - Array of destination pointers. Each pointer is
190 /// to a single row of at most 16B.
191 /// @tparam NumDests - Number of destination pointers. Each pair of
192 /// pointers is for a 16-byte column of two rows.
193 //////////////////////////////////////////////////////////////////////////
194 template <>
195 struct StorePixels<32, 2>
196 {
197 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
198 {
199 // Each 4-pixel row is 16-bytes
200 simd4scalari *pZRow01 = (simd4scalari*)pSrc;
201 simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
202 simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
203
204 simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
205 simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
206
207 SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
208 SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
209 }
210 };
211
212 template <>
213 struct StorePixels<32, 4>
214 {
215 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
216 {
217 // 4 x 16 bytes = 64 bytes, 16 pixels
218 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
219
220 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
221
222 // Unswizzle from SWR-Z order
223 simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]); // 0 1 2 3
224 simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]); // 4 5 6 7
225 simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]); // 8 9 A B
226 simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]); // C D E F
227
228 SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1)); // 0 1 4 5
229 SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1)); // 2 3 6 7
230 SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3)); // 8 9 C D
231 SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3)); // A B E F
232 }
233 };
234
235 //////////////////////////////////////////////////////////////////////////
236 /// StorePixels (32-bit pixel specialization)
237 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
238 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
239 /// @param ppDsts - Array of destination pointers. Each pointer is
240 /// to a single row of at most 16B.
241 /// @tparam NumDests - Number of destination pointers. Each pair of
242 /// pointers is for a 16-byte column of two rows.
243 //////////////////////////////////////////////////////////////////////////
244 template <>
245 struct StorePixels<64, 4>
246 {
247 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
248 {
249 // Each 4-pixel row is 32 bytes.
250 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
251
252 // order of pointers match SWR-Z layout
253 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
254 *pvDsts[0] = pPixSrc[0];
255 *pvDsts[1] = pPixSrc[1];
256 *pvDsts[2] = pPixSrc[2];
257 *pvDsts[3] = pPixSrc[3];
258 }
259 };
260
261 template <>
262 struct StorePixels<64, 8>
263 {
264 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
265 {
266 // 8 x 16 bytes = 128 bytes, 16 pixels
267 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
268
269 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
270
271 // order of pointers match SWR-Z layout
272 *ppDsts128[0] = pSrc128[0]; // 0 1
273 *ppDsts128[1] = pSrc128[1]; // 2 3
274 *ppDsts128[2] = pSrc128[2]; // 4 5
275 *ppDsts128[3] = pSrc128[3]; // 6 7
276 *ppDsts128[4] = pSrc128[4]; // 8 9
277 *ppDsts128[5] = pSrc128[5]; // A B
278 *ppDsts128[6] = pSrc128[6]; // C D
279 *ppDsts128[7] = pSrc128[7]; // E F
280 }
281 };
282
283 //////////////////////////////////////////////////////////////////////////
284 /// StorePixels (32-bit pixel specialization)
285 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
286 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
287 /// @param ppDsts - Array of destination pointers. Each pointer is
288 /// to a single row of at most 16B.
289 /// @tparam NumDests - Number of destination pointers. Each pair of
290 /// pointers is for a 16-byte column of two rows.
291 //////////////////////////////////////////////////////////////////////////
292 template <>
293 struct StorePixels<128, 8>
294 {
295 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
296 {
297 // Each 4-pixel row is 64 bytes.
298 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
299
300 // Unswizzle from SWR-Z order
301 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
302 *pvDsts[0] = pPixSrc[0];
303 *pvDsts[1] = pPixSrc[2];
304 *pvDsts[2] = pPixSrc[1];
305 *pvDsts[3] = pPixSrc[3];
306 *pvDsts[4] = pPixSrc[4];
307 *pvDsts[5] = pPixSrc[6];
308 *pvDsts[6] = pPixSrc[5];
309 *pvDsts[7] = pPixSrc[7];
310 }
311 };
312
313 template <>
314 struct StorePixels<128, 16>
315 {
316 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
317 {
318 // 16 x 16 bytes = 256 bytes, 16 pixels
319 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
320
321 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
322
323 for (uint32_t i = 0; i < 16; i += 4)
324 {
325 *ppDsts128[i + 0] = pSrc128[i + 0];
326 *ppDsts128[i + 1] = pSrc128[i + 2];
327 *ppDsts128[i + 2] = pSrc128[i + 1];
328 *ppDsts128[i + 3] = pSrc128[i + 3];
329 }
330 }
331 };
332
333 //////////////////////////////////////////////////////////////////////////
334 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
335 //////////////////////////////////////////////////////////////////////////
336 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
337 struct ConvertPixelsSOAtoAOS
338 {
339 //////////////////////////////////////////////////////////////////////////
340 /// @brief Converts a SIMD from the Hot Tile to the destination format
341 /// and converts from SOA to AOS.
342 /// @param pSrc - Pointer to raster tile.
343 /// @param pDst - Pointer to destination surface or deswizzling buffer.
344 template <size_t NumDests>
345 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
346 {
347 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
348
349 OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
350 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
351
352 // Convert from SrcFormat --> DstFormat
353 simd16vector src;
354 LoadSOA<SrcFormat>(pSrc, src);
355 StoreSOA<DstFormat>(src, soaTile);
356
357 // Convert from SOA --> AOS
358 FormatTraits<DstFormat>::TransposeT::Transpose_simd16(soaTile, aosTile);
359
360 // Store data into destination
361 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
362 }
363 };
364
365 //////////////////////////////////////////////////////////////////////////
366 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
367 /// Specialization for no format conversion
368 //////////////////////////////////////////////////////////////////////////
369 template<SWR_FORMAT Format>
370 struct ConvertPixelsSOAtoAOS<Format, Format>
371 {
372 //////////////////////////////////////////////////////////////////////////
373 /// @brief Converts a SIMD from the Hot Tile to the destination format
374 /// and converts from SOA to AOS.
375 /// @param pSrc - Pointer to raster tile.
376 /// @param pDst - Pointer to destination surface or deswizzling buffer.
377 template <size_t NumDests>
378 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
379 {
380 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
381
382 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
383
384 // Convert from SOA --> AOS
385 FormatTraits<Format>::TransposeT::Transpose_simd16(pSrc, aosTile);
386
387 // Store data into destination
388 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
389 }
390 };
391
392 //////////////////////////////////////////////////////////////////////////
393 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
394 //////////////////////////////////////////////////////////////////////////
395 template<>
396 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
397 {
398 //////////////////////////////////////////////////////////////////////////
399 /// @brief Converts a SIMD from the Hot Tile to the destination format
400 /// and converts from SOA to AOS.
401 /// @param pSrc - Pointer to raster tile.
402 /// @param pDst - Pointer to destination surface or deswizzling buffer.
403 template <size_t NumDests>
404 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
405 {
406 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
407 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
408
409 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
410
411 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
412
413 // Load hot-tile
414 simd16vector src, dst;
415 LoadSOA<SrcFormat>(pSrc, src);
416
417 // deswizzle
418 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
419 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
420 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
421
422 // clamp
423 dst.x = Clamp<DstFormat>(dst.x, 0);
424 dst.y = Clamp<DstFormat>(dst.y, 1);
425 dst.z = Clamp<DstFormat>(dst.z, 2);
426
427 // normalize
428 dst.x = Normalize<DstFormat>(dst.x, 0);
429 dst.y = Normalize<DstFormat>(dst.y, 1);
430 dst.z = Normalize<DstFormat>(dst.z, 2);
431
432 // pack
433 simd16scalari packed = _simd16_castps_si(dst.x);
434
435 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
436 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
437
438 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
439 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
440
441 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
442 uint32_t *pPacked = (uint32_t*)&packed;
443 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
444 for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
445 {
446 *pAosTile++ = *pPacked++;
447 }
448
449 // Store data into destination
450 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
451 }
452 };
453
454 //////////////////////////////////////////////////////////////////////////
455 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
456 //////////////////////////////////////////////////////////////////////////
457 template<>
458 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
459 {
460 static const SWR_FORMAT SrcFormat = R32_FLOAT;
461 static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
462
463 //////////////////////////////////////////////////////////////////////////
464 /// @brief Converts a SIMD from the Hot Tile to the destination format
465 /// and converts from SOA to AOS.
466 /// @param pSrc - Pointer to raster tile.
467 /// @param pDst - Pointer to destination surface or deswizzling buffer.
468 template <size_t NumDests>
469 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
470 {
471 simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
472
473 // clamp
474 const simd16scalar zero = _simd16_setzero_ps();
475 const simd16scalar ones = _simd16_set1_ps(1.0f);
476
477 comp = _simd16_max_ps(comp, zero);
478 comp = _simd16_min_ps(comp, ones);
479
480 // normalize
481 comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
482
483 simd16scalari temp = _simd16_cvtps_epi32(comp);
484
485 // swizzle
486 temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
487
488 // merge/store data into destination but don't overwrite the X8 bits
489 simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
490 simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
491
492 simd16scalari dest = _simd16_setzero_si();
493
494 dest = _simd16_insert_si(dest, destlo, 0);
495 dest = _simd16_insert_si(dest, desthi, 1);
496
497 simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
498
499 dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
500
501 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
502 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
503 }
504 };
505
506 template<SWR_FORMAT DstFormat>
507 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
508 {
509 // swizzle rgba -> bgra while we load
510 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
511 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
512 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
513 simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
514
515 // clamp
516 const simd16scalar zero = _simd16_setzero_ps();
517 const simd16scalar ones = _simd16_set1_ps(1.0f);
518
519 comp0 = _simd16_max_ps(comp0, zero);
520 comp0 = _simd16_min_ps(comp0, ones);
521
522 comp1 = _simd16_max_ps(comp1, zero);
523 comp1 = _simd16_min_ps(comp1, ones);
524
525 comp2 = _simd16_max_ps(comp2, zero);
526 comp2 = _simd16_min_ps(comp2, ones);
527
528 comp3 = _simd16_max_ps(comp3, zero);
529 comp3 = _simd16_min_ps(comp3, ones);
530
531 // gamma-correct only rgb
532 if (FormatTraits<DstFormat>::isSRGB)
533 {
534 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
535 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
536 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
537 }
538
539 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
540 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
541 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
542 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
543 comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
544
545 // moving to 16 wide integer vector types
546 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
547 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
548 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
549 simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
550
551 // SOA to AOS conversion
552 src1 = _simd16_slli_epi32(src1, 8);
553 src2 = _simd16_slli_epi32(src2, 16);
554 src3 = _simd16_slli_epi32(src3, 24);
555
556 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
557
558 // de-swizzle conversion
559 #if 1
560 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
561 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
562
563 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
564
565 #else
566 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
567
568 #endif
569 // store 8x2 memory order:
570 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
571 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
572 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
573 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
574 }
575
576 template<SWR_FORMAT DstFormat>
577 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
578 {
579 static const uint32_t offset = sizeof(simdscalar);
580
581 // swizzle rgba -> bgra while we load
582 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
583 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
584 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
585 simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
586
587 // clamp
588 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
589 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
590
591 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
592 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
593
594 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
595 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
596
597 vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
598 vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
599
600 if (FormatTraits<DstFormat>::isSRGB)
601 {
602 // Gamma-correct only rgb
603 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
604 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
605 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
606 }
607
608 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
609 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
610 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
611 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
612 vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
613
614 // moving to 8 wide integer vector types
615 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
616 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
617 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
618 simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
619
620 #if KNOB_ARCH <= KNOB_ARCH_AVX
621
622 // splitting into two sets of 4 wide integer vector types
623 // because AVX doesn't have instructions to support this operation at 8 wide
624 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
625 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
626 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
627 simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
628
629 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
630 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
631 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
632 simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
633
634 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
635 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
636 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
637 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
638 srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
639 srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
640
641 srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
642 srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
643
644 srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
645 srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
646
647 srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
648 srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
649
650 // unpack into rows that get the tiling order correct
651 simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
652 simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
653
654 simdscalari final = _mm256_castsi128_si256(vRow00);
655 final = _mm256_insertf128_si256(final, vRow10, 1);
656
657 #else
658
659 // logic is as above, only wider
660 src1 = _mm256_slli_si256(src1, 1);
661 src2 = _mm256_slli_si256(src2, 2);
662 src3 = _mm256_slli_si256(src3, 3);
663
664 src0 = _mm256_or_si256(src0, src1);
665 src2 = _mm256_or_si256(src2, src3);
666
667 simdscalari final = _mm256_or_si256(src0, src2);
668
669 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
670 final = _mm256_permute4x64_epi64(final, 0xD8);
671 #endif
672
673 _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
674 }
675
676 template<SWR_FORMAT DstFormat>
677 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
678 {
679 // swizzle rgba -> bgra while we load
680 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
681 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
682 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
683
684 // clamp
685 const simd16scalar zero = _simd16_setzero_ps();
686 const simd16scalar ones = _simd16_set1_ps(1.0f);
687
688 comp0 = _simd16_max_ps(comp0, zero);
689 comp0 = _simd16_min_ps(comp0, ones);
690
691 comp1 = _simd16_max_ps(comp1, zero);
692 comp1 = _simd16_min_ps(comp1, ones);
693
694 comp2 = _simd16_max_ps(comp2, zero);
695 comp2 = _simd16_min_ps(comp2, ones);
696
697 // gamma-correct only rgb
698 if (FormatTraits<DstFormat>::isSRGB)
699 {
700 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
701 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
702 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
703 }
704
705 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
706 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
707 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
708 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
709
710 // moving to 16 wide integer vector types
711 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
712 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
713 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
714
715 // SOA to AOS conversion
716 src1 = _simd16_slli_epi32(src1, 8);
717 src2 = _simd16_slli_epi32(src2, 16);
718
719 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
720
721 // de-swizzle conversion
722 #if 1
723 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
724 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
725
726 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
727
728 #else
729 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
730
731 #endif
732 // store 8x2 memory order:
733 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
734 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
735 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
736 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
737 }
738
739 template<SWR_FORMAT DstFormat>
740 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
741 {
742 static const uint32_t offset = sizeof(simdscalar);
743
744 // swizzle rgba -> bgra while we load
745 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
746 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
747 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
748 // clamp
749 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
750 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
751
752 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
753 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
754
755 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
756 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
757
758 if (FormatTraits<DstFormat>::isSRGB)
759 {
760 // Gamma-correct only rgb
761 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
762 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
763 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
764 }
765
766 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
767 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
768 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
769 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
770
771 // moving to 8 wide integer vector types
772 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
773 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
774 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
775
776 #if KNOB_ARCH <= KNOB_ARCH_AVX
777
778 // splitting into two sets of 4 wide integer vector types
779 // because AVX doesn't have instructions to support this operation at 8 wide
780 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
781 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
782 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
783
784 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
785 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
786 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
787
788 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
789 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
790 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
791 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
792
793 srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
794
795 srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
796
797 srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
798 srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
799
800 // unpack into rows that get the tiling order correct
801 simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
802 simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
803
804 simdscalari final = _mm256_castsi128_si256(vRow00);
805 final = _mm256_insertf128_si256(final, vRow10, 1);
806
807 #else
808
809 // logic is as above, only wider
810 src1 = _mm256_slli_si256(src1, 1);
811 src2 = _mm256_slli_si256(src2, 2);
812
813 src0 = _mm256_or_si256(src0, src1);
814
815 simdscalari final = _mm256_or_si256(src0, src2);
816
817 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
818 final = _mm256_permute4x64_epi64(final, 0xD8);
819
820 #endif
821
822 _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
823 }
824
825 template<>
826 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
827 {
828 template <size_t NumDests>
829 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
830 {
831 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
832 }
833 };
834
835 template<>
836 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
837 {
838 template <size_t NumDests>
839 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
840 {
841 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
842 }
843 };
844
845 template<>
846 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
847 {
848 template <size_t NumDests>
849 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
850 {
851 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
852 }
853 };
854
855 template<>
856 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
857 {
858 template <size_t NumDests>
859 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
860 {
861 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
862 }
863 };
864
865 template<>
866 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
867 {
868 template <size_t NumDests>
869 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
870 {
871 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
872 }
873 };
874
875 template<>
876 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
877 {
878 template <size_t NumDests>
879 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
880 {
881 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
882 }
883 };
884
885 template<>
886 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
887 {
888 template <size_t NumDests>
889 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
890 {
891 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
892 }
893 };
894
895 template<>
896 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
897 {
898 template <size_t NumDests>
899 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
900 {
901 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
902 }
903 };
904
905 //////////////////////////////////////////////////////////////////////////
906 /// StoreRasterTile
907 //////////////////////////////////////////////////////////////////////////
908 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
909 struct StoreRasterTile
910 {
911 //////////////////////////////////////////////////////////////////////////
912 /// @brief Retrieve color from hot tile source which is always float.
913 /// @param pSrc - Pointer to raster tile.
914 /// @param x, y - Coordinates to raster tile.
915 /// @param output - output color
916 INLINE static void GetSwizzledSrcColor(
917 uint8_t* pSrc,
918 uint32_t x, uint32_t y,
919 float outputColor[4])
920 {
921 typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
922
923 SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
924
925 // Compute which simd tile we're accessing within 8x8 tile.
926 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
927 uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
928
929 SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
930
931 uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
932
933 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
934 }
935
936 //////////////////////////////////////////////////////////////////////////
937 /// @brief Stores an 8x8 raster tile to the destination surface.
938 /// @param pSrc - Pointer to raster tile.
939 /// @param pDstSurface - Destination surface state
940 /// @param x, y - Coordinates to raster tile.
941 INLINE static void Store(
942 uint8_t *pSrc,
943 SWR_SURFACE_STATE* pDstSurface,
944 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
945 {
946 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
947 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
948
949 // For each raster tile pixel (rx, ry)
950 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
951 {
952 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
953 {
954 // Perform bounds checking.
955 if (((x + rx) < lodWidth) &&
956 ((y + ry) < lodHeight))
957 {
958 float srcColor[4];
959 GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
960
961 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
962 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
963 sampleNum, pDstSurface->lod, pDstSurface);
964 {
965 ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
966 }
967 }
968 }
969 }
970 }
971
972 //////////////////////////////////////////////////////////////////////////
973 /// @brief Resolves an 8x8 raster tile to the resolve destination surface.
974 /// @param pSrc - Pointer to raster tile.
975 /// @param pDstSurface - Destination surface state
976 /// @param x, y - Coordinates to raster tile.
977 /// @param sampleOffset - Offset between adjacent multisamples
978 INLINE static void Resolve(
979 uint8_t *pSrc,
980 SWR_SURFACE_STATE* pDstSurface,
981 uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
982 {
983 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
984 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
985
986 float oneOverNumSamples = 1.0f / pDstSurface->numSamples;
987
988 // For each raster tile pixel (rx, ry)
989 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
990 {
991 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
992 {
993 // Perform bounds checking.
994 if (((x + rx) < lodWidth) &&
995 ((y + ry) < lodHeight))
996 {
997 // Sum across samples
998 float resolveColor[4] = {0};
999 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1000 {
1001 float sampleColor[4] = {0};
1002 uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum;
1003 GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor);
1004 resolveColor[0] += sampleColor[0];
1005 resolveColor[1] += sampleColor[1];
1006 resolveColor[2] += sampleColor[2];
1007 resolveColor[3] += sampleColor[3];
1008 }
1009
1010 // Divide by numSamples to average
1011 resolveColor[0] *= oneOverNumSamples;
1012 resolveColor[1] *= oneOverNumSamples;
1013 resolveColor[2] *= oneOverNumSamples;
1014 resolveColor[3] *= oneOverNumSamples;
1015
1016 // Use the resolve surface state
1017 SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress;
1018 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1019 pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex,
1020 0, pResolveSurface->lod, pResolveSurface);
1021 {
1022 ConvertPixelFromFloat<DstFormat>(pDst, resolveColor);
1023 }
1024 }
1025 }
1026 }
1027 }
1028
1029 };
1030
1031 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1032 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
1033 {};
1034
1035 //////////////////////////////////////////////////////////////////////////
1036 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1037 //////////////////////////////////////////////////////////////////////////
1038 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1039 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1040 {
1041 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1042 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1043 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1044
1045 //////////////////////////////////////////////////////////////////////////
1046 /// @brief Stores an 8x8 raster tile to the destination surface.
1047 /// @param pSrc - Pointer to raster tile.
1048 /// @param pDstSurface - Destination surface state
1049 /// @param x, y - Coordinates to raster tile.
1050 INLINE static void Store(
1051 uint8_t *pSrc,
1052 SWR_SURFACE_STATE* pDstSurface,
1053 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1054 {
1055 // Punt non-full tiles to generic store
1056 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1057 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1058
1059 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1060 {
1061 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1062 }
1063
1064 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1065 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1066
1067 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1068 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1069
1070 uint8_t* ppDsts[] =
1071 {
1072 pDst, // row 0, col 0
1073 pDst + pDstSurface->pitch, // row 1, col 0
1074 pDst + dx / 2, // row 0, col 1
1075 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1076 };
1077
1078 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1079 {
1080 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1081 {
1082 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1083
1084 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1085
1086 ppDsts[0] += dx;
1087 ppDsts[1] += dx;
1088 ppDsts[2] += dx;
1089 ppDsts[3] += dx;
1090 }
1091
1092 ppDsts[0] += dy;
1093 ppDsts[1] += dy;
1094 ppDsts[2] += dy;
1095 ppDsts[3] += dy;
1096 }
1097 }
1098 };
1099
1100 //////////////////////////////////////////////////////////////////////////
1101 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1102 //////////////////////////////////////////////////////////////////////////
1103 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1104 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1105 {
1106 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1107 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1108 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1109
1110 //////////////////////////////////////////////////////////////////////////
1111 /// @brief Stores an 8x8 raster tile to the destination surface.
1112 /// @param pSrc - Pointer to raster tile.
1113 /// @param pDstSurface - Destination surface state
1114 /// @param x, y - Coordinates to raster tile.
1115 INLINE static void Store(
1116 uint8_t *pSrc,
1117 SWR_SURFACE_STATE* pDstSurface,
1118 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1119 {
1120 // Punt non-full tiles to generic store
1121 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1122 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1123
1124 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1125 {
1126 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1127 }
1128
1129 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1130 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1131
1132 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1133 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1134
1135 uint8_t* ppDsts[] =
1136 {
1137 pDst, // row 0, col 0
1138 pDst + pDstSurface->pitch, // row 1, col 0
1139 pDst + dx / 2, // row 0, col 1
1140 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1141 };
1142
1143 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1144 {
1145 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1146 {
1147 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1148
1149 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1150
1151 ppDsts[0] += dx;
1152 ppDsts[1] += dx;
1153 ppDsts[2] += dx;
1154 ppDsts[3] += dx;
1155 }
1156
1157 ppDsts[0] += dy;
1158 ppDsts[1] += dy;
1159 ppDsts[2] += dy;
1160 ppDsts[3] += dy;
1161 }
1162 }
1163 };
1164
1165 //////////////////////////////////////////////////////////////////////////
1166 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1167 //////////////////////////////////////////////////////////////////////////
1168 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1169 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1170 {
1171 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1172 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1173 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1174
1175 //////////////////////////////////////////////////////////////////////////
1176 /// @brief Stores an 8x8 raster tile to the destination surface.
1177 /// @param pSrc - Pointer to raster tile.
1178 /// @param pDstSurface - Destination surface state
1179 /// @param x, y - Coordinates to raster tile.
1180 INLINE static void Store(
1181 uint8_t *pSrc,
1182 SWR_SURFACE_STATE* pDstSurface,
1183 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1184 {
1185 // Punt non-full tiles to generic store
1186 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1187 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1188
1189 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1190 {
1191 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1192 }
1193
1194 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1195 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1196
1197 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1198 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1199
1200 uint8_t* ppDsts[] =
1201 {
1202 pDst, // row 0, col 0
1203 pDst + pDstSurface->pitch, // row 1, col 0
1204 pDst + dx / 2, // row 0, col 1
1205 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1206 };
1207
1208 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1209 {
1210 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1211 {
1212 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1213
1214 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1215
1216 ppDsts[0] += dx;
1217 ppDsts[1] += dx;
1218 ppDsts[2] += dx;
1219 ppDsts[3] += dx;
1220 }
1221
1222 ppDsts[0] += dy;
1223 ppDsts[1] += dy;
1224 ppDsts[2] += dy;
1225 ppDsts[3] += dy;
1226 }
1227 }
1228 };
1229
1230 //////////////////////////////////////////////////////////////////////////
1231 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1232 //////////////////////////////////////////////////////////////////////////
1233 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1234 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1235 {
1236 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1237 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1238 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1239 static const size_t MAX_DST_COLUMN_BYTES = 16;
1240
1241 //////////////////////////////////////////////////////////////////////////
1242 /// @brief Stores an 8x8 raster tile to the destination surface.
1243 /// @param pSrc - Pointer to raster tile.
1244 /// @param pDstSurface - Destination surface state
1245 /// @param x, y - Coordinates to raster tile.
1246 INLINE static void Store(
1247 uint8_t *pSrc,
1248 SWR_SURFACE_STATE* pDstSurface,
1249 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1250 {
1251 // Punt non-full tiles to generic store
1252 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1253 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1254
1255 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1256 {
1257 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1258 }
1259
1260 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1261 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1262
1263 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1264 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1265
1266 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1267 static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1268
1269 uint8_t *ppDsts[] =
1270 {
1271 pDst, // row 0, col 0
1272 pDst + pDstSurface->pitch, // row 1, col 0
1273 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1274 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1275 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1276 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1277 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1278 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
1279 };
1280
1281 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1282 {
1283 // Raster tile width is same as simd16 tile width
1284 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1285
1286 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1287
1288 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1289
1290 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1291 {
1292 ppDsts[i] += dy;
1293 }
1294 }
1295 }
1296 };
1297
1298 //////////////////////////////////////////////////////////////////////////
1299 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1300 //////////////////////////////////////////////////////////////////////////
1301 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1302 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1303 {
1304 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1305 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1306 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1307 static const size_t MAX_DST_COLUMN_BYTES = 16;
1308
1309 //////////////////////////////////////////////////////////////////////////
1310 /// @brief Stores an 8x8 raster tile to the destination surface.
1311 /// @param pSrc - Pointer to raster tile.
1312 /// @param pDstSurface - Destination surface state
1313 /// @param x, y - Coordinates to raster tile.
1314 INLINE static void Store(
1315 uint8_t *pSrc,
1316 SWR_SURFACE_STATE* pDstSurface,
1317 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1318 {
1319 // Punt non-full tiles to generic store
1320 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1321 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1322
1323 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1324 {
1325 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1326 }
1327
1328 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1329 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1330
1331 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1332 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1333
1334 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1335 static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1336
1337 uint8_t* ppDsts[] =
1338 {
1339 pDst, // row 0, col 0
1340 pDst + pDstSurface->pitch, // row 1, col 0
1341 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1342 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1343 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1344 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1345 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1346 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
1347 pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4
1348 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4
1349 pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5
1350 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5
1351 pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6
1352 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6
1353 pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7
1354 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7
1355 };
1356
1357 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1358 {
1359 // Raster tile width is same as simd16 tile width
1360 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1361
1362 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1363
1364 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1365
1366 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1367 {
1368 ppDsts[i] += dy;
1369 }
1370 }
1371 }
1372 };
1373
1374 //////////////////////////////////////////////////////////////////////////
1375 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1376 //////////////////////////////////////////////////////////////////////////
1377 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1378 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1379 {
1380 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1381 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1382
1383 //////////////////////////////////////////////////////////////////////////
1384 /// @brief Stores an 8x8 raster tile to the destination surface.
1385 /// @param pSrc - Pointer to raster tile.
1386 /// @param pDstSurface - Destination surface state
1387 /// @param x, y - Coordinates to raster tile.
1388 INLINE static void Store(
1389 uint8_t *pSrc,
1390 SWR_SURFACE_STATE* pDstSurface,
1391 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1392 {
1393 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1394
1395 // Punt non-full tiles to generic store
1396 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1397 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1398
1399 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1400 {
1401 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1402 }
1403
1404 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1405 // We can compute the offsets to each column within the raster tile once and increment from these.
1406 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1407 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1408 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1409
1410 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1411
1412 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1413 uint8_t *ppDsts[] =
1414 {
1415 pDst,
1416 pDst + DestRowWidthBytes,
1417 pDst + DestRowWidthBytes / 4,
1418 pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1419 };
1420
1421 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1422 {
1423 // Raster tile width is same as simd16 tile width
1424 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1425
1426 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1427
1428 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1429
1430 ppDsts[0] += dy;
1431 ppDsts[1] += dy;
1432 ppDsts[2] += dy;
1433 ppDsts[3] += dy;
1434 }
1435 }
1436 };
1437
1438 //////////////////////////////////////////////////////////////////////////
1439 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1440 //////////////////////////////////////////////////////////////////////////
1441 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1442 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1443 {
1444 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1445 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1446
1447 //////////////////////////////////////////////////////////////////////////
1448 /// @brief Stores an 8x8 raster tile to the destination surface.
1449 /// @param pSrc - Pointer to raster tile.
1450 /// @param pDstSurface - Destination surface state
1451 /// @param x, y - Coordinates to raster tile.
1452 INLINE static void Store(
1453 uint8_t *pSrc,
1454 SWR_SURFACE_STATE* pDstSurface,
1455 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1456 {
1457 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1458
1459 // Punt non-full tiles to generic store
1460 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1461 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1462
1463 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1464 {
1465 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1466 }
1467
1468 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1469 // We can compute the offsets to each column within the raster tile once and increment from these.
1470 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1471 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1472 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1473
1474 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1475
1476 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1477 uint8_t *ppDsts[] =
1478 {
1479 pDst,
1480 pDst + DestRowWidthBytes,
1481 pDst + DestRowWidthBytes / 2,
1482 pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1483 };
1484
1485 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1486 {
1487 // Raster tile width is same as simd16 tile width
1488 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1489
1490 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1491
1492 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1493
1494 ppDsts[0] += dy;
1495 ppDsts[1] += dy;
1496 ppDsts[2] += dy;
1497 ppDsts[3] += dy;
1498 }
1499 }
1500 };
1501
1502 //////////////////////////////////////////////////////////////////////////
1503 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1504 //////////////////////////////////////////////////////////////////////////
1505 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1506 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1507 {
1508 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1509 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1510 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1511
1512 //////////////////////////////////////////////////////////////////////////
1513 /// @brief Stores an 8x8 raster tile to the destination surface.
1514 /// @param pSrc - Pointer to raster tile.
1515 /// @param pDstSurface - Destination surface state
1516 /// @param x, y - Coordinates to raster tile.
1517 INLINE static void Store(
1518 uint8_t *pSrc,
1519 SWR_SURFACE_STATE* pDstSurface,
1520 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1521 {
1522 static const uint32_t DestRowWidthBytes = 512; // 512B rows
1523
1524 // Punt non-full tiles to generic store
1525 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1526 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1527
1528 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1529 {
1530 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1531 }
1532
1533 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1534 // We can compute the offsets to each column within the raster tile once and increment from these.
1535 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1536 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1537
1538 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1539 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1540
1541 uint8_t* ppDsts[] =
1542 {
1543 pDst, // row 0, col 0
1544 pDst + DestRowWidthBytes, // row 1, col 0
1545 pDst + dx / 2, // row 0, col 1
1546 pDst + DestRowWidthBytes + dx / 2 // row 1, col 1
1547 };
1548
1549 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1550 {
1551 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1552 {
1553 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1554
1555 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1556
1557 ppDsts[0] += dx;
1558 ppDsts[1] += dx;
1559 ppDsts[2] += dx;
1560 ppDsts[3] += dx;
1561 }
1562
1563 ppDsts[0] += dy;
1564 ppDsts[1] += dy;
1565 ppDsts[2] += dy;
1566 ppDsts[3] += dy;
1567 }
1568 }
1569 };
1570
1571 //////////////////////////////////////////////////////////////////////////
1572 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1573 //////////////////////////////////////////////////////////////////////////
1574 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1575 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1576 {
1577 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1578 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1579
1580 //////////////////////////////////////////////////////////////////////////
1581 /// @brief Stores an 8x8 raster tile to the destination surface.
1582 /// @param pSrc - Pointer to raster tile.
1583 /// @param pDstSurface - Destination surface state
1584 /// @param x, y - Coordinates to raster tile.
1585 INLINE static void Store(
1586 uint8_t *pSrc,
1587 SWR_SURFACE_STATE* pDstSurface,
1588 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1589 {
1590 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1591 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1592
1593 // Punt non-full tiles to generic store
1594 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1595 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1596
1597 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1598 {
1599 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1600 }
1601
1602 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1603 // We can compute the offsets to each column within the raster tile once and increment from these.
1604 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1605 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1606 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1607
1608 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1609 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1610
1611 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1612 uint8_t *ppDsts[] =
1613 {
1614 pDst, // row 0, col 0
1615 pDst + DestRowWidthBytes, // row 1, col 0
1616 pDst + DestColumnBytes, // row 0, col 1
1617 pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1
1618 };
1619
1620 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1621 {
1622 // Raster tile width is same as simd16 tile width
1623 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1624
1625 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1626
1627 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1628
1629 ppDsts[0] += dy;
1630 ppDsts[1] += dy;
1631 ppDsts[2] += dy;
1632 ppDsts[3] += dy;
1633 }
1634 }
1635 };
1636
1637 //////////////////////////////////////////////////////////////////////////
1638 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
1639 //////////////////////////////////////////////////////////////////////////
1640 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1641 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
1642 {
1643 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
1644 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1645
1646 //////////////////////////////////////////////////////////////////////////
1647 /// @brief Stores an 8x8 raster tile to the destination surface.
1648 /// @param pSrc - Pointer to raster tile.
1649 /// @param pDstSurface - Destination surface state
1650 /// @param x, y - Coordinates to raster tile.
1651 INLINE static void Store(
1652 uint8_t *pSrc,
1653 SWR_SURFACE_STATE* pDstSurface,
1654 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1655 {
1656 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1657 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1658
1659 // Punt non-full tiles to generic store
1660 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1661 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1662
1663 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1664 {
1665 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1666 }
1667
1668 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1669 // We can compute the offsets to each column within the raster tile once and increment from these.
1670 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1671 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1672 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1673
1674 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1675 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1676
1677 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1678 uint8_t *ppDsts[] =
1679 {
1680 pDst, // row 0, col 0
1681 pDst + DestRowWidthBytes, // row 1, col 0
1682 pDst + DestColumnBytes, // row 0, col 1
1683 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
1684 pDst + DestColumnBytes * 2, // row 0, col 2
1685 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
1686 pDst + DestColumnBytes * 3, // row 0, col 3
1687 pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3
1688 };
1689
1690 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1691 {
1692 // Raster tile width is same as simd16 tile width
1693 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1694
1695 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1696
1697 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1698
1699 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1700 {
1701 ppDsts[i] += dy;
1702 }
1703 }
1704 }
1705 };
1706
1707 //////////////////////////////////////////////////////////////////////////
1708 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
1709 //////////////////////////////////////////////////////////////////////////
1710 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1711 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
1712 {
1713 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
1714 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1715
1716 //////////////////////////////////////////////////////////////////////////
1717 /// @brief Stores an 8x8 raster tile to the destination surface.
1718 /// @param pSrc - Pointer to raster tile.
1719 /// @param pDstSurface - Destination surface state
1720 /// @param x, y - Coordinates to raster tile.
1721 INLINE static void Store(
1722 uint8_t *pSrc,
1723 SWR_SURFACE_STATE* pDstSurface,
1724 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1725 {
1726 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1727 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1728
1729 // Punt non-full tiles to generic store
1730 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1731 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1732
1733 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1734 {
1735 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1736 }
1737
1738 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1739 // We can compute the offsets to each column within the raster tile once and increment from these.
1740 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1741 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1742 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1743
1744 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1745 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1746
1747 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1748 uint8_t *ppDsts[] =
1749 {
1750 pDst, // row 0, col 0
1751 pDst + DestRowWidthBytes, // row 1, col 0
1752 pDst + DestColumnBytes, // row 0, col 1
1753 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
1754 pDst + DestColumnBytes * 2, // row 0, col 2
1755 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
1756 pDst + DestColumnBytes * 3, // row 0, col 3
1757 pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
1758 pDst + DestColumnBytes * 4, // row 0, col 4
1759 pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
1760 pDst + DestColumnBytes * 5, // row 0, col 5
1761 pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
1762 pDst + DestColumnBytes * 6, // row 0, col 6
1763 pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
1764 pDst + DestColumnBytes * 7, // row 0, col 7
1765 pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7
1766 };
1767
1768 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1769 {
1770 // Raster tile width is same as simd16 tile width
1771 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1772
1773 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1774
1775 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1776
1777 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1778 {
1779 ppDsts[i] += dy;
1780 }
1781 }
1782 }
1783 };
1784
1785 //////////////////////////////////////////////////////////////////////////
1786 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
1787 //////////////////////////////////////////////////////////////////////////
1788 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1789 struct StoreMacroTile
1790 {
1791 //////////////////////////////////////////////////////////////////////////
1792 /// @brief Stores a macrotile to the destination surface using safe implementation.
1793 /// @param pSrc - Pointer to macro tile.
1794 /// @param pDstSurface - Destination surface state
1795 /// @param x, y - Coordinates to macro tile
1796 static void StoreGeneric(
1797 uint8_t *pSrcHotTile,
1798 SWR_SURFACE_STATE* pDstSurface,
1799 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1800 {
1801 PFN_STORE_TILES_INTERNAL pfnStore;
1802 pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
1803
1804 // Store each raster tile from the hot tile to the destination surface.
1805 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1806 {
1807 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1808 {
1809 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1810 {
1811 pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
1812 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1813 }
1814 }
1815 }
1816
1817 }
1818
1819 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
1820 //////////////////////////////////////////////////////////////////////////
1821 /// @brief Stores a macrotile to the destination surface.
1822 /// @param pSrc - Pointer to macro tile.
1823 /// @param pDstSurface - Destination surface state
1824 /// @param x, y - Coordinates to macro tile
1825 static void Store(
1826 uint8_t *pSrcHotTile,
1827 SWR_SURFACE_STATE* pDstSurface,
1828 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1829 {
1830 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
1831
1832 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1833 {
1834 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
1835 0,
1836 0,
1837 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
1838 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
1839 sampleNum,
1840 pDstSurface->lod,
1841 pDstSurface);
1842
1843 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
1844 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
1845 (pDstSurface->bInterleavedSamples);
1846
1847 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
1848 }
1849
1850 // Save original for pSrcHotTile resolve.
1851 uint8_t *pResolveSrcHotTile = pSrcHotTile;
1852
1853 // Store each raster tile from the hot tile to the destination surface.
1854 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1855 {
1856 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1857 {
1858 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1859 {
1860 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
1861 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1862 }
1863 }
1864 }
1865
1866 if (pDstSurface->xpAuxBaseAddress)
1867 {
1868 uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1869 // Store each raster tile from the hot tile to the destination surface.
1870 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1871 {
1872 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1873 {
1874 StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex);
1875 pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples;
1876 }
1877 }
1878 }
1879 }
1880 };
1881
1882 //////////////////////////////////////////////////////////////////////////
1883 /// InitStoreTilesTable - Helper for setting up the tables.
1884 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
1885 void InitStoreTilesTableColor_Half1(
1886 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
1887 {
1888 table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
1889 table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
1890 table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
1891 table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
1892 table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
1893 table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
1894 table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
1895 table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
1896 table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
1897 table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
1898 table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
1899 table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
1900 table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
1901 table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
1902 table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
1903 table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
1904 table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
1905 table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
1906 table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
1907 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
1908 table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
1909 table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
1910 table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
1911 table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
1912 table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
1913 table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
1914 table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
1915 table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
1916 table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
1917 table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
1918 table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
1919 table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
1920 table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
1921 table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
1922 table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
1923 table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
1924 table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
1925 table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
1926 table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
1927 table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
1928 table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
1929 table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
1930 table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
1931 table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
1932 table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
1933 table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
1934 table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
1935 table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
1936 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
1937 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
1938 table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
1939 table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
1940 table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
1941 table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
1942 table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
1943 table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
1944 }
1945
1946 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
1947 void InitStoreTilesTableColor_Half2(
1948 PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
1949 {
1950 table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
1951 table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
1952 table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
1953 table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
1954 table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
1955 table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
1956 table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
1957 table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
1958 table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
1959 table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
1960 table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
1961 table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
1962 table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
1963 table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
1964 table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
1965 table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
1966 table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
1967 table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
1968 table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
1969 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
1970 table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
1971 table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
1972 table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
1973 table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
1974 table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
1975 table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
1976 table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
1977 table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
1978 table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
1979 table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
1980 table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
1981 table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
1982 table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
1983 table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
1984 table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
1985 table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
1986 table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
1987 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
1988 table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
1989 table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
1990 table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
1991 table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
1992 table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
1993 table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
1994 table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
1995 table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
1996 table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
1997 table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
1998 table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
1999 table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2000 table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2001 table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2002 table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2003 table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2004 table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2005 table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2006 table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2007 table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2008 table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2009 table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2010 table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2011 table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2012 table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2013 table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2014 }
2015
2016 //////////////////////////////////////////////////////////////////////////
2017 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2018 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2019 void InitStoreTilesTableDepth(
2020 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2021 {
2022 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2023 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2024 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2025 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2026 }
2027
2028 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2029 void InitStoreTilesTableStencil(
2030 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2031 {
2032 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2033 }