1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Functionality for Store.
27 ******************************************************************************/
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "memory/SurfaceState.h"
39 #include "core/multisample.h"
// Element count of a true array. Not valid for pointers or for array
// parameters, which have already decayed to pointers.
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
46 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
47 typedef void(*PFN_STORE_TILES
)(uint8_t*, SWR_SURFACE_STATE
*, uint32_t, uint32_t, uint32_t);
49 //////////////////////////////////////////////////////////////////////////
50 /// Store Raster Tile Function Tables.
51 //////////////////////////////////////////////////////////////////////////
52 extern PFN_STORE_TILES sStoreTilesTableColor
[SWR_TILE_MODE_COUNT
][NUM_SWR_FORMATS
];
53 extern PFN_STORE_TILES sStoreTilesTableDepth
[SWR_TILE_MODE_COUNT
][NUM_SWR_FORMATS
];
54 extern PFN_STORE_TILES sStoreTilesTableStencil
[SWR_TILE_MODE_COUNT
][NUM_SWR_FORMATS
];
56 void InitStoreTilesTable_Linear_1();
57 void InitStoreTilesTable_Linear_2();
58 void InitStoreTilesTable_TileX_1();
59 void InitStoreTilesTable_TileX_2();
60 void InitStoreTilesTable_TileY_1();
61 void InitStoreTilesTable_TileY_2();
62 void InitStoreTilesTable_TileW();
63 void InitStoreTilesTable();
//////////////////////////////////////////////////////////////////////////
/// StorePixels (primary template)
/// @brief Stores one raster tile, given in SWR-Z pixel order, to a set of
///        destination row pointers. Only the explicit specializations are
///        usable; any other <PixelSize, NumDests> combination is rejected
///        at compile time because Store is deleted here.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of destination pointers. Each pointer is
///        to a single row of at most 16B.
/// @tparam PixelSize - Size of one pixel in bits.
/// @tparam NumDests - Number of destination pointers. Each pair of
///        pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
80 //////////////////////////////////////////////////////////////////////////
81 /// StorePixels (32-bit pixel specialization)
82 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
83 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
84 /// @param ppDsts - Array of destination pointers. Each pointer is
85 /// to a single row of at most 16B.
86 /// @tparam NumDests - Number of destination pointers. Each pair of
87 /// pointers is for a 16-byte column of two rows.
88 //////////////////////////////////////////////////////////////////////////
90 struct StorePixels
<8, 2>
92 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[2])
94 // Each 4-pixel row is 4 bytes.
95 const uint16_t* pPixSrc
= (const uint16_t*)pSrc
;
97 // Unswizzle from SWR-Z order
98 uint16_t* pRow
= (uint16_t*)ppDsts
[0];
100 pRow
[1] = pPixSrc
[2];
102 pRow
= (uint16_t*)ppDsts
[1];
103 pRow
[0] = pPixSrc
[1];
104 pRow
[1] = pPixSrc
[3];
109 struct StorePixels
<8, 4>
111 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[4])
113 // 8 x 2 bytes = 16 bytes, 16 pixels
114 const uint16_t *pSrc16
= reinterpret_cast<const uint16_t *>(pSrc
);
116 uint16_t **ppDsts16
= reinterpret_cast<uint16_t **>(ppDsts
);
118 // Unswizzle from SWR-Z order
119 ppDsts16
[0][0] = pSrc16
[0]; // 0 1
120 ppDsts16
[0][1] = pSrc16
[2]; // 4 5
122 ppDsts16
[1][0] = pSrc16
[1]; // 2 3
123 ppDsts16
[1][1] = pSrc16
[3]; // 6 7
125 ppDsts16
[2][0] = pSrc16
[4]; // 8 9
126 ppDsts16
[2][1] = pSrc16
[6]; // C D
128 ppDsts16
[3][0] = pSrc16
[5]; // A B
129 ppDsts16
[3][1] = pSrc16
[7]; // E F
133 //////////////////////////////////////////////////////////////////////////
134 /// StorePixels (32-bit pixel specialization)
135 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
136 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
137 /// @param ppDsts - Array of destination pointers. Each pointer is
138 /// to a single row of at most 16B.
139 /// @tparam NumDests - Number of destination pointers. Each pair of
140 /// pointers is for a 16-byte column of two rows.
141 //////////////////////////////////////////////////////////////////////////
143 struct StorePixels
<16, 2>
145 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[2])
147 // Each 4-pixel row is 8 bytes.
148 const uint32_t* pPixSrc
= (const uint32_t*)pSrc
;
150 // Unswizzle from SWR-Z order
151 uint32_t* pRow
= (uint32_t*)ppDsts
[0];
152 pRow
[0] = pPixSrc
[0];
153 pRow
[1] = pPixSrc
[2];
155 pRow
= (uint32_t*)ppDsts
[1];
156 pRow
[0] = pPixSrc
[1];
157 pRow
[1] = pPixSrc
[3];
162 struct StorePixels
<16, 4>
164 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[4])
166 // 8 x 4 bytes = 32 bytes, 16 pixels
167 const uint32_t *pSrc32
= reinterpret_cast<const uint32_t *>(pSrc
);
169 uint32_t **ppDsts32
= reinterpret_cast<uint32_t **>(ppDsts
);
171 // Unswizzle from SWR-Z order
172 ppDsts32
[0][0] = pSrc32
[0]; // 0 1
173 ppDsts32
[0][1] = pSrc32
[2]; // 4 5
175 ppDsts32
[1][0] = pSrc32
[1]; // 2 3
176 ppDsts32
[1][1] = pSrc32
[3]; // 6 7
178 ppDsts32
[2][0] = pSrc32
[4]; // 8 9
179 ppDsts32
[2][1] = pSrc32
[6]; // C D
181 ppDsts32
[3][0] = pSrc32
[5]; // A B
182 ppDsts32
[3][1] = pSrc32
[7]; // E F
186 //////////////////////////////////////////////////////////////////////////
187 /// StorePixels (32-bit pixel specialization)
188 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
189 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
190 /// @param ppDsts - Array of destination pointers. Each pointer is
191 /// to a single row of at most 16B.
192 /// @tparam NumDests - Number of destination pointers. Each pair of
193 /// pointers is for a 16-byte column of two rows.
194 //////////////////////////////////////////////////////////////////////////
196 struct StorePixels
<32, 2>
198 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[2])
200 // Each 4-pixel row is 16-bytes
201 simd4scalari
*pZRow01
= (simd4scalari
*)pSrc
;
202 simd4scalari vQuad00
= SIMD128::load_si(pZRow01
);
203 simd4scalari vQuad01
= SIMD128::load_si(pZRow01
+ 1);
205 simd4scalari vRow00
= SIMD128::unpacklo_epi64(vQuad00
, vQuad01
);
206 simd4scalari vRow10
= SIMD128::unpackhi_epi64(vQuad00
, vQuad01
);
208 SIMD128::storeu_si((simd4scalari
*)ppDsts
[0], vRow00
);
209 SIMD128::storeu_si((simd4scalari
*)ppDsts
[1], vRow10
);
214 struct StorePixels
<32, 4>
216 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[4])
218 // 4 x 16 bytes = 64 bytes, 16 pixels
219 const simd4scalari
*pSrc128
= reinterpret_cast<const simd4scalari
*>(pSrc
);
221 simd4scalari
**ppDsts128
= reinterpret_cast<simd4scalari
**>(ppDsts
);
223 // Unswizzle from SWR-Z order
224 simd4scalari quad0
= SIMD128::load_si(&pSrc128
[0]); // 0 1 2 3
225 simd4scalari quad1
= SIMD128::load_si(&pSrc128
[1]); // 4 5 6 7
226 simd4scalari quad2
= SIMD128::load_si(&pSrc128
[2]); // 8 9 A B
227 simd4scalari quad3
= SIMD128::load_si(&pSrc128
[3]); // C D E F
229 SIMD128::storeu_si(ppDsts128
[0], SIMD128::unpacklo_epi64(quad0
, quad1
)); // 0 1 4 5
230 SIMD128::storeu_si(ppDsts128
[1], SIMD128::unpackhi_epi64(quad0
, quad1
)); // 2 3 6 7
231 SIMD128::storeu_si(ppDsts128
[2], SIMD128::unpacklo_epi64(quad2
, quad3
)); // 8 9 C D
232 SIMD128::storeu_si(ppDsts128
[3], SIMD128::unpackhi_epi64(quad2
, quad3
)); // A B E F
236 //////////////////////////////////////////////////////////////////////////
237 /// StorePixels (32-bit pixel specialization)
238 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
239 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
240 /// @param ppDsts - Array of destination pointers. Each pointer is
241 /// to a single row of at most 16B.
242 /// @tparam NumDests - Number of destination pointers. Each pair of
243 /// pointers is for a 16-byte column of two rows.
244 //////////////////////////////////////////////////////////////////////////
246 struct StorePixels
<64, 4>
248 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[4])
250 // Each 4-pixel row is 32 bytes.
251 const simd4scalari
* pPixSrc
= (const simd4scalari
*)pSrc
;
253 // order of pointers match SWR-Z layout
254 simd4scalari
** pvDsts
= (simd4scalari
**)&ppDsts
[0];
255 *pvDsts
[0] = pPixSrc
[0];
256 *pvDsts
[1] = pPixSrc
[1];
257 *pvDsts
[2] = pPixSrc
[2];
258 *pvDsts
[3] = pPixSrc
[3];
263 struct StorePixels
<64, 8>
265 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[8])
267 // 8 x 16 bytes = 128 bytes, 16 pixels
268 const simd4scalari
*pSrc128
= reinterpret_cast<const simd4scalari
*>(pSrc
);
270 simd4scalari
**ppDsts128
= reinterpret_cast<simd4scalari
**>(ppDsts
);
272 // order of pointers match SWR-Z layout
273 *ppDsts128
[0] = pSrc128
[0]; // 0 1
274 *ppDsts128
[1] = pSrc128
[1]; // 2 3
275 *ppDsts128
[2] = pSrc128
[2]; // 4 5
276 *ppDsts128
[3] = pSrc128
[3]; // 6 7
277 *ppDsts128
[4] = pSrc128
[4]; // 8 9
278 *ppDsts128
[5] = pSrc128
[5]; // A B
279 *ppDsts128
[6] = pSrc128
[6]; // C D
280 *ppDsts128
[7] = pSrc128
[7]; // E F
284 //////////////////////////////////////////////////////////////////////////
285 /// StorePixels (32-bit pixel specialization)
286 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
287 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
288 /// @param ppDsts - Array of destination pointers. Each pointer is
289 /// to a single row of at most 16B.
290 /// @tparam NumDests - Number of destination pointers. Each pair of
291 /// pointers is for a 16-byte column of two rows.
292 //////////////////////////////////////////////////////////////////////////
294 struct StorePixels
<128, 8>
296 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[8])
298 // Each 4-pixel row is 64 bytes.
299 const simd4scalari
* pPixSrc
= (const simd4scalari
*)pSrc
;
301 // Unswizzle from SWR-Z order
302 simd4scalari
** pvDsts
= (simd4scalari
**)&ppDsts
[0];
303 *pvDsts
[0] = pPixSrc
[0];
304 *pvDsts
[1] = pPixSrc
[2];
305 *pvDsts
[2] = pPixSrc
[1];
306 *pvDsts
[3] = pPixSrc
[3];
307 *pvDsts
[4] = pPixSrc
[4];
308 *pvDsts
[5] = pPixSrc
[6];
309 *pvDsts
[6] = pPixSrc
[5];
310 *pvDsts
[7] = pPixSrc
[7];
315 struct StorePixels
<128, 16>
317 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[16])
319 // 16 x 16 bytes = 256 bytes, 16 pixels
320 const simd4scalari
*pSrc128
= reinterpret_cast<const simd4scalari
*>(pSrc
);
322 simd4scalari
**ppDsts128
= reinterpret_cast<simd4scalari
**>(ppDsts
);
324 for (uint32_t i
= 0; i
< 16; i
+= 4)
326 *ppDsts128
[i
+ 0] = pSrc128
[i
+ 0];
327 *ppDsts128
[i
+ 1] = pSrc128
[i
+ 2];
328 *ppDsts128
[i
+ 2] = pSrc128
[i
+ 1];
329 *ppDsts128
[i
+ 3] = pSrc128
[i
+ 3];
334 //////////////////////////////////////////////////////////////////////////
335 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
336 //////////////////////////////////////////////////////////////////////////
337 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
338 struct ConvertPixelsSOAtoAOS
340 //////////////////////////////////////////////////////////////////////////
341 /// @brief Converts a SIMD from the Hot Tile to the destination format
342 /// and converts from SOA to AOS.
343 /// @param pSrc - Pointer to raster tile.
344 /// @param pDst - Pointer to destination surface or deswizzling buffer.
345 template <size_t NumDests
>
346 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
348 static const uint32_t MAX_RASTER_TILE_BYTES
= 16 * 16; // 16 pixels * 16 bytes per pixel
350 OSALIGNSIMD16(uint8_t) soaTile
[MAX_RASTER_TILE_BYTES
];
351 OSALIGNSIMD16(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
353 // Convert from SrcFormat --> DstFormat
355 LoadSOA
<SrcFormat
>(pSrc
, src
);
356 StoreSOA
<DstFormat
>(src
, soaTile
);
358 // Convert from SOA --> AOS
359 FormatTraits
<DstFormat
>::TransposeT::Transpose_simd16(soaTile
, aosTile
);
361 // Store data into destination
362 StorePixels
<FormatTraits
<DstFormat
>::bpp
, NumDests
>::Store(aosTile
, ppDsts
);
366 //////////////////////////////////////////////////////////////////////////
367 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
368 /// Specialization for no format conversion
369 //////////////////////////////////////////////////////////////////////////
370 template<SWR_FORMAT Format
>
371 struct ConvertPixelsSOAtoAOS
<Format
, Format
>
373 //////////////////////////////////////////////////////////////////////////
374 /// @brief Converts a SIMD from the Hot Tile to the destination format
375 /// and converts from SOA to AOS.
376 /// @param pSrc - Pointer to raster tile.
377 /// @param pDst - Pointer to destination surface or deswizzling buffer.
378 template <size_t NumDests
>
379 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
381 static const uint32_t MAX_RASTER_TILE_BYTES
= 16 * 16; // 16 pixels * 16 bytes per pixel
383 OSALIGNSIMD16(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
385 // Convert from SOA --> AOS
386 FormatTraits
<Format
>::TransposeT::Transpose_simd16(pSrc
, aosTile
);
388 // Store data into destination
389 StorePixels
<FormatTraits
<Format
>::bpp
, NumDests
>::Store(aosTile
, ppDsts
);
393 //////////////////////////////////////////////////////////////////////////
394 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
395 //////////////////////////////////////////////////////////////////////////
397 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, B5G6R5_UNORM
>
399 //////////////////////////////////////////////////////////////////////////
400 /// @brief Converts a SIMD from the Hot Tile to the destination format
401 /// and converts from SOA to AOS.
402 /// @param pSrc - Pointer to raster tile.
403 /// @param pDst - Pointer to destination surface or deswizzling buffer.
404 template <size_t NumDests
>
405 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
407 static const SWR_FORMAT SrcFormat
= R32G32B32A32_FLOAT
;
408 static const SWR_FORMAT DstFormat
= B5G6R5_UNORM
;
410 static const uint32_t MAX_RASTER_TILE_BYTES
= 16 * 16; // 16 pixels * 16 bytes per pixel
412 OSALIGNSIMD16(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
415 simd16vector src
, dst
;
416 LoadSOA
<SrcFormat
>(pSrc
, src
);
419 dst
.x
= src
[FormatTraits
<DstFormat
>::swizzle(0)];
420 dst
.y
= src
[FormatTraits
<DstFormat
>::swizzle(1)];
421 dst
.z
= src
[FormatTraits
<DstFormat
>::swizzle(2)];
424 dst
.x
= Clamp
<DstFormat
>(dst
.x
, 0);
425 dst
.y
= Clamp
<DstFormat
>(dst
.y
, 1);
426 dst
.z
= Clamp
<DstFormat
>(dst
.z
, 2);
429 dst
.x
= Normalize
<DstFormat
>(dst
.x
, 0);
430 dst
.y
= Normalize
<DstFormat
>(dst
.y
, 1);
431 dst
.z
= Normalize
<DstFormat
>(dst
.z
, 2);
434 simd16scalari packed
= _simd16_castps_si(dst
.x
);
436 SWR_ASSERT(FormatTraits
<DstFormat
>::GetBPC(0) == 5);
437 SWR_ASSERT(FormatTraits
<DstFormat
>::GetBPC(1) == 6);
439 packed
= _simd16_or_si(packed
, _simd16_slli_epi32(_simd16_castps_si(dst
.y
), 5));
440 packed
= _simd16_or_si(packed
, _simd16_slli_epi32(_simd16_castps_si(dst
.z
), 5 + 6));
442 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
443 uint32_t *pPacked
= (uint32_t*)&packed
;
444 uint16_t *pAosTile
= (uint16_t*)&aosTile
[0];
445 for (uint32_t t
= 0; t
< KNOB_SIMD16_WIDTH
; ++t
)
447 *pAosTile
++ = *pPacked
++;
450 // Store data into destination
451 StorePixels
<FormatTraits
<DstFormat
>::bpp
, NumDests
>::Store(aosTile
, ppDsts
);
455 //////////////////////////////////////////////////////////////////////////
456 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
457 //////////////////////////////////////////////////////////////////////////
459 struct ConvertPixelsSOAtoAOS
<R32_FLOAT
, R24_UNORM_X8_TYPELESS
>
461 static const SWR_FORMAT SrcFormat
= R32_FLOAT
;
462 static const SWR_FORMAT DstFormat
= R24_UNORM_X8_TYPELESS
;
464 //////////////////////////////////////////////////////////////////////////
465 /// @brief Converts a SIMD from the Hot Tile to the destination format
466 /// and converts from SOA to AOS.
467 /// @param pSrc - Pointer to raster tile.
468 /// @param pDst - Pointer to destination surface or deswizzling buffer.
469 template <size_t NumDests
>
470 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
472 simd16scalar comp
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
475 const simd16scalar zero
= _simd16_setzero_ps();
476 const simd16scalar ones
= _simd16_set1_ps(1.0f
);
478 comp
= _simd16_max_ps(comp
, zero
);
479 comp
= _simd16_min_ps(comp
, ones
);
482 comp
= _simd16_mul_ps(comp
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
484 simd16scalari temp
= _simd16_cvtps_epi32(comp
);
487 temp
= _simd16_permute_epi32(temp
, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
489 // merge/store data into destination but don't overwrite the X8 bits
490 simdscalari destlo
= _simd_loadu2_si(reinterpret_cast<simd4scalari
*>(ppDsts
[1]), reinterpret_cast<simd4scalari
*>(ppDsts
[0]));
491 simdscalari desthi
= _simd_loadu2_si(reinterpret_cast<simd4scalari
*>(ppDsts
[3]), reinterpret_cast<simd4scalari
*>(ppDsts
[2]));
493 simd16scalari dest
= _simd16_setzero_si();
495 dest
= _simd16_insert_si(dest
, destlo
, 0);
496 dest
= _simd16_insert_si(dest
, desthi
, 1);
498 simd16scalari mask
= _simd16_set1_epi32(0x00FFFFFF);
500 dest
= _simd16_or_si(_simd16_andnot_si(mask
, dest
), _simd16_and_si(mask
, temp
));
502 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(ppDsts
[1]), reinterpret_cast<simd4scalari
*>(ppDsts
[0]), _simd16_extract_si(dest
, 0));
503 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(ppDsts
[3]), reinterpret_cast<simd4scalari
*>(ppDsts
[2]), _simd16_extract_si(dest
, 1));
507 template<SWR_FORMAT DstFormat
>
508 INLINE
static void FlatConvert(const uint8_t* pSrc
, uint8_t* pDst0
, uint8_t* pDst1
, uint8_t* pDst2
, uint8_t* pDst3
)
510 // swizzle rgba -> bgra while we load
511 simd16scalar comp0
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(0) * sizeof(simd16scalar
))); // float32 rrrrrrrrrrrrrrrr
512 simd16scalar comp1
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(1) * sizeof(simd16scalar
))); // float32 gggggggggggggggg
513 simd16scalar comp2
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(2) * sizeof(simd16scalar
))); // float32 bbbbbbbbbbbbbbbb
514 simd16scalar comp3
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(3) * sizeof(simd16scalar
))); // float32 aaaaaaaaaaaaaaaa
517 const simd16scalar zero
= _simd16_setzero_ps();
518 const simd16scalar ones
= _simd16_set1_ps(1.0f
);
520 comp0
= _simd16_max_ps(comp0
, zero
);
521 comp0
= _simd16_min_ps(comp0
, ones
);
523 comp1
= _simd16_max_ps(comp1
, zero
);
524 comp1
= _simd16_min_ps(comp1
, ones
);
526 comp2
= _simd16_max_ps(comp2
, zero
);
527 comp2
= _simd16_min_ps(comp2
, ones
);
529 comp3
= _simd16_max_ps(comp3
, zero
);
530 comp3
= _simd16_min_ps(comp3
, ones
);
532 // gamma-correct only rgb
533 if (FormatTraits
<DstFormat
>::isSRGB
)
535 comp0
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(0, comp0
);
536 comp1
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(1, comp1
);
537 comp2
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(2, comp2
);
540 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
541 comp0
= _simd16_mul_ps(comp0
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
542 comp1
= _simd16_mul_ps(comp1
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(1)));
543 comp2
= _simd16_mul_ps(comp2
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(2)));
544 comp3
= _simd16_mul_ps(comp3
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(3)));
546 // moving to 16 wide integer vector types
547 simd16scalari src0
= _simd16_cvtps_epi32(comp0
); // padded byte rrrrrrrrrrrrrrrr
548 simd16scalari src1
= _simd16_cvtps_epi32(comp1
); // padded byte gggggggggggggggg
549 simd16scalari src2
= _simd16_cvtps_epi32(comp2
); // padded byte bbbbbbbbbbbbbbbb
550 simd16scalari src3
= _simd16_cvtps_epi32(comp3
); // padded byte aaaaaaaaaaaaaaaa
552 // SOA to AOS conversion
553 src1
= _simd16_slli_epi32(src1
, 8);
554 src2
= _simd16_slli_epi32(src2
, 16);
555 src3
= _simd16_slli_epi32(src3
, 24);
557 simd16scalari final
= _simd16_or_si(_simd16_or_si(src0
, src1
), _simd16_or_si(src2
, src3
)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
559 // de-swizzle conversion
561 simd16scalari final0
= _simd16_permute2f128_si(final
, final
, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
562 simd16scalari final1
= _simd16_permute2f128_si(final
, final
, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
564 final
= _simd16_shuffle_epi64(final0
, final1
, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
567 final
= _simd16_permute_epi32(final
, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
570 // store 8x2 memory order:
571 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
572 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
573 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(pDst1
), reinterpret_cast<simd4scalari
*>(pDst0
), _simd16_extract_si(final
, 0));
574 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(pDst3
), reinterpret_cast<simd4scalari
*>(pDst2
), _simd16_extract_si(final
, 1));
577 template<SWR_FORMAT DstFormat
>
578 INLINE
static void FlatConvert(const uint8_t* pSrc
, uint8_t* pDst
, uint8_t* pDst1
)
580 static const uint32_t offset
= sizeof(simdscalar
);
582 // swizzle rgba -> bgra while we load
583 simdscalar vComp0
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(0))*offset
)); // float32 rrrrrrrr
584 simdscalar vComp1
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(1))*offset
)); // float32 gggggggg
585 simdscalar vComp2
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(2))*offset
)); // float32 bbbbbbbb
586 simdscalar vComp3
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(3))*offset
)); // float32 aaaaaaaa
589 vComp0
= _simd_max_ps(vComp0
, _simd_setzero_ps());
590 vComp0
= _simd_min_ps(vComp0
, _simd_set1_ps(1.0f
));
592 vComp1
= _simd_max_ps(vComp1
, _simd_setzero_ps());
593 vComp1
= _simd_min_ps(vComp1
, _simd_set1_ps(1.0f
));
595 vComp2
= _simd_max_ps(vComp2
, _simd_setzero_ps());
596 vComp2
= _simd_min_ps(vComp2
, _simd_set1_ps(1.0f
));
598 vComp3
= _simd_max_ps(vComp3
, _simd_setzero_ps());
599 vComp3
= _simd_min_ps(vComp3
, _simd_set1_ps(1.0f
));
601 if (FormatTraits
<DstFormat
>::isSRGB
)
603 // Gamma-correct only rgb
604 vComp0
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(0, vComp0
);
605 vComp1
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(1, vComp1
);
606 vComp2
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(2, vComp2
);
609 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
610 vComp0
= _simd_mul_ps(vComp0
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
611 vComp1
= _simd_mul_ps(vComp1
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(1)));
612 vComp2
= _simd_mul_ps(vComp2
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(2)));
613 vComp3
= _simd_mul_ps(vComp3
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(3)));
615 // moving to 8 wide integer vector types
616 simdscalari src0
= _simd_cvtps_epi32(vComp0
); // padded byte rrrrrrrr
617 simdscalari src1
= _simd_cvtps_epi32(vComp1
); // padded byte gggggggg
618 simdscalari src2
= _simd_cvtps_epi32(vComp2
); // padded byte bbbbbbbb
619 simdscalari src3
= _simd_cvtps_epi32(vComp3
); // padded byte aaaaaaaa
621 #if KNOB_ARCH <= KNOB_ARCH_AVX
623 // splitting into two sets of 4 wide integer vector types
624 // because AVX doesn't have instructions to support this operation at 8 wide
625 simd4scalari srcLo0
= _mm256_castsi256_si128(src0
); // 000r000r000r000r
626 simd4scalari srcLo1
= _mm256_castsi256_si128(src1
); // 000g000g000g000g
627 simd4scalari srcLo2
= _mm256_castsi256_si128(src2
); // 000b000b000b000b
628 simd4scalari srcLo3
= _mm256_castsi256_si128(src3
); // 000a000a000a000a
630 simd4scalari srcHi0
= _mm256_extractf128_si256(src0
, 1); // 000r000r000r000r
631 simd4scalari srcHi1
= _mm256_extractf128_si256(src1
, 1); // 000g000g000g000g
632 simd4scalari srcHi2
= _mm256_extractf128_si256(src2
, 1); // 000b000b000b000b
633 simd4scalari srcHi3
= _mm256_extractf128_si256(src3
, 1); // 000a000a000a000a
635 srcLo1
= _mm_slli_si128(srcLo1
, 1); // 00g000g000g000g0
636 srcHi1
= _mm_slli_si128(srcHi1
, 1); // 00g000g000g000g0
637 srcLo2
= _mm_slli_si128(srcLo2
, 2); // 0b000b000b000b00
638 srcHi2
= _mm_slli_si128(srcHi2
, 2); // 0b000b000b000b00
639 srcLo3
= _mm_slli_si128(srcLo3
, 3); // a000a000a000a000
640 srcHi3
= _mm_slli_si128(srcHi3
, 3); // a000a000a000a000
642 srcLo0
= SIMD128::or_si(srcLo0
, srcLo1
); // 00gr00gr00gr00gr
643 srcLo2
= SIMD128::or_si(srcLo2
, srcLo3
); // ab00ab00ab00ab00
645 srcHi0
= SIMD128::or_si(srcHi0
, srcHi1
); // 00gr00gr00gr00gr
646 srcHi2
= SIMD128::or_si(srcHi2
, srcHi3
); // ab00ab00ab00ab00
648 srcLo0
= SIMD128::or_si(srcLo0
, srcLo2
); // abgrabgrabgrabgr
649 srcHi0
= SIMD128::or_si(srcHi0
, srcHi2
); // abgrabgrabgrabgr
651 // unpack into rows that get the tiling order correct
652 simd4scalari vRow00
= SIMD128::unpacklo_epi64(srcLo0
, srcHi0
); // abgrabgrabgrabgrabgrabgrabgrabgr
653 simd4scalari vRow10
= SIMD128::unpackhi_epi64(srcLo0
, srcHi0
);
655 simdscalari final
= _mm256_castsi128_si256(vRow00
);
656 final
= _mm256_insertf128_si256(final
, vRow10
, 1);
660 // logic is as above, only wider
661 src1
= _mm256_slli_si256(src1
, 1);
662 src2
= _mm256_slli_si256(src2
, 2);
663 src3
= _mm256_slli_si256(src3
, 3);
665 src0
= _mm256_or_si256(src0
, src1
);
666 src2
= _mm256_or_si256(src2
, src3
);
668 simdscalari final
= _mm256_or_si256(src0
, src2
);
670 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
671 final
= _mm256_permute4x64_epi64(final
, 0xD8);
674 _simd_storeu2_si((simd4scalari
*)pDst1
, (simd4scalari
*)pDst
, final
);
677 template<SWR_FORMAT DstFormat
>
678 INLINE
static void FlatConvertNoAlpha(const uint8_t* pSrc
, uint8_t* pDst0
, uint8_t* pDst1
, uint8_t* pDst2
, uint8_t* pDst3
)
680 // swizzle rgba -> bgra while we load
681 simd16scalar comp0
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(0) * sizeof(simd16scalar
))); // float32 rrrrrrrrrrrrrrrr
682 simd16scalar comp1
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(1) * sizeof(simd16scalar
))); // float32 gggggggggggggggg
683 simd16scalar comp2
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(2) * sizeof(simd16scalar
))); // float32 bbbbbbbbbbbbbbbb
686 const simd16scalar zero
= _simd16_setzero_ps();
687 const simd16scalar ones
= _simd16_set1_ps(1.0f
);
689 comp0
= _simd16_max_ps(comp0
, zero
);
690 comp0
= _simd16_min_ps(comp0
, ones
);
692 comp1
= _simd16_max_ps(comp1
, zero
);
693 comp1
= _simd16_min_ps(comp1
, ones
);
695 comp2
= _simd16_max_ps(comp2
, zero
);
696 comp2
= _simd16_min_ps(comp2
, ones
);
698 // gamma-correct only rgb
699 if (FormatTraits
<DstFormat
>::isSRGB
)
701 comp0
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(0, comp0
);
702 comp1
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(1, comp1
);
703 comp2
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(2, comp2
);
706 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
707 comp0
= _simd16_mul_ps(comp0
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
708 comp1
= _simd16_mul_ps(comp1
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(1)));
709 comp2
= _simd16_mul_ps(comp2
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(2)));
711 // moving to 16 wide integer vector types
712 simd16scalari src0
= _simd16_cvtps_epi32(comp0
); // padded byte rrrrrrrrrrrrrrrr
713 simd16scalari src1
= _simd16_cvtps_epi32(comp1
); // padded byte gggggggggggggggg
714 simd16scalari src2
= _simd16_cvtps_epi32(comp2
); // padded byte bbbbbbbbbbbbbbbb
716 // SOA to AOS conversion
717 src1
= _simd16_slli_epi32(src1
, 8);
718 src2
= _simd16_slli_epi32(src2
, 16);
720 simd16scalari final
= _simd16_or_si(_simd16_or_si(src0
, src1
), src2
); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
722 // de-swizzle conversion
724 simd16scalari final0
= _simd16_permute2f128_si(final
, final
, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
725 simd16scalari final1
= _simd16_permute2f128_si(final
, final
, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
727 final
= _simd16_shuffle_epi64(final0
, final1
, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
730 final
= _simd16_permute_epi32(final
, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
733 // store 8x2 memory order:
734 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
735 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
736 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(pDst1
), reinterpret_cast<simd4scalari
*>(pDst0
), _simd16_extract_si(final
, 0));
737 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(pDst3
), reinterpret_cast<simd4scalari
*>(pDst2
), _simd16_extract_si(final
, 1));
740 template<SWR_FORMAT DstFormat
>
741 INLINE
static void FlatConvertNoAlpha(const uint8_t* pSrc
, uint8_t* pDst
, uint8_t* pDst1
)
743 static const uint32_t offset
= sizeof(simdscalar
);
745 // swizzle rgba -> bgra while we load
746 simdscalar vComp0
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(0))*offset
)); // float32 rrrrrrrr
747 simdscalar vComp1
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(1))*offset
)); // float32 gggggggg
748 simdscalar vComp2
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(2))*offset
)); // float32 bbbbbbbb
750 vComp0
= _simd_max_ps(vComp0
, _simd_setzero_ps());
751 vComp0
= _simd_min_ps(vComp0
, _simd_set1_ps(1.0f
));
753 vComp1
= _simd_max_ps(vComp1
, _simd_setzero_ps());
754 vComp1
= _simd_min_ps(vComp1
, _simd_set1_ps(1.0f
));
756 vComp2
= _simd_max_ps(vComp2
, _simd_setzero_ps());
757 vComp2
= _simd_min_ps(vComp2
, _simd_set1_ps(1.0f
));
759 if (FormatTraits
<DstFormat
>::isSRGB
)
761 // Gamma-correct only rgb
762 vComp0
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(0, vComp0
);
763 vComp1
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(1, vComp1
);
764 vComp2
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(2, vComp2
);
767 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
768 vComp0
= _simd_mul_ps(vComp0
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
769 vComp1
= _simd_mul_ps(vComp1
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(1)));
770 vComp2
= _simd_mul_ps(vComp2
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(2)));
772 // moving to 8 wide integer vector types
773 simdscalari src0
= _simd_cvtps_epi32(vComp0
); // padded byte rrrrrrrr
774 simdscalari src1
= _simd_cvtps_epi32(vComp1
); // padded byte gggggggg
775 simdscalari src2
= _simd_cvtps_epi32(vComp2
); // padded byte bbbbbbbb
777 #if KNOB_ARCH <= KNOB_ARCH_AVX
779 // splitting into two sets of 4 wide integer vector types
780 // because AVX doesn't have instructions to support this operation at 8 wide
781 simd4scalari srcLo0
= _mm256_castsi256_si128(src0
); // 000r000r000r000r
782 simd4scalari srcLo1
= _mm256_castsi256_si128(src1
); // 000g000g000g000g
783 simd4scalari srcLo2
= _mm256_castsi256_si128(src2
); // 000b000b000b000b
785 simd4scalari srcHi0
= _mm256_extractf128_si256(src0
, 1); // 000r000r000r000r
786 simd4scalari srcHi1
= _mm256_extractf128_si256(src1
, 1); // 000g000g000g000g
787 simd4scalari srcHi2
= _mm256_extractf128_si256(src2
, 1); // 000b000b000b000b
789 srcLo1
= _mm_slli_si128(srcLo1
, 1); // 00g000g000g000g0
790 srcHi1
= _mm_slli_si128(srcHi1
, 1); // 00g000g000g000g0
791 srcLo2
= _mm_slli_si128(srcLo2
, 2); // 0b000b000b000b00
792 srcHi2
= _mm_slli_si128(srcHi2
, 2); // 0b000b000b000b00
794 srcLo0
= SIMD128::or_si(srcLo0
, srcLo1
); // 00gr00gr00gr00gr
796 srcHi0
= SIMD128::or_si(srcHi0
, srcHi1
); // 00gr00gr00gr00gr
798 srcLo0
= SIMD128::or_si(srcLo0
, srcLo2
); // 0bgr0bgr0bgr0bgr
799 srcHi0
= SIMD128::or_si(srcHi0
, srcHi2
); // 0bgr0bgr0bgr0bgr
801 // unpack into rows that get the tiling order correct
802 simd4scalari vRow00
= SIMD128::unpacklo_epi64(srcLo0
, srcHi0
); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
803 simd4scalari vRow10
= SIMD128::unpackhi_epi64(srcLo0
, srcHi0
);
805 simdscalari final
= _mm256_castsi128_si256(vRow00
);
806 final
= _mm256_insertf128_si256(final
, vRow10
, 1);
810 // logic is as above, only wider
811 src1
= _mm256_slli_si256(src1
, 1);
812 src2
= _mm256_slli_si256(src2
, 2);
814 src0
= _mm256_or_si256(src0
, src1
);
816 simdscalari final
= _mm256_or_si256(src0
, src2
);
818 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
819 final
= _mm256_permute4x64_epi64(final
, 0xD8);
823 _simd_storeu2_si((simd4scalari
*)pDst1
, (simd4scalari
*)pDst
, final
);
827 struct ConvertPixelsSOAtoAOS
<R32G32B32A32_FLOAT
, B8G8R8A8_UNORM
>
829 template <size_t NumDests
>
830 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
832 FlatConvert
<B8G8R8A8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
837 struct ConvertPixelsSOAtoAOS
<R32G32B32A32_FLOAT
, B8G8R8X8_UNORM
>
839 template <size_t NumDests
>
840 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
842 FlatConvertNoAlpha
<B8G8R8X8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
847 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, B8G8R8A8_UNORM_SRGB
>
849 template <size_t NumDests
>
850 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
852 FlatConvert
<B8G8R8A8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
857 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, B8G8R8X8_UNORM_SRGB
>
859 template <size_t NumDests
>
860 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
862 FlatConvertNoAlpha
<B8G8R8X8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
867 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8A8_UNORM
>
869 template <size_t NumDests
>
870 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
872 FlatConvert
<R8G8B8A8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
877 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8X8_UNORM
>
879 template <size_t NumDests
>
880 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
882 FlatConvertNoAlpha
<R8G8B8X8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
887 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8A8_UNORM_SRGB
>
889 template <size_t NumDests
>
890 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
892 FlatConvert
<R8G8B8A8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
897 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8X8_UNORM_SRGB
>
899 template <size_t NumDests
>
900 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
902 FlatConvertNoAlpha
<R8G8B8X8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
906 //////////////////////////////////////////////////////////////////////////
908 //////////////////////////////////////////////////////////////////////////
909 template<typename TTraits
, SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
910 struct StoreRasterTile
912 //////////////////////////////////////////////////////////////////////////
913 /// @brief Retrieve color from hot tile source which is always float.
914 /// @param pSrc - Pointer to raster tile.
915 /// @param x, y - Coordinates to raster tile.
916 /// @param output - output color
917 INLINE
static void GetSwizzledSrcColor(
919 uint32_t x
, uint32_t y
,
920 float outputColor
[4])
922 typedef SimdTile_16
<SrcFormat
, DstFormat
> SimdT
;
924 SimdT
*pSrcSimdTiles
= reinterpret_cast<SimdT
*>(pSrc
);
926 // Compute which simd tile we're accessing within 8x8 tile.
927 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
928 uint32_t simdIndex
= (y
/ SIMD16_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD16_TILE_X_DIM
) + (x
/ SIMD16_TILE_X_DIM
);
930 SimdT
*pSimdTile
= &pSrcSimdTiles
[simdIndex
];
932 uint32_t simdOffset
= (y
% SIMD16_TILE_Y_DIM
) * SIMD16_TILE_X_DIM
+ (x
% SIMD16_TILE_X_DIM
);
934 pSimdTile
->GetSwizzledColor(simdOffset
, outputColor
);
937 //////////////////////////////////////////////////////////////////////////
938 /// @brief Stores an 8x8 raster tile to the destination surface.
939 /// @param pSrc - Pointer to raster tile.
940 /// @param pDstSurface - Destination surface state
941 /// @param x, y - Coordinates to raster tile.
942 INLINE
static void Store(
944 SWR_SURFACE_STATE
* pDstSurface
,
945 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
) // (x, y) pixel coordinate to start of raster tile.
947 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
948 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
950 // For each raster tile pixel (rx, ry)
951 for (uint32_t ry
= 0; ry
< KNOB_TILE_Y_DIM
; ++ry
)
953 for (uint32_t rx
= 0; rx
< KNOB_TILE_X_DIM
; ++rx
)
955 // Perform bounds checking.
956 if (((x
+ rx
) < lodWidth
) &&
957 ((y
+ ry
) < lodHeight
))
960 GetSwizzledSrcColor(pSrc
, rx
, ry
, srcColor
);
962 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>((x
+ rx
), (y
+ ry
),
963 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
964 sampleNum
, pDstSurface
->lod
, pDstSurface
);
966 ConvertPixelFromFloat
<DstFormat
>(pDst
, srcColor
);
973 //////////////////////////////////////////////////////////////////////////
974 /// @brief Resolves an 8x8 raster tile to the resolve destination surface.
975 /// @param pSrc - Pointer to raster tile.
976 /// @param pDstSurface - Destination surface state
977 /// @param x, y - Coordinates to raster tile.
978 /// @param sampleOffset - Offset between adjacent multisamples
979 INLINE
static void Resolve(
981 SWR_SURFACE_STATE
* pDstSurface
,
982 uint32_t x
, uint32_t y
, uint32_t sampleOffset
, uint32_t renderTargetArrayIndex
) // (x, y) pixel coordinate to start of raster tile.
984 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
985 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
987 float oneOverNumSamples
= 1.0f
/ pDstSurface
->numSamples
;
989 // For each raster tile pixel (rx, ry)
990 for (uint32_t ry
= 0; ry
< KNOB_TILE_Y_DIM
; ++ry
)
992 for (uint32_t rx
= 0; rx
< KNOB_TILE_X_DIM
; ++rx
)
994 // Perform bounds checking.
995 if (((x
+ rx
) < lodWidth
) &&
996 ((y
+ ry
) < lodHeight
))
998 // Sum across samples
999 float resolveColor
[4] = {0};
1000 for (uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
1002 float sampleColor
[4] = {0};
1003 uint8_t *pSampleSrc
= pSrc
+ sampleOffset
* sampleNum
;
1004 GetSwizzledSrcColor(pSampleSrc
, rx
, ry
, sampleColor
);
1005 resolveColor
[0] += sampleColor
[0];
1006 resolveColor
[1] += sampleColor
[1];
1007 resolveColor
[2] += sampleColor
[2];
1008 resolveColor
[3] += sampleColor
[3];
1011 // Divide by numSamples to average
1012 resolveColor
[0] *= oneOverNumSamples
;
1013 resolveColor
[1] *= oneOverNumSamples
;
1014 resolveColor
[2] *= oneOverNumSamples
;
1015 resolveColor
[3] *= oneOverNumSamples
;
1017 // Use the resolve surface state
1018 SWR_SURFACE_STATE
* pResolveSurface
= (SWR_SURFACE_STATE
*)pDstSurface
->xpAuxBaseAddress
;
1019 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>((x
+ rx
), (y
+ ry
),
1020 pResolveSurface
->arrayIndex
+ renderTargetArrayIndex
, pResolveSurface
->arrayIndex
+ renderTargetArrayIndex
,
1021 0, pResolveSurface
->lod
, pResolveSurface
);
1023 ConvertPixelFromFloat
<DstFormat
>(pDst
, resolveColor
);
1032 template<typename TTraits
, SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1033 struct OptStoreRasterTile
: StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>
1036 //////////////////////////////////////////////////////////////////////////
1037 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1038 //////////////////////////////////////////////////////////////////////////
1039 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1040 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 8>, SrcFormat
, DstFormat
>
1042 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 8>, SrcFormat
, DstFormat
> GenericStoreTile
;
1043 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1044 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1046 //////////////////////////////////////////////////////////////////////////
1047 /// @brief Stores an 8x8 raster tile to the destination surface.
1048 /// @param pSrc - Pointer to raster tile.
1049 /// @param pDstSurface - Destination surface state
1050 /// @param x, y - Coordinates to raster tile.
1051 INLINE
static void Store(
1053 SWR_SURFACE_STATE
* pDstSurface
,
1054 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1056 // Punt non-full tiles to generic store
1057 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1058 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1060 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1062 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1065 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1066 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1068 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1069 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1073 pDst
, // row 0, col 0
1074 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1075 pDst
+ dx
/ 2, // row 0, col 1
1076 pDst
+ pDstSurface
->pitch
+ dx
/ 2 // row 1, col 1
1079 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1081 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1083 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1085 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1101 //////////////////////////////////////////////////////////////////////////
1102 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1103 //////////////////////////////////////////////////////////////////////////
1104 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1105 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 16>, SrcFormat
, DstFormat
>
1107 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 16>, SrcFormat
, DstFormat
> GenericStoreTile
;
1108 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1109 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1111 //////////////////////////////////////////////////////////////////////////
1112 /// @brief Stores an 8x8 raster tile to the destination surface.
1113 /// @param pSrc - Pointer to raster tile.
1114 /// @param pDstSurface - Destination surface state
1115 /// @param x, y - Coordinates to raster tile.
1116 INLINE
static void Store(
1118 SWR_SURFACE_STATE
* pDstSurface
,
1119 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1121 // Punt non-full tiles to generic store
1122 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1123 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1125 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1127 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1130 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1131 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1133 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1134 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1138 pDst
, // row 0, col 0
1139 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1140 pDst
+ dx
/ 2, // row 0, col 1
1141 pDst
+ pDstSurface
->pitch
+ dx
/ 2 // row 1, col 1
1144 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1146 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1148 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1150 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1166 //////////////////////////////////////////////////////////////////////////
1167 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1168 //////////////////////////////////////////////////////////////////////////
1169 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1170 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 32>, SrcFormat
, DstFormat
>
1172 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 32>, SrcFormat
, DstFormat
> GenericStoreTile
;
1173 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1174 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1176 //////////////////////////////////////////////////////////////////////////
1177 /// @brief Stores an 8x8 raster tile to the destination surface.
1178 /// @param pSrc - Pointer to raster tile.
1179 /// @param pDstSurface - Destination surface state
1180 /// @param x, y - Coordinates to raster tile.
1181 INLINE
static void Store(
1183 SWR_SURFACE_STATE
* pDstSurface
,
1184 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1186 // Punt non-full tiles to generic store
1187 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1188 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1190 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1192 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1195 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1196 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1198 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1199 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1203 pDst
, // row 0, col 0
1204 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1205 pDst
+ dx
/ 2, // row 0, col 1
1206 pDst
+ pDstSurface
->pitch
+ dx
/ 2 // row 1, col 1
1209 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1211 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1213 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1215 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1231 //////////////////////////////////////////////////////////////////////////
1232 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1233 //////////////////////////////////////////////////////////////////////////
1234 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1235 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 64>, SrcFormat
, DstFormat
>
1237 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 64>, SrcFormat
, DstFormat
> GenericStoreTile
;
1238 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1239 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1240 static const size_t MAX_DST_COLUMN_BYTES
= 16;
1242 //////////////////////////////////////////////////////////////////////////
1243 /// @brief Stores an 8x8 raster tile to the destination surface.
1244 /// @param pSrc - Pointer to raster tile.
1245 /// @param pDstSurface - Destination surface state
1246 /// @param x, y - Coordinates to raster tile.
1247 INLINE
static void Store(
1249 SWR_SURFACE_STATE
* pDstSurface
,
1250 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1252 // Punt non-full tiles to generic store
1253 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1254 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1256 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1258 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1261 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1262 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1264 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1265 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
;
1267 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1268 static_assert(dx
== MAX_DST_COLUMN_BYTES
* 4, "Invalid column offsets");
1272 pDst
, // row 0, col 0
1273 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1274 pDst
+ MAX_DST_COLUMN_BYTES
, // row 0, col 1
1275 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
, // row 1, col 1
1276 pDst
+ MAX_DST_COLUMN_BYTES
* 2, // row 0, col 2
1277 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 2, // row 1, col 2
1278 pDst
+ MAX_DST_COLUMN_BYTES
* 3, // row 0, col 3
1279 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 3 // row 1, col 3
1282 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1284 // Raster tile width is same as simd16 tile width
1285 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1287 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1289 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1291 for (uint32_t i
= 0; i
< ARRAY_SIZE(ppDsts
); i
+= 1)
1299 //////////////////////////////////////////////////////////////////////////
1300 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1301 //////////////////////////////////////////////////////////////////////////
1302 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1303 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 128>, SrcFormat
, DstFormat
>
1305 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 128>, SrcFormat
, DstFormat
> GenericStoreTile
;
1306 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1307 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1308 static const size_t MAX_DST_COLUMN_BYTES
= 16;
1310 //////////////////////////////////////////////////////////////////////////
1311 /// @brief Stores an 8x8 raster tile to the destination surface.
1312 /// @param pSrc - Pointer to raster tile.
1313 /// @param pDstSurface - Destination surface state
1314 /// @param x, y - Coordinates to raster tile.
1315 INLINE
static void Store(
1317 SWR_SURFACE_STATE
* pDstSurface
,
1318 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1320 // Punt non-full tiles to generic store
1321 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1322 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1324 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1326 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1329 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1330 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1332 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1333 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
;
1335 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1336 static_assert(dx
== MAX_DST_COLUMN_BYTES
* 8, "Invalid column offsets");
1340 pDst
, // row 0, col 0
1341 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1342 pDst
+ MAX_DST_COLUMN_BYTES
, // row 0, col 1
1343 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
, // row 1, col 1
1344 pDst
+ MAX_DST_COLUMN_BYTES
* 2, // row 0, col 2
1345 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 2, // row 1, col 2
1346 pDst
+ MAX_DST_COLUMN_BYTES
* 3, // row 0, col 3
1347 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 3, // row 1, col 3
1348 pDst
+ MAX_DST_COLUMN_BYTES
* 4, // row 0, col 4
1349 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 4, // row 1, col 4
1350 pDst
+ MAX_DST_COLUMN_BYTES
* 5, // row 0, col 5
1351 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 5, // row 1, col 5
1352 pDst
+ MAX_DST_COLUMN_BYTES
* 6, // row 0, col 6
1353 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 6, // row 1, col 6
1354 pDst
+ MAX_DST_COLUMN_BYTES
* 7, // row 0, col 7
1355 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 7, // row 1, col 7
1358 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1360 // Raster tile width is same as simd16 tile width
1361 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1363 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1365 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1367 for (uint32_t i
= 0; i
< ARRAY_SIZE(ppDsts
); i
+= 1)
1375 //////////////////////////////////////////////////////////////////////////
1376 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1377 //////////////////////////////////////////////////////////////////////////
1378 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1379 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 8>, SrcFormat
, DstFormat
>
1381 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 8>, SrcFormat
, DstFormat
> GenericStoreTile
;
1382 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1384 //////////////////////////////////////////////////////////////////////////
1385 /// @brief Stores an 8x8 raster tile to the destination surface.
1386 /// @param pSrc - Pointer to raster tile.
1387 /// @param pDstSurface - Destination surface state
1388 /// @param x, y - Coordinates to raster tile.
1389 INLINE
static void Store(
1391 SWR_SURFACE_STATE
* pDstSurface
,
1392 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1394 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
1396 // Punt non-full tiles to generic store
1397 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1398 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1400 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1402 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1405 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1406 // We can compute the offsets to each column within the raster tile once and increment from these.
1407 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1408 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1409 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1411 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
1413 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1417 pDst
+ DestRowWidthBytes
,
1418 pDst
+ DestRowWidthBytes
/ 4,
1419 pDst
+ DestRowWidthBytes
+ DestRowWidthBytes
/ 4
1422 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1424 // Raster tile width is same as simd16 tile width
1425 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1427 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1429 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1439 //////////////////////////////////////////////////////////////////////////
1440 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1441 //////////////////////////////////////////////////////////////////////////
1442 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1443 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 16>, SrcFormat
, DstFormat
>
1445 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 16>, SrcFormat
, DstFormat
> GenericStoreTile
;
1446 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1448 //////////////////////////////////////////////////////////////////////////
1449 /// @brief Stores an 8x8 raster tile to the destination surface.
1450 /// @param pSrc - Pointer to raster tile.
1451 /// @param pDstSurface - Destination surface state
1452 /// @param x, y - Coordinates to raster tile.
1453 INLINE
static void Store(
1455 SWR_SURFACE_STATE
* pDstSurface
,
1456 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1458 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
1460 // Punt non-full tiles to generic store
1461 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1462 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1464 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1466 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1469 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1470 // We can compute the offsets to each column within the raster tile once and increment from these.
1471 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1472 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1473 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1475 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
1477 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1481 pDst
+ DestRowWidthBytes
,
1482 pDst
+ DestRowWidthBytes
/ 2,
1483 pDst
+ DestRowWidthBytes
+ DestRowWidthBytes
/ 2
1486 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1488 // Raster tile width is same as simd16 tile width
1489 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1491 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1493 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1503 //////////////////////////////////////////////////////////////////////////
1504 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1505 //////////////////////////////////////////////////////////////////////////
1506 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1507 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_XMAJOR
, 32>, SrcFormat
, DstFormat
>
1509 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_XMAJOR
, 32>, SrcFormat
, DstFormat
> GenericStoreTile
;
1510 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1511 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1513 //////////////////////////////////////////////////////////////////////////
1514 /// @brief Stores an 8x8 raster tile to the destination surface.
1515 /// @param pSrc - Pointer to raster tile.
1516 /// @param pDstSurface - Destination surface state
1517 /// @param x, y - Coordinates to raster tile.
1518 INLINE
static void Store(
1520 SWR_SURFACE_STATE
* pDstSurface
,
1521 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1523 static const uint32_t DestRowWidthBytes
= 512; // 512B rows
1525 // Punt non-full tiles to generic store
1526 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1527 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1529 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1531 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1534 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1535 // We can compute the offsets to each column within the raster tile once and increment from these.
1536 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1537 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1539 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1540 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1544 pDst
, // row 0, col 0
1545 pDst
+ DestRowWidthBytes
, // row 1, col 0
1546 pDst
+ dx
/ 2, // row 0, col 1
1547 pDst
+ DestRowWidthBytes
+ dx
/ 2 // row 1, col 1
1550 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1552 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1554 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1556 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1572 //////////////////////////////////////////////////////////////////////////
1573 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1574 //////////////////////////////////////////////////////////////////////////
1575 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1576 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 32>, SrcFormat
, DstFormat
>
1578 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 32>, SrcFormat
, DstFormat
> GenericStoreTile
;
1579 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1581 //////////////////////////////////////////////////////////////////////////
1582 /// @brief Stores an 8x8 raster tile to the destination surface.
1583 /// @param pSrc - Pointer to raster tile.
1584 /// @param pDstSurface - Destination surface state
1585 /// @param x, y - Coordinates to raster tile.
1586 INLINE
static void Store(
1588 SWR_SURFACE_STATE
* pDstSurface
,
1589 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1591 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
1592 static const uint32_t DestColumnBytes
= DestRowWidthBytes
* 32; // 16B x 32 rows.
1594 // Punt non-full tiles to generic store
1595 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1596 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1598 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1600 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1603 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1604 // We can compute the offsets to each column within the raster tile once and increment from these.
1605 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1606 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1607 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1609 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1610 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
1612 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1615 pDst
, // row 0, col 0
1616 pDst
+ DestRowWidthBytes
, // row 1, col 0
1617 pDst
+ DestColumnBytes
, // row 0, col 1
1618 pDst
+ DestRowWidthBytes
+ DestColumnBytes
// row 1, col 1
1621 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1623 // Raster tile width is same as simd16 tile width
1624 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1626 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1628 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1638 //////////////////////////////////////////////////////////////////////////
1639 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
1640 //////////////////////////////////////////////////////////////////////////
1641 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1642 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 64>, SrcFormat
, DstFormat
>
1644 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 64>, SrcFormat
, DstFormat
> GenericStoreTile
;
1645 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1647 //////////////////////////////////////////////////////////////////////////
1648 /// @brief Stores an 8x8 raster tile to the destination surface.
1649 /// @param pSrc - Pointer to raster tile.
1650 /// @param pDstSurface - Destination surface state
1651 /// @param x, y - Coordinates to raster tile.
1652 INLINE
static void Store(
1654 SWR_SURFACE_STATE
* pDstSurface
,
1655 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1657 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
1658 static const uint32_t DestColumnBytes
= DestRowWidthBytes
* 32; // 16B x 32 rows.
1660 // Punt non-full tiles to generic store
1661 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1662 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1664 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1666 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1669 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1670 // We can compute the offsets to each column within the raster tile once and increment from these.
1671 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1672 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1673 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1675 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1676 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
1678 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1681 pDst
, // row 0, col 0
1682 pDst
+ DestRowWidthBytes
, // row 1, col 0
1683 pDst
+ DestColumnBytes
, // row 0, col 1
1684 pDst
+ DestRowWidthBytes
+ DestColumnBytes
, // row 1, col 1
1685 pDst
+ DestColumnBytes
* 2, // row 0, col 2
1686 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 2, // row 1, col 2
1687 pDst
+ DestColumnBytes
* 3, // row 0, col 3
1688 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 3 // row 1, col 3
1691 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1693 // Raster tile width is same as simd16 tile width
1694 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1696 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1698 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1700 for (uint32_t i
= 0; i
< ARRAY_SIZE(ppDsts
); i
+= 1)
1708 //////////////////////////////////////////////////////////////////////////
1709 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
1710 //////////////////////////////////////////////////////////////////////////
1711 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1712 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 128>, SrcFormat
, DstFormat
>
1714 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 128>, SrcFormat
, DstFormat
> GenericStoreTile
;
1715 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1717 //////////////////////////////////////////////////////////////////////////
1718 /// @brief Stores an 8x8 raster tile to the destination surface.
1719 /// @param pSrc - Pointer to raster tile.
1720 /// @param pDstSurface - Destination surface state
1721 /// @param x, y - Coordinates to raster tile.
1722 INLINE
static void Store(
1724 SWR_SURFACE_STATE
* pDstSurface
,
1725 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1727 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
1728 static const uint32_t DestColumnBytes
= DestRowWidthBytes
* 32; // 16B x 32 rows.
1730 // Punt non-full tiles to generic store
1731 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1732 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1734 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1736 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1739 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1740 // We can compute the offsets to each column within the raster tile once and increment from these.
1741 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1742 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1743 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1745 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1746 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
1748 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1751 pDst
, // row 0, col 0
1752 pDst
+ DestRowWidthBytes
, // row 1, col 0
1753 pDst
+ DestColumnBytes
, // row 0, col 1
1754 pDst
+ DestRowWidthBytes
+ DestColumnBytes
, // row 1, col 1
1755 pDst
+ DestColumnBytes
* 2, // row 0, col 2
1756 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 2, // row 1, col 2
1757 pDst
+ DestColumnBytes
* 3, // row 0, col 3
1758 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 3, // row 1, col 3
1759 pDst
+ DestColumnBytes
* 4, // row 0, col 4
1760 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 4, // row 1, col 4
1761 pDst
+ DestColumnBytes
* 5, // row 0, col 5
1762 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 5, // row 1, col 5
1763 pDst
+ DestColumnBytes
* 6, // row 0, col 6
1764 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 6, // row 1, col 6
1765 pDst
+ DestColumnBytes
* 7, // row 0, col 7
1766 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 7 // row 1, col 7
1769 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1771 // Raster tile width is same as simd16 tile width
1772 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1774 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1776 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1778 for (uint32_t i
= 0; i
< ARRAY_SIZE(ppDsts
); i
+= 1)
1786 //////////////////////////////////////////////////////////////////////////
1787 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
1788 //////////////////////////////////////////////////////////////////////////
1789 template<typename TTraits
, SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1790 struct StoreMacroTile
1792 //////////////////////////////////////////////////////////////////////////
1793 /// @brief Stores a macrotile to the destination surface using safe implementation.
1794 /// @param pSrc - Pointer to macro tile.
1795 /// @param pDstSurface - Destination surface state
1796 /// @param x, y - Coordinates to macro tile
1797 static void StoreGeneric(
1798 uint8_t *pSrcHotTile
,
1799 SWR_SURFACE_STATE
* pDstSurface
,
1800 uint32_t x
, uint32_t y
, uint32_t renderTargetArrayIndex
)
1802 PFN_STORE_TILES_INTERNAL pfnStore
;
1803 pfnStore
= StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Store
;
1805 // Store each raster tile from the hot tile to the destination surface.
1806 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
1808 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
1810 for (uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
1812 pfnStore(pSrcHotTile
, pDstSurface
, (x
+ col
), (y
+ row
), sampleNum
, renderTargetArrayIndex
);
1813 pSrcHotTile
+= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<SrcFormat
>::bpp
/ 8);
1820 typedef void(*PFN_STORE_TILES_INTERNAL
)(uint8_t*, SWR_SURFACE_STATE
*, uint32_t, uint32_t, uint32_t, uint32_t);
1821 //////////////////////////////////////////////////////////////////////////
1822 /// @brief Stores a macrotile to the destination surface.
1823 /// @param pSrc - Pointer to macro tile.
1824 /// @param pDstSurface - Destination surface state
1825 /// @param x, y - Coordinates to macro tile
1827 uint8_t *pSrcHotTile
,
1828 SWR_SURFACE_STATE
* pDstSurface
,
1829 uint32_t x
, uint32_t y
, uint32_t renderTargetArrayIndex
)
1831 PFN_STORE_TILES_INTERNAL pfnStore
[SWR_MAX_NUM_MULTISAMPLES
];
1833 for (uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
1835 size_t dstSurfAddress
= (size_t)ComputeSurfaceAddress
<false, false>(
1838 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, // z for 3D surfaces
1839 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, // array index for 2D arrays
1844 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
1845 bool bForceGeneric
= ((pDstSurface
->tileMode
!= SWR_TILE_NONE
) && (0 != (dstSurfAddress
& 0xfff))) ||
1846 (pDstSurface
->bInterleavedSamples
);
1848 pfnStore
[sampleNum
] = (bForceGeneric
|| KNOB_USE_GENERIC_STORETILE
) ? StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Store
: OptStoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Store
;
1851 // Save original for pSrcHotTile resolve.
1852 uint8_t *pResolveSrcHotTile
= pSrcHotTile
;
1854 // Store each raster tile from the hot tile to the destination surface.
1855 for(uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
1857 for(uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
1859 for(uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
1861 pfnStore
[sampleNum
](pSrcHotTile
, pDstSurface
, (x
+ col
), (y
+ row
), sampleNum
, renderTargetArrayIndex
);
1862 pSrcHotTile
+= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<SrcFormat
>::bpp
/ 8);
1867 if (pDstSurface
->xpAuxBaseAddress
)
1869 uint32_t sampleOffset
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<SrcFormat
>::bpp
/ 8);
1870 // Store each raster tile from the hot tile to the destination surface.
1871 for(uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
1873 for(uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
1875 StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Resolve(pResolveSrcHotTile
, pDstSurface
, (x
+ col
), (y
+ row
), sampleOffset
, renderTargetArrayIndex
);
1876 pResolveSrcHotTile
+= sampleOffset
* pDstSurface
->numSamples
;
1883 //////////////////////////////////////////////////////////////////////////
1884 /// InitStoreTilesTable - Helper for setting up the tables.
1885 template <SWR_TILE_MODE TTileMode
, size_t NumTileModesT
, size_t ArraySizeT
>
1886 void InitStoreTilesTableColor_Half1(
1887 PFN_STORE_TILES (&table
)[NumTileModesT
][ArraySizeT
])
1889 table
[TTileMode
][R32G32B32A32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_FLOAT
>::Store
;
1890 table
[TTileMode
][R32G32B32A32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_SINT
>::Store
;
1891 table
[TTileMode
][R32G32B32A32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_UINT
>::Store
;
1892 table
[TTileMode
][R32G32B32X32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32X32_FLOAT
>::Store
;
1893 table
[TTileMode
][R32G32B32A32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_SSCALED
>::Store
;
1894 table
[TTileMode
][R32G32B32A32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_USCALED
>::Store
;
1895 table
[TTileMode
][R32G32B32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_FLOAT
>::Store
;
1896 table
[TTileMode
][R32G32B32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_SINT
>::Store
;
1897 table
[TTileMode
][R32G32B32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_UINT
>::Store
;
1898 table
[TTileMode
][R32G32B32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_SSCALED
>::Store
;
1899 table
[TTileMode
][R32G32B32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_USCALED
>::Store
;
1900 table
[TTileMode
][R16G16B16A16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_UNORM
>::Store
;
1901 table
[TTileMode
][R16G16B16A16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_SNORM
>::Store
;
1902 table
[TTileMode
][R16G16B16A16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_SINT
>::Store
;
1903 table
[TTileMode
][R16G16B16A16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_UINT
>::Store
;
1904 table
[TTileMode
][R16G16B16A16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_FLOAT
>::Store
;
1905 table
[TTileMode
][R32G32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_FLOAT
>::Store
;
1906 table
[TTileMode
][R32G32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_SINT
>::Store
;
1907 table
[TTileMode
][R32G32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_UINT
>::Store
;
1908 table
[TTileMode
][R32_FLOAT_X8X24_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32_FLOAT_X8X24_TYPELESS
>::Store
;
1909 table
[TTileMode
][X32_TYPELESS_G8X24_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, X32_TYPELESS_G8X24_UINT
>::Store
;
1910 table
[TTileMode
][R16G16B16X16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16X16_UNORM
>::Store
;
1911 table
[TTileMode
][R16G16B16X16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16X16_FLOAT
>::Store
;
1912 table
[TTileMode
][R16G16B16A16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_SSCALED
>::Store
;
1913 table
[TTileMode
][R16G16B16A16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_USCALED
>::Store
;
1914 table
[TTileMode
][R32G32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_SSCALED
>::Store
;
1915 table
[TTileMode
][R32G32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_USCALED
>::Store
;
1916 table
[TTileMode
][B8G8R8A8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8A8_UNORM
>::Store
;
1917 table
[TTileMode
][B8G8R8A8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8A8_UNORM_SRGB
>::Store
;
1918 table
[TTileMode
][R10G10B10A2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_UNORM
>::StoreGeneric
;
1919 table
[TTileMode
][R10G10B10A2_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_UNORM_SRGB
>::StoreGeneric
;
1920 table
[TTileMode
][R10G10B10A2_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_UINT
>::StoreGeneric
;
1921 table
[TTileMode
][R8G8B8A8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_UNORM
>::Store
;
1922 table
[TTileMode
][R8G8B8A8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_UNORM_SRGB
>::Store
;
1923 table
[TTileMode
][R8G8B8A8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_SNORM
>::Store
;
1924 table
[TTileMode
][R8G8B8A8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_SINT
>::Store
;
1925 table
[TTileMode
][R8G8B8A8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_UINT
>::Store
;
1926 table
[TTileMode
][R16G16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_UNORM
>::Store
;
1927 table
[TTileMode
][R16G16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_SNORM
>::Store
;
1928 table
[TTileMode
][R16G16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_SINT
>::Store
;
1929 table
[TTileMode
][R16G16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_UINT
>::Store
;
1930 table
[TTileMode
][R16G16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_FLOAT
>::Store
;
1931 table
[TTileMode
][B10G10R10A2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_UNORM
>::StoreGeneric
;
1932 table
[TTileMode
][B10G10R10A2_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_UNORM_SRGB
>::StoreGeneric
;
1933 table
[TTileMode
][R11G11B10_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R11G11B10_FLOAT
>::StoreGeneric
;
1934 table
[TTileMode
][R10G10B10_FLOAT_A2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10_FLOAT_A2_UNORM
>::StoreGeneric
;
1935 table
[TTileMode
][R32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_SINT
>::Store
;
1936 table
[TTileMode
][R32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_UINT
>::Store
;
1937 table
[TTileMode
][R32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_FLOAT
>::Store
;
1938 table
[TTileMode
][R24_UNORM_X8_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R24_UNORM_X8_TYPELESS
>::StoreGeneric
;
1939 table
[TTileMode
][X24_TYPELESS_G8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, X24_TYPELESS_G8_UINT
>::StoreGeneric
;
1940 table
[TTileMode
][A32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, A32_FLOAT
>::Store
;
1941 table
[TTileMode
][B8G8R8X8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8X8_UNORM
>::Store
;
1942 table
[TTileMode
][B8G8R8X8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8X8_UNORM_SRGB
>::Store
;
1943 table
[TTileMode
][R8G8B8X8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8X8_UNORM
>::Store
;
1944 table
[TTileMode
][R8G8B8X8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8X8_UNORM_SRGB
>::Store
;
1947 template <SWR_TILE_MODE TTileMode
, size_t NumTileModesT
, size_t ArraySizeT
>
1948 void InitStoreTilesTableColor_Half2(
1949 PFN_STORE_TILES(&table
)[NumTileModesT
][ArraySizeT
])
1951 table
[TTileMode
][R9G9B9E5_SHAREDEXP
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R9G9B9E5_SHAREDEXP
>::StoreGeneric
;
1952 table
[TTileMode
][B10G10R10X2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10X2_UNORM
>::StoreGeneric
;
1953 table
[TTileMode
][R10G10B10X2_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10X2_USCALED
>::StoreGeneric
;
1954 table
[TTileMode
][R8G8B8A8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_SSCALED
>::Store
;
1955 table
[TTileMode
][R8G8B8A8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_USCALED
>::Store
;
1956 table
[TTileMode
][R16G16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_SSCALED
>::Store
;
1957 table
[TTileMode
][R16G16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_USCALED
>::Store
;
1958 table
[TTileMode
][R32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_SSCALED
>::Store
;
1959 table
[TTileMode
][R32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_USCALED
>::Store
;
1960 table
[TTileMode
][B5G6R5_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G6R5_UNORM
>::Store
;
1961 table
[TTileMode
][B5G6R5_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G6R5_UNORM_SRGB
>::StoreGeneric
;
1962 table
[TTileMode
][B5G5R5A1_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5A1_UNORM
>::StoreGeneric
;
1963 table
[TTileMode
][B5G5R5A1_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5A1_UNORM_SRGB
>::StoreGeneric
;
1964 table
[TTileMode
][B4G4R4A4_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B4G4R4A4_UNORM
>::StoreGeneric
;
1965 table
[TTileMode
][B4G4R4A4_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B4G4R4A4_UNORM_SRGB
>::StoreGeneric
;
1966 table
[TTileMode
][R8G8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_UNORM
>::Store
;
1967 table
[TTileMode
][R8G8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_SNORM
>::Store
;
1968 table
[TTileMode
][R8G8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_SINT
>::Store
;
1969 table
[TTileMode
][R8G8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_UINT
>::Store
;
1970 table
[TTileMode
][R16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_UNORM
>::Store
;
1971 table
[TTileMode
][R16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_SNORM
>::Store
;
1972 table
[TTileMode
][R16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_SINT
>::Store
;
1973 table
[TTileMode
][R16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_UINT
>::Store
;
1974 table
[TTileMode
][R16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_FLOAT
>::Store
;
1975 table
[TTileMode
][A16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A16_UNORM
>::Store
;
1976 table
[TTileMode
][A16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A16_FLOAT
>::Store
;
1977 table
[TTileMode
][B5G5R5X1_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5X1_UNORM
>::StoreGeneric
;
1978 table
[TTileMode
][B5G5R5X1_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5X1_UNORM_SRGB
>::StoreGeneric
;
1979 table
[TTileMode
][R8G8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_SSCALED
>::Store
;
1980 table
[TTileMode
][R8G8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_USCALED
>::Store
;
1981 table
[TTileMode
][R16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_SSCALED
>::Store
;
1982 table
[TTileMode
][R16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_USCALED
>::Store
;
1983 table
[TTileMode
][A1B5G5R5_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A1B5G5R5_UNORM
>::StoreGeneric
;
1984 table
[TTileMode
][A4B4G4R4_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A4B4G4R4_UNORM
>::StoreGeneric
;
1985 table
[TTileMode
][R8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_UNORM
>::Store
;
1986 table
[TTileMode
][R8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_SNORM
>::Store
;
1987 table
[TTileMode
][R8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_SINT
>::Store
;
1988 table
[TTileMode
][R8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_UINT
>::Store
;
1989 table
[TTileMode
][A8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, A8_UNORM
>::Store
;
1990 table
[TTileMode
][R8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_SSCALED
>::Store
;
1991 table
[TTileMode
][R8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_USCALED
>::Store
;
1992 table
[TTileMode
][R8G8B8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_UNORM
>::Store
;
1993 table
[TTileMode
][R8G8B8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_SNORM
>::Store
;
1994 table
[TTileMode
][R8G8B8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_SSCALED
>::Store
;
1995 table
[TTileMode
][R8G8B8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_USCALED
>::Store
;
1996 table
[TTileMode
][R16G16B16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_FLOAT
>::Store
;
1997 table
[TTileMode
][R16G16B16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_UNORM
>::Store
;
1998 table
[TTileMode
][R16G16B16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_SNORM
>::Store
;
1999 table
[TTileMode
][R16G16B16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_SSCALED
>::Store
;
2000 table
[TTileMode
][R16G16B16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_USCALED
>::Store
;
2001 table
[TTileMode
][R8G8B8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_UNORM_SRGB
>::Store
;
2002 table
[TTileMode
][R16G16B16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_UINT
>::Store
;
2003 table
[TTileMode
][R16G16B16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_SINT
>::Store
;
2004 table
[TTileMode
][R10G10B10A2_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_SNORM
>::StoreGeneric
;
2005 table
[TTileMode
][R10G10B10A2_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_USCALED
>::StoreGeneric
;
2006 table
[TTileMode
][R10G10B10A2_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_SSCALED
>::StoreGeneric
;
2007 table
[TTileMode
][R10G10B10A2_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_SINT
>::StoreGeneric
;
2008 table
[TTileMode
][B10G10R10A2_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_SNORM
>::StoreGeneric
;
2009 table
[TTileMode
][B10G10R10A2_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_USCALED
>::StoreGeneric
;
2010 table
[TTileMode
][B10G10R10A2_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_SSCALED
>::StoreGeneric
;
2011 table
[TTileMode
][B10G10R10A2_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_UINT
>::StoreGeneric
;
2012 table
[TTileMode
][B10G10R10A2_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_SINT
>::StoreGeneric
;
2013 table
[TTileMode
][R8G8B8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_UINT
>::Store
;
2014 table
[TTileMode
][R8G8B8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_SINT
>::Store
;
2017 //////////////////////////////////////////////////////////////////////////
2018 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2019 template <SWR_TILE_MODE TTileMode
, size_t NumTileModes
, size_t ArraySizeT
>
2020 void InitStoreTilesTableDepth(
2021 PFN_STORE_TILES(&table
)[NumTileModes
][ArraySizeT
])
2023 table
[TTileMode
][R32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32_FLOAT
, R32_FLOAT
>::Store
;
2024 table
[TTileMode
][R32_FLOAT_X8X24_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32_FLOAT
, R32_FLOAT_X8X24_TYPELESS
>::Store
;
2025 table
[TTileMode
][R24_UNORM_X8_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32_FLOAT
, R24_UNORM_X8_TYPELESS
>::Store
;
2026 table
[TTileMode
][R16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32_FLOAT
, R16_UNORM
>::Store
;
2029 template <SWR_TILE_MODE TTileMode
, size_t NumTileModes
, size_t ArraySizeT
>
2030 void InitStoreTilesTableStencil(
2031 PFN_STORE_TILES(&table
)[NumTileModes
][ArraySizeT
])
2033 table
[TTileMode
][R8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R8_UINT
, R8_UINT
>::Store
;
2037 //////////////////////////////////////////////////////////////////////////
2038 /// @brief Deswizzles and stores a full hottile to a render surface
2039 /// @param hPrivateContext - Handle to private DC
2040 /// @param srcFormat - Format for hot tile.
2041 /// @param renderTargetIndex - Index to destination render target
2042 /// @param x, y - Coordinates to raster tile.
2043 /// @param pSrcHotTile - Pointer to Hot Tile
2044 void SwrStoreHotTileToSurface(
2045 HANDLE hWorkerPrivateData
,
2046 SWR_SURFACE_STATE
*pDstSurface
,
2047 SWR_FORMAT srcFormat
,
2048 SWR_RENDERTARGET_ATTACHMENT renderTargetIndex
,
2049 uint32_t x
, uint32_t y
, uint32_t renderTargetArrayIndex
,
2050 uint8_t *pSrcHotTile
);