1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Functionality for Store.
27 ******************************************************************************/
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "core/multisample.h"
43 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
45 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
46 typedef void(*PFN_STORE_TILES
)(uint8_t*, SWR_SURFACE_STATE
*, uint32_t, uint32_t, uint32_t);
48 //////////////////////////////////////////////////////////////////////////
49 /// Store Raster Tile Function Tables.
50 //////////////////////////////////////////////////////////////////////////
51 extern PFN_STORE_TILES sStoreTilesTableColor
[SWR_TILE_MODE_COUNT
][NUM_SWR_FORMATS
];
52 extern PFN_STORE_TILES sStoreTilesTableDepth
[SWR_TILE_MODE_COUNT
][NUM_SWR_FORMATS
];
53 extern PFN_STORE_TILES sStoreTilesTableStencil
[SWR_TILE_MODE_COUNT
][NUM_SWR_FORMATS
];
55 void InitStoreTilesTable_Linear_1();
56 void InitStoreTilesTable_Linear_2();
57 void InitStoreTilesTable_TileX_1();
58 void InitStoreTilesTable_TileX_2();
59 void InitStoreTilesTable_TileY_1();
60 void InitStoreTilesTable_TileY_2();
61 void InitStoreTilesTable_TileW();
62 void InitStoreTilesTable();
64 //////////////////////////////////////////////////////////////////////////
66 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
67 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
68 /// @param ppDsts - Array of destination pointers. Each pointer is
69 /// to a single row of at most 16B.
70 /// @tparam NumDests - Number of destination pointers. Each pair of
71 /// pointers is for a 16-byte column of two rows.
72 //////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    /// Primary template. Store is deleted on purpose: only the explicit
    /// <PixelSize, NumDests> specializations provide an implementation, so
    /// any other combination is rejected at compile time when it is called.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
79 //////////////////////////////////////////////////////////////////////////
80 /// StorePixels (8-bit pixel specialization)
81 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
82 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
83 /// @param ppDsts - Array of destination pointers. Each pointer is
84 /// to a single row of at most 16B.
85 /// @tparam NumDests - Number of destination pointers. Each pair of
86 /// pointers is for a 16-byte column of two rows.
87 //////////////////////////////////////////////////////////////////////////
89 struct StorePixels
<8, 2>
91 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[2])
93 // Each 4-pixel row is 4 bytes.
94 const uint16_t* pPixSrc
= (const uint16_t*)pSrc
;
96 // Unswizzle from SWR-Z order
97 uint16_t* pRow
= (uint16_t*)ppDsts
[0];
101 pRow
= (uint16_t*)ppDsts
[1];
102 pRow
[0] = pPixSrc
[1];
103 pRow
[1] = pPixSrc
[3];
#if USE_8x2_TILE_BACKEND

template <>
struct StorePixels<8, 4>
{
    /// @brief Stores an 8x2 raster tile of 8bpp pixels through four
    ///        destination pointers.
    /// @param pSrc   - Source raster tile in SWR-Z pixel order, read as eight
    ///                 16-bit pixel pairs.
    /// @param ppDsts - Four destination pointers; two pixel pairs (4 bytes)
    ///                 are written through each.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 8 x 2 bytes = 16 bytes, 16 pixels
        const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);

        uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);

        // Unswizzle from SWR-Z order
        ppDsts16[0][0] = pSrc16[0];     // 0 1
        ppDsts16[0][1] = pSrc16[2];     // 4 5

        ppDsts16[1][0] = pSrc16[1];     // 2 3
        ppDsts16[1][1] = pSrc16[3];     // 6 7

        ppDsts16[2][0] = pSrc16[4];     // 8 9
        ppDsts16[2][1] = pSrc16[6];     // C D

        ppDsts16[3][0] = pSrc16[5];     // A B
        ppDsts16[3][1] = pSrc16[7];     // E F
    }
};

#endif
134 //////////////////////////////////////////////////////////////////////////
135 /// StorePixels (16-bit pixel specialization)
136 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
137 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
138 /// @param ppDsts - Array of destination pointers. Each pointer is
139 /// to a single row of at most 16B.
140 /// @tparam NumDests - Number of destination pointers. Each pair of
141 /// pointers is for a 16-byte column of two rows.
142 //////////////////////////////////////////////////////////////////////////
144 struct StorePixels
<16, 2>
146 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[2])
148 // Each 4-pixel row is 8 bytes.
149 const uint32_t* pPixSrc
= (const uint32_t*)pSrc
;
151 // Unswizzle from SWR-Z order
152 uint32_t* pRow
= (uint32_t*)ppDsts
[0];
153 pRow
[0] = pPixSrc
[0];
154 pRow
[1] = pPixSrc
[2];
156 pRow
= (uint32_t*)ppDsts
[1];
157 pRow
[0] = pPixSrc
[1];
158 pRow
[1] = pPixSrc
[3];
#if USE_8x2_TILE_BACKEND

template <>
struct StorePixels<16, 4>
{
    /// @brief Stores an 8x2 raster tile of 16bpp pixels through four
    ///        destination pointers.
    /// @param pSrc   - Source raster tile in SWR-Z pixel order, read as eight
    ///                 32-bit pixel pairs.
    /// @param ppDsts - Four destination pointers; two pixel pairs (8 bytes)
    ///                 are written through each.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 8 x 4 bytes = 32 bytes, 16 pixels
        const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);

        uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);

        // Unswizzle from SWR-Z order
        ppDsts32[0][0] = pSrc32[0];     // 0 1
        ppDsts32[0][1] = pSrc32[2];     // 4 5

        ppDsts32[1][0] = pSrc32[1];     // 2 3
        ppDsts32[1][1] = pSrc32[3];     // 6 7

        ppDsts32[2][0] = pSrc32[4];     // 8 9
        ppDsts32[2][1] = pSrc32[6];     // C D

        ppDsts32[3][0] = pSrc32[5];     // A B
        ppDsts32[3][1] = pSrc32[7];     // E F
    }
};

#endif
189 //////////////////////////////////////////////////////////////////////////
190 /// StorePixels (32-bit pixel specialization)
191 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
192 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
193 /// @param ppDsts - Array of destination pointers. Each pointer is
194 /// to a single row of at most 16B.
195 /// @tparam NumDests - Number of destination pointers. Each pair of
196 /// pointers is for a 16-byte column of two rows.
197 //////////////////////////////////////////////////////////////////////////
199 struct StorePixels
<32, 2>
201 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[2])
203 // Each 4-pixel row is 16-bytes
204 simd4scalari
*pZRow01
= (simd4scalari
*)pSrc
;
205 simd4scalari vQuad00
= SIMD128::load_si(pZRow01
);
206 simd4scalari vQuad01
= SIMD128::load_si(pZRow01
+ 1);
208 simd4scalari vRow00
= SIMD128::unpacklo_epi64(vQuad00
, vQuad01
);
209 simd4scalari vRow10
= SIMD128::unpackhi_epi64(vQuad00
, vQuad01
);
211 SIMD128::storeu_si((simd4scalari
*)ppDsts
[0], vRow00
);
212 SIMD128::storeu_si((simd4scalari
*)ppDsts
[1], vRow10
);
#if USE_8x2_TILE_BACKEND

template <>
struct StorePixels<32, 4>
{
    /// @brief Stores an 8x2 raster tile of 32bpp pixels through four
    ///        destination pointers.
    /// @param pSrc   - Source raster tile in SWR-Z pixel order; read with
    ///                 SIMD128::load_si as four 16-byte quads.
    /// @param ppDsts - Four destination pointers; 16 bytes are written
    ///                 through each with an unaligned store.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 4 x 16 bytes = 64 bytes, 16 pixels
        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);

        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);

        // Unswizzle from SWR-Z order
        simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]);                        // 0 1 2 3
        simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]);                        // 4 5 6 7
        simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]);                        // 8 9 A B
        simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]);                        // C D E F

        SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1));   // 0 1 4 5
        SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1));   // 2 3 6 7
        SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3));   // 8 9 C D
        SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3));   // A B E F
    }
};

#endif
241 //////////////////////////////////////////////////////////////////////////
242 /// StorePixels (64-bit pixel specialization)
243 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
244 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
245 /// @param ppDsts - Array of destination pointers. Each pointer is
246 /// to a single row of at most 16B.
247 /// @tparam NumDests - Number of destination pointers. Each pair of
248 /// pointers is for a 16-byte column of two rows.
249 //////////////////////////////////////////////////////////////////////////
251 struct StorePixels
<64, 4>
253 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[4])
255 // Each 4-pixel row is 32 bytes.
256 const simd4scalari
* pPixSrc
= (const simd4scalari
*)pSrc
;
258 // order of pointers match SWR-Z layout
259 simd4scalari
** pvDsts
= (simd4scalari
**)&ppDsts
[0];
260 *pvDsts
[0] = pPixSrc
[0];
261 *pvDsts
[1] = pPixSrc
[1];
262 *pvDsts
[2] = pPixSrc
[2];
263 *pvDsts
[3] = pPixSrc
[3];
#if USE_8x2_TILE_BACKEND

template <>
struct StorePixels<64, 8>
{
    /// @brief Stores an 8x2 raster tile of 64bpp pixels through eight
    ///        destination pointers.
    /// @param pSrc   - Source raster tile in SWR-Z pixel order, viewed as
    ///                 eight 16-byte elements (two pixels each).
    /// @param ppDsts - Eight destination pointers; element i of the source
    ///                 is assigned through ppDsts[i] (16 bytes each).
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    {
        // 8 x 16 bytes = 128 bytes, 16 pixels
        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);

        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);

        // order of pointers match SWR-Z layout:
        //   [0] 0 1   [1] 2 3   [2] 4 5   [3] 6 7
        //   [4] 8 9   [5] A B   [6] C D   [7] E F
        for (uint32_t i = 0; i < 8; ++i)
        {
            *ppDsts128[i] = pSrc128[i];
        }
    }
};

#endif
291 //////////////////////////////////////////////////////////////////////////
292 /// StorePixels (128-bit pixel specialization)
293 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
294 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
295 /// @param ppDsts - Array of destination pointers. Each pointer is
296 /// to a single row of at most 16B.
297 /// @tparam NumDests - Number of destination pointers. Each pair of
298 /// pointers is for a 16-byte column of two rows.
299 //////////////////////////////////////////////////////////////////////////
301 struct StorePixels
<128, 8>
303 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[8])
305 // Each 4-pixel row is 64 bytes.
306 const simd4scalari
* pPixSrc
= (const simd4scalari
*)pSrc
;
308 // Unswizzle from SWR-Z order
309 simd4scalari
** pvDsts
= (simd4scalari
**)&ppDsts
[0];
310 *pvDsts
[0] = pPixSrc
[0];
311 *pvDsts
[1] = pPixSrc
[2];
312 *pvDsts
[2] = pPixSrc
[1];
313 *pvDsts
[3] = pPixSrc
[3];
314 *pvDsts
[4] = pPixSrc
[4];
315 *pvDsts
[5] = pPixSrc
[6];
316 *pvDsts
[6] = pPixSrc
[5];
317 *pvDsts
[7] = pPixSrc
[7];
#if USE_8x2_TILE_BACKEND

template <>
struct StorePixels<128, 16>
{
    /// @brief Stores an 8x2 raster tile of 128bpp pixels through sixteen
    ///        destination pointers.
    /// @param pSrc   - Source raster tile in SWR-Z pixel order, viewed as
    ///                 sixteen 16-byte pixels.
    /// @param ppDsts - Sixteen destination pointers, one 16-byte pixel each.
    ///                 Within every group of four, source pixels 1 and 2
    ///                 are exchanged.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
    {
        // 16 x 16 bytes = 256 bytes, 16 pixels
        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);

        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);

        // Unswizzle from SWR-Z order, one quad of pixels per iteration.
        for (uint32_t i = 0; i < 16; i += 4)
        {
            *ppDsts128[i + 0] = pSrc128[i + 0];
            *ppDsts128[i + 1] = pSrc128[i + 2];
            *ppDsts128[i + 2] = pSrc128[i + 1];
            *ppDsts128[i + 3] = pSrc128[i + 3];
        }
    }
};

#endif
343 //////////////////////////////////////////////////////////////////////////
344 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
345 //////////////////////////////////////////////////////////////////////////
346 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
347 struct ConvertPixelsSOAtoAOS
349 //////////////////////////////////////////////////////////////////////////
350 /// @brief Converts a SIMD from the Hot Tile to the destination format
351 /// and converts from SOA to AOS.
352 /// @param pSrc - Pointer to raster tile.
353 /// @param pDst - Pointer to destination surface or deswizzling buffer.
354 template <size_t NumDests
>
355 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
357 #if USE_8x2_TILE_BACKEND
358 static const uint32_t MAX_RASTER_TILE_BYTES
= 16 * 16; // 16 pixels * 16 bytes per pixel
360 OSALIGNSIMD16(uint8_t) soaTile
[MAX_RASTER_TILE_BYTES
];
361 OSALIGNSIMD16(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
363 // Convert from SrcFormat --> DstFormat
365 LoadSOA
<SrcFormat
>(pSrc
, src
);
366 StoreSOA
<DstFormat
>(src
, soaTile
);
368 // Convert from SOA --> AOS
369 FormatTraits
<DstFormat
>::TransposeT::Transpose_16(soaTile
, aosTile
);
372 static const uint32_t MAX_RASTER_TILE_BYTES
= 128; // 8 pixels * 16 bytes per pixel
374 OSALIGNSIMD(uint8_t) soaTile
[MAX_RASTER_TILE_BYTES
];
375 OSALIGNSIMD(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
377 // Convert from SrcFormat --> DstFormat
379 LoadSOA
<SrcFormat
>(pSrc
, src
);
380 StoreSOA
<DstFormat
>(src
, soaTile
);
382 // Convert from SOA --> AOS
383 FormatTraits
<DstFormat
>::TransposeT::Transpose(soaTile
, aosTile
);
386 // Store data into destination
387 StorePixels
<FormatTraits
<DstFormat
>::bpp
, NumDests
>::Store(aosTile
, ppDsts
);
391 //////////////////////////////////////////////////////////////////////////
392 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
393 /// Specialization for no format conversion
394 //////////////////////////////////////////////////////////////////////////
395 template<SWR_FORMAT Format
>
396 struct ConvertPixelsSOAtoAOS
<Format
, Format
>
398 //////////////////////////////////////////////////////////////////////////
399 /// @brief Converts a SIMD from the Hot Tile to the destination format
400 /// and converts from SOA to AOS.
401 /// @param pSrc - Pointer to raster tile.
402 /// @param pDst - Pointer to destination surface or deswizzling buffer.
403 template <size_t NumDests
>
404 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
406 #if USE_8x2_TILE_BACKEND
407 static const uint32_t MAX_RASTER_TILE_BYTES
= 16 * 16; // 16 pixels * 16 bytes per pixel
409 OSALIGNSIMD16(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
411 // Convert from SOA --> AOS
412 FormatTraits
<Format
>::TransposeT::Transpose_16(pSrc
, aosTile
);
415 static const uint32_t MAX_RASTER_TILE_BYTES
= 128; // 8 pixels * 16 bytes per pixel
417 OSALIGNSIMD(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
419 // Convert from SOA --> AOS
420 FormatTraits
<Format
>::TransposeT::Transpose(pSrc
, aosTile
);
423 // Store data into destination
424 StorePixels
<FormatTraits
<Format
>::bpp
, NumDests
>::Store(aosTile
, ppDsts
);
428 //////////////////////////////////////////////////////////////////////////
429 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R5_UNORM
430 //////////////////////////////////////////////////////////////////////////
432 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, B5G6R5_UNORM
>
434 //////////////////////////////////////////////////////////////////////////
435 /// @brief Converts a SIMD from the Hot Tile to the destination format
436 /// and converts from SOA to AOS.
437 /// @param pSrc - Pointer to raster tile.
438 /// @param pDst - Pointer to destination surface or deswizzling buffer.
439 template <size_t NumDests
>
440 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
442 #if USE_8x2_TILE_BACKEND
443 static const SWR_FORMAT SrcFormat
= R32G32B32A32_FLOAT
;
444 static const SWR_FORMAT DstFormat
= B5G6R5_UNORM
;
446 static const uint32_t MAX_RASTER_TILE_BYTES
= 16 * 16; // 16 pixels * 16 bytes per pixel
448 OSALIGNSIMD16(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
451 simd16vector src
, dst
;
452 LoadSOA
<SrcFormat
>(pSrc
, src
);
455 dst
.x
= src
[FormatTraits
<DstFormat
>::swizzle(0)];
456 dst
.y
= src
[FormatTraits
<DstFormat
>::swizzle(1)];
457 dst
.z
= src
[FormatTraits
<DstFormat
>::swizzle(2)];
460 dst
.x
= Clamp
<DstFormat
>(dst
.x
, 0);
461 dst
.y
= Clamp
<DstFormat
>(dst
.y
, 1);
462 dst
.z
= Clamp
<DstFormat
>(dst
.z
, 2);
465 dst
.x
= Normalize
<DstFormat
>(dst
.x
, 0);
466 dst
.y
= Normalize
<DstFormat
>(dst
.y
, 1);
467 dst
.z
= Normalize
<DstFormat
>(dst
.z
, 2);
470 simd16scalari packed
= _simd16_castps_si(dst
.x
);
472 SWR_ASSERT(FormatTraits
<DstFormat
>::GetBPC(0) == 5);
473 SWR_ASSERT(FormatTraits
<DstFormat
>::GetBPC(1) == 6);
475 packed
= _simd16_or_si(packed
, _simd16_slli_epi32(_simd16_castps_si(dst
.y
), 5));
476 packed
= _simd16_or_si(packed
, _simd16_slli_epi32(_simd16_castps_si(dst
.z
), 5 + 6));
478 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
479 uint32_t *pPacked
= (uint32_t*)&packed
;
480 uint16_t *pAosTile
= (uint16_t*)&aosTile
[0];
481 for (uint32_t t
= 0; t
< KNOB_SIMD16_WIDTH
; ++t
)
483 *pAosTile
++ = *pPacked
++;
487 static const SWR_FORMAT SrcFormat
= R32G32B32A32_FLOAT
;
488 static const SWR_FORMAT DstFormat
= B5G6R5_UNORM
;
489 static const uint32_t MAX_RASTER_TILE_BYTES
= 128; // 8 pixels * 16 bytes per pixel
491 OSALIGNSIMD(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
495 LoadSOA
<SrcFormat
>(pSrc
, src
);
498 dst
.x
= src
[FormatTraits
<DstFormat
>::swizzle(0)];
499 dst
.y
= src
[FormatTraits
<DstFormat
>::swizzle(1)];
500 dst
.z
= src
[FormatTraits
<DstFormat
>::swizzle(2)];
503 dst
.x
= Clamp
<DstFormat
>(dst
.x
, 0);
504 dst
.y
= Clamp
<DstFormat
>(dst
.y
, 1);
505 dst
.z
= Clamp
<DstFormat
>(dst
.z
, 2);
508 dst
.x
= Normalize
<DstFormat
>(dst
.x
, 0);
509 dst
.y
= Normalize
<DstFormat
>(dst
.y
, 1);
510 dst
.z
= Normalize
<DstFormat
>(dst
.z
, 2);
513 simdscalari packed
= _simd_castps_si(dst
.x
);
514 packed
= _simd_or_si(packed
, _simd_slli_epi32(_simd_castps_si(dst
.y
), FormatTraits
<DstFormat
>::GetConstBPC(0)));
515 packed
= _simd_or_si(packed
, _simd_slli_epi32(_simd_castps_si(dst
.z
), FormatTraits
<DstFormat
>::GetConstBPC(0) +
516 FormatTraits
<DstFormat
>::GetConstBPC(1)));
518 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
519 uint32_t *pPacked
= (uint32_t*)&packed
;
520 uint16_t *pAosTile
= (uint16_t*)&aosTile
[0];
521 for (uint32_t t
= 0; t
< KNOB_SIMD_WIDTH
; ++t
)
523 *pAosTile
++ = *pPacked
++;
527 // Store data into destination
528 StorePixels
<FormatTraits
<DstFormat
>::bpp
, NumDests
>::Store(aosTile
, ppDsts
);
532 //////////////////////////////////////////////////////////////////////////
533 /// ConvertPixelsSOAtoAOS - Specialization conversion for R24_UNORM_X8_TYPELESS
534 //////////////////////////////////////////////////////////////////////////
536 struct ConvertPixelsSOAtoAOS
<R32_FLOAT
, R24_UNORM_X8_TYPELESS
>
538 static const SWR_FORMAT SrcFormat
= R32_FLOAT
;
539 static const SWR_FORMAT DstFormat
= R24_UNORM_X8_TYPELESS
;
541 //////////////////////////////////////////////////////////////////////////
542 /// @brief Converts a SIMD from the Hot Tile to the destination format
543 /// and converts from SOA to AOS.
544 /// @param pSrc - Pointer to raster tile.
545 /// @param pDst - Pointer to destination surface or deswizzling buffer.
546 template <size_t NumDests
>
547 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
549 #if USE_8x2_TILE_BACKEND
550 simd16scalar comp
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
553 const simd16scalar zero
= _simd16_setzero_ps();
554 const simd16scalar ones
= _simd16_set1_ps(1.0f
);
556 comp
= _simd16_max_ps(comp
, zero
);
557 comp
= _simd16_min_ps(comp
, ones
);
560 comp
= _simd16_mul_ps(comp
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
562 simd16scalari temp
= _simd16_cvtps_epi32(comp
);
565 temp
= _simd16_permute_epi32(temp
, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
567 // merge/store data into destination but don't overwrite the X8 bits
568 simdscalari destlo
= _simd_loadu2_si(reinterpret_cast<simd4scalari
*>(ppDsts
[1]), reinterpret_cast<simd4scalari
*>(ppDsts
[0]));
569 simdscalari desthi
= _simd_loadu2_si(reinterpret_cast<simd4scalari
*>(ppDsts
[3]), reinterpret_cast<simd4scalari
*>(ppDsts
[2]));
571 simd16scalari dest
= _simd16_setzero_si();
573 dest
= _simd16_insert_si(dest
, destlo
, 0);
574 dest
= _simd16_insert_si(dest
, desthi
, 1);
576 simd16scalari mask
= _simd16_set1_epi32(0x00FFFFFF);
578 dest
= _simd16_or_si(_simd16_andnot_si(mask
, dest
), _simd16_and_si(mask
, temp
));
580 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(ppDsts
[1]), reinterpret_cast<simd4scalari
*>(ppDsts
[0]), _simd16_extract_si(dest
, 0));
581 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(ppDsts
[3]), reinterpret_cast<simd4scalari
*>(ppDsts
[2]), _simd16_extract_si(dest
, 1));
583 static const uint32_t MAX_RASTER_TILE_BYTES
= 128; // 8 pixels * 16 bytes per pixel
585 OSALIGNSIMD(uint8_t) soaTile
[MAX_RASTER_TILE_BYTES
];
586 OSALIGNSIMD(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
588 // Convert from SrcFormat --> DstFormat
590 LoadSOA
<SrcFormat
>(pSrc
, src
);
591 StoreSOA
<DstFormat
>(src
, soaTile
);
593 // Convert from SOA --> AOS
594 FormatTraits
<DstFormat
>::TransposeT::Transpose(soaTile
, aosTile
);
596 // Store data into destination but don't overwrite the X8 bits
597 // Each 4-pixel row is 16-bytes
598 simd4scalari
*pZRow01
= (simd4scalari
*)aosTile
;
599 simd4scalari vQuad00
= SIMD128::load_si(pZRow01
);
600 simd4scalari vQuad01
= SIMD128::load_si(pZRow01
+ 1);
602 simd4scalari vRow00
= SIMD128::unpacklo_epi64(vQuad00
, vQuad01
);
603 simd4scalari vRow10
= SIMD128::unpackhi_epi64(vQuad00
, vQuad01
);
605 simd4scalari vDst0
= SIMD128::loadu_si((const simd4scalari
*)ppDsts
[0]);
606 simd4scalari vDst1
= SIMD128::loadu_si((const simd4scalari
*)ppDsts
[1]);
608 simd4scalari vMask
= _mm_set1_epi32(0xFFFFFF);
610 vDst0
= SIMD128::andnot_si(vMask
, vDst0
);
611 vDst0
= SIMD128::or_si(vDst0
, SIMD128::and_si(vRow00
, vMask
));
612 vDst1
= SIMD128::andnot_si(vMask
, vDst1
);
613 vDst1
= SIMD128::or_si(vDst1
, SIMD128::and_si(vRow10
, vMask
));
615 SIMD128::storeu_si((simd4scalari
*)ppDsts
[0], vDst0
);
616 SIMD128::storeu_si((simd4scalari
*)ppDsts
[1], vDst1
);
621 #if USE_8x2_TILE_BACKEND
622 template<SWR_FORMAT DstFormat
>
623 INLINE
static void FlatConvert(const uint8_t* pSrc
, uint8_t* pDst0
, uint8_t* pDst1
, uint8_t* pDst2
, uint8_t* pDst3
)
625 // swizzle rgba -> bgra while we load
626 simd16scalar comp0
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(0) * sizeof(simd16scalar
))); // float32 rrrrrrrrrrrrrrrr
627 simd16scalar comp1
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(1) * sizeof(simd16scalar
))); // float32 gggggggggggggggg
628 simd16scalar comp2
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(2) * sizeof(simd16scalar
))); // float32 bbbbbbbbbbbbbbbb
629 simd16scalar comp3
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(3) * sizeof(simd16scalar
))); // float32 aaaaaaaaaaaaaaaa
632 const simd16scalar zero
= _simd16_setzero_ps();
633 const simd16scalar ones
= _simd16_set1_ps(1.0f
);
635 comp0
= _simd16_max_ps(comp0
, zero
);
636 comp0
= _simd16_min_ps(comp0
, ones
);
638 comp1
= _simd16_max_ps(comp1
, zero
);
639 comp1
= _simd16_min_ps(comp1
, ones
);
641 comp2
= _simd16_max_ps(comp2
, zero
);
642 comp2
= _simd16_min_ps(comp2
, ones
);
644 comp3
= _simd16_max_ps(comp3
, zero
);
645 comp3
= _simd16_min_ps(comp3
, ones
);
647 // gamma-correct only rgb
648 if (FormatTraits
<DstFormat
>::isSRGB
)
650 comp0
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(0, comp0
);
651 comp1
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(1, comp1
);
652 comp2
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(2, comp2
);
655 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
656 comp0
= _simd16_mul_ps(comp0
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
657 comp1
= _simd16_mul_ps(comp1
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(1)));
658 comp2
= _simd16_mul_ps(comp2
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(2)));
659 comp3
= _simd16_mul_ps(comp3
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(3)));
661 // moving to 16 wide integer vector types
662 simd16scalari src0
= _simd16_cvtps_epi32(comp0
); // padded byte rrrrrrrrrrrrrrrr
663 simd16scalari src1
= _simd16_cvtps_epi32(comp1
); // padded byte gggggggggggggggg
664 simd16scalari src2
= _simd16_cvtps_epi32(comp2
); // padded byte bbbbbbbbbbbbbbbb
665 simd16scalari src3
= _simd16_cvtps_epi32(comp3
); // padded byte aaaaaaaaaaaaaaaa
667 // SOA to AOS conversion
668 src1
= _simd16_slli_epi32(src1
, 8);
669 src2
= _simd16_slli_epi32(src2
, 16);
670 src3
= _simd16_slli_epi32(src3
, 24);
672 simd16scalari final
= _simd16_or_si(_simd16_or_si(src0
, src1
), _simd16_or_si(src2
, src3
)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
674 // de-swizzle conversion
676 simd16scalari final0
= _simd16_permute2f128_si(final
, final
, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
677 simd16scalari final1
= _simd16_permute2f128_si(final
, final
, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
679 final
= _simd16_shuffle_epi64(final0
, final1
, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
682 final
= _simd16_permute_epi32(final
, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
685 // store 8x2 memory order:
686 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
687 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
688 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(pDst1
), reinterpret_cast<simd4scalari
*>(pDst0
), _simd16_extract_si(final
, 0));
689 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(pDst3
), reinterpret_cast<simd4scalari
*>(pDst2
), _simd16_extract_si(final
, 1));
693 template<SWR_FORMAT DstFormat
>
694 INLINE
static void FlatConvert(const uint8_t* pSrc
, uint8_t* pDst
, uint8_t* pDst1
)
696 static const uint32_t offset
= sizeof(simdscalar
);
698 // swizzle rgba -> bgra while we load
699 simdscalar vComp0
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(0))*offset
)); // float32 rrrrrrrr
700 simdscalar vComp1
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(1))*offset
)); // float32 gggggggg
701 simdscalar vComp2
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(2))*offset
)); // float32 bbbbbbbb
702 simdscalar vComp3
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(3))*offset
)); // float32 aaaaaaaa
705 vComp0
= _simd_max_ps(vComp0
, _simd_setzero_ps());
706 vComp0
= _simd_min_ps(vComp0
, _simd_set1_ps(1.0f
));
708 vComp1
= _simd_max_ps(vComp1
, _simd_setzero_ps());
709 vComp1
= _simd_min_ps(vComp1
, _simd_set1_ps(1.0f
));
711 vComp2
= _simd_max_ps(vComp2
, _simd_setzero_ps());
712 vComp2
= _simd_min_ps(vComp2
, _simd_set1_ps(1.0f
));
714 vComp3
= _simd_max_ps(vComp3
, _simd_setzero_ps());
715 vComp3
= _simd_min_ps(vComp3
, _simd_set1_ps(1.0f
));
717 if (FormatTraits
<DstFormat
>::isSRGB
)
719 // Gamma-correct only rgb
720 vComp0
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(0, vComp0
);
721 vComp1
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(1, vComp1
);
722 vComp2
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(2, vComp2
);
725 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
726 vComp0
= _simd_mul_ps(vComp0
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
727 vComp1
= _simd_mul_ps(vComp1
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(1)));
728 vComp2
= _simd_mul_ps(vComp2
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(2)));
729 vComp3
= _simd_mul_ps(vComp3
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(3)));
731 // moving to 8 wide integer vector types
732 simdscalari src0
= _simd_cvtps_epi32(vComp0
); // padded byte rrrrrrrr
733 simdscalari src1
= _simd_cvtps_epi32(vComp1
); // padded byte gggggggg
734 simdscalari src2
= _simd_cvtps_epi32(vComp2
); // padded byte bbbbbbbb
735 simdscalari src3
= _simd_cvtps_epi32(vComp3
); // padded byte aaaaaaaa
737 #if KNOB_ARCH <= KNOB_ARCH_AVX
739 // splitting into two sets of 4 wide integer vector types
740 // because AVX doesn't have instructions to support this operation at 8 wide
741 simd4scalari srcLo0
= _mm256_castsi256_si128(src0
); // 000r000r000r000r
742 simd4scalari srcLo1
= _mm256_castsi256_si128(src1
); // 000g000g000g000g
743 simd4scalari srcLo2
= _mm256_castsi256_si128(src2
); // 000b000b000b000b
744 simd4scalari srcLo3
= _mm256_castsi256_si128(src3
); // 000a000a000a000a
746 simd4scalari srcHi0
= _mm256_extractf128_si256(src0
, 1); // 000r000r000r000r
747 simd4scalari srcHi1
= _mm256_extractf128_si256(src1
, 1); // 000g000g000g000g
748 simd4scalari srcHi2
= _mm256_extractf128_si256(src2
, 1); // 000b000b000b000b
749 simd4scalari srcHi3
= _mm256_extractf128_si256(src3
, 1); // 000a000a000a000a
751 srcLo1
= _mm_slli_si128(srcLo1
, 1); // 00g000g000g000g0
752 srcHi1
= _mm_slli_si128(srcHi1
, 1); // 00g000g000g000g0
753 srcLo2
= _mm_slli_si128(srcLo2
, 2); // 0b000b000b000b00
754 srcHi2
= _mm_slli_si128(srcHi2
, 2); // 0b000b000b000b00
755 srcLo3
= _mm_slli_si128(srcLo3
, 3); // a000a000a000a000
756 srcHi3
= _mm_slli_si128(srcHi3
, 3); // a000a000a000a000
758 srcLo0
= SIMD128::or_si(srcLo0
, srcLo1
); // 00gr00gr00gr00gr
759 srcLo2
= SIMD128::or_si(srcLo2
, srcLo3
); // ab00ab00ab00ab00
761 srcHi0
= SIMD128::or_si(srcHi0
, srcHi1
); // 00gr00gr00gr00gr
762 srcHi2
= SIMD128::or_si(srcHi2
, srcHi3
); // ab00ab00ab00ab00
764 srcLo0
= SIMD128::or_si(srcLo0
, srcLo2
); // abgrabgrabgrabgr
765 srcHi0
= SIMD128::or_si(srcHi0
, srcHi2
); // abgrabgrabgrabgr
767 // unpack into rows that get the tiling order correct
768 simd4scalari vRow00
= SIMD128::unpacklo_epi64(srcLo0
, srcHi0
); // abgrabgrabgrabgrabgrabgrabgrabgr
769 simd4scalari vRow10
= SIMD128::unpackhi_epi64(srcLo0
, srcHi0
);
771 simdscalari final
= _mm256_castsi128_si256(vRow00
);
772 final
= _mm256_insertf128_si256(final
, vRow10
, 1);
776 // logic is as above, only wider
777 src1
= _mm256_slli_si256(src1
, 1);
778 src2
= _mm256_slli_si256(src2
, 2);
779 src3
= _mm256_slli_si256(src3
, 3);
781 src0
= _mm256_or_si256(src0
, src1
);
782 src2
= _mm256_or_si256(src2
, src3
);
784 simdscalari final
= _mm256_or_si256(src0
, src2
);
786 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
787 final
= _mm256_permute4x64_epi64(final
, 0xD8);
790 _simd_storeu2_si((simd4scalari
*)pDst1
, (simd4scalari
*)pDst
, final
);
793 #if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 flavour of FlatConvertNoAlpha: loads components 0..2 of a
///        planar float source, clamps each to [0, 1], optionally applies
///        convertSrgb, scales by fromFloat, packs the three results into one
///        32-bit word per pixel and stores through four destination pointers.
/// @param pSrc - SOA source; component i is loaded from
///               pSrc + swizzle(i) * sizeof(simd16scalar).
/// @param pDst0, pDst1, pDst2, pDst3 - targets of the two _simd_storeu2_si
///               calls at the end (see the "store 8x2 memory order" note).
/// NOTE(review): this listing carries embedded line numbers and skips some of
/// them (for example 796, 817, 821, 840, 845-846, 848-849, 855-857), so braces
/// and possibly preprocessor lines are not visible here - confirm against the
/// real file before changing any code.
794 template<SWR_FORMAT DstFormat
>
795 INLINE
static void FlatConvertNoAlpha(const uint8_t* pSrc
, uint8_t* pDst0
, uint8_t* pDst1
, uint8_t* pDst2
, uint8_t* pDst3
)
797 // swizzle rgba -> bgra while we load
// The component order comes from FormatTraits<DstFormat>::swizzle(i); only
// indices 0, 1 and 2 are read - no fourth component is loaded.
798 simd16scalar comp0
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(0) * sizeof(simd16scalar
))); // float32 rrrrrrrrrrrrrrrr
799 simd16scalar comp1
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(1) * sizeof(simd16scalar
))); // float32 gggggggggggggggg
800 simd16scalar comp2
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(2) * sizeof(simd16scalar
))); // float32 bbbbbbbbbbbbbbbb
// Clamp every component to the [0.0f, 1.0f] range: max against zero first,
// then min against one.
803 const simd16scalar zero
= _simd16_setzero_ps();
804 const simd16scalar ones
= _simd16_set1_ps(1.0f
);
806 comp0
= _simd16_max_ps(comp0
, zero
);
807 comp0
= _simd16_min_ps(comp0
, ones
);
809 comp1
= _simd16_max_ps(comp1
, zero
);
810 comp1
= _simd16_min_ps(comp1
, ones
);
812 comp2
= _simd16_max_ps(comp2
, zero
);
813 comp2
= _simd16_min_ps(comp2
, ones
);
815 // gamma-correct only rgb
// The conversion is looked up on the R32G32B32A32_FLOAT traits, not on
// DstFormat; DstFormat only decides whether it runs at all.
816 if (FormatTraits
<DstFormat
>::isSRGB
)
818 comp0
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(0, comp0
);
819 comp1
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(1, comp1
);
820 comp2
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(2, comp2
);
823 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
824 comp0
= _simd16_mul_ps(comp0
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
825 comp1
= _simd16_mul_ps(comp1
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(1)));
826 comp2
= _simd16_mul_ps(comp2
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(2)));
828 // moving to 16 wide integer vector types
829 simd16scalari src0
= _simd16_cvtps_epi32(comp0
); // padded byte rrrrrrrrrrrrrrrr
830 simd16scalari src1
= _simd16_cvtps_epi32(comp1
); // padded byte gggggggggggggggg
831 simd16scalari src2
= _simd16_cvtps_epi32(comp2
); // padded byte bbbbbbbbbbbbbbbb
833 // SOA to AOS conversion
// Component 1 is moved to bits 8..15 and component 2 to bits 16..23 of each
// 32-bit lane; nothing is ORed into bits 24..31 (this is the "no alpha" part).
834 src1
= _simd16_slli_epi32(src1
, 8);
835 src2
= _simd16_slli_epi32(src2
, 16);
837 simd16scalari final
= _simd16_or_si(_simd16_or_si(src0
, src1
), src2
); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
839 // de-swizzle conversion
841 simd16scalari final0
= _simd16_permute2f128_si(final
, final
, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
842 simd16scalari final1
= _simd16_permute2f128_si(final
, final
, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
844 final
= _simd16_shuffle_epi64(final0
, final1
, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
// NOTE(review): embedded numbers 840, 845-846 and 848-849 are absent from this
// listing. The permute2f128/shuffle sequence above and the permute_epi32 below
// look like two alternative de-swizzles - presumably separated by a
// preprocessor conditional that is not visible here; verify before touching
// either one.
847 final
= _simd16_permute_epi32(final
, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
850 // store 8x2 memory order:
851 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
852 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
// Half 0 of the 16-wide result goes to (pDst1, pDst0), half 1 to (pDst3, pDst2).
853 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(pDst1
), reinterpret_cast<simd4scalari
*>(pDst0
), _simd16_extract_si(final
, 0));
854 _simd_storeu2_si(reinterpret_cast<simd4scalari
*>(pDst3
), reinterpret_cast<simd4scalari
*>(pDst2
), _simd16_extract_si(final
, 1));
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD8 flavour of FlatConvertNoAlpha: loads components 0..2 of a
///        planar float source, clamps each to [0, 1], optionally applies
///        convertSrgb, scales by fromFloat, packs the three results into one
///        32-bit word per pixel and stores through two destination pointers.
/// @param pSrc  - SOA source; component i is loaded from
///                pSrc + swizzle(i) * sizeof(simdscalar).
/// @param pDst, pDst1 - targets of the final _simd_storeu2_si call.
/// NOTE(review): this listing carries embedded line numbers and skips some of
/// them (for example 860, 878, 883, 925-927, 938-940), so braces and
/// preprocessor lines such as #else/#endif are not visible here - confirm
/// against the real file before changing any code.
858 template<SWR_FORMAT DstFormat
>
859 INLINE
static void FlatConvertNoAlpha(const uint8_t* pSrc
, uint8_t* pDst
, uint8_t* pDst1
)
// Byte distance between consecutive component planes in pSrc.
861 static const uint32_t offset
= sizeof(simdscalar
);
863 // swizzle rgba -> bgra while we load
// Only swizzle indices 0, 1 and 2 are read - no fourth component is loaded.
864 simdscalar vComp0
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(0))*offset
)); // float32 rrrrrrrr
865 simdscalar vComp1
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(1))*offset
)); // float32 gggggggg
866 simdscalar vComp2
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(2))*offset
)); // float32 bbbbbbbb
// Clamp every component to [0.0f, 1.0f]: max against zero, then min against one.
868 vComp0
= _simd_max_ps(vComp0
, _simd_setzero_ps());
869 vComp0
= _simd_min_ps(vComp0
, _simd_set1_ps(1.0f
));
871 vComp1
= _simd_max_ps(vComp1
, _simd_setzero_ps());
872 vComp1
= _simd_min_ps(vComp1
, _simd_set1_ps(1.0f
));
874 vComp2
= _simd_max_ps(vComp2
, _simd_setzero_ps());
875 vComp2
= _simd_min_ps(vComp2
, _simd_set1_ps(1.0f
));
// DstFormat decides whether the conversion runs; the conversion itself is
// taken from the R32G32B32A32_FLOAT traits.
877 if (FormatTraits
<DstFormat
>::isSRGB
)
879 // Gamma-correct only rgb
880 vComp0
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(0, vComp0
);
881 vComp1
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(1, vComp1
);
882 vComp2
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(2, vComp2
);
885 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
886 vComp0
= _simd_mul_ps(vComp0
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
887 vComp1
= _simd_mul_ps(vComp1
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(1)));
888 vComp2
= _simd_mul_ps(vComp2
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(2)));
890 // moving to 8 wide integer vector types
891 simdscalari src0
= _simd_cvtps_epi32(vComp0
); // padded byte rrrrrrrr
892 simdscalari src1
= _simd_cvtps_epi32(vComp1
); // padded byte gggggggg
893 simdscalari src2
= _simd_cvtps_epi32(vComp2
); // padded byte bbbbbbbb
895 #if KNOB_ARCH <= KNOB_ARCH_AVX
897 // splitting into two sets of 4 wide integer vector types
898 // because AVX doesn't have instructions to support this operation at 8 wide
899 simd4scalari srcLo0
= _mm256_castsi256_si128(src0
); // 000r000r000r000r
900 simd4scalari srcLo1
= _mm256_castsi256_si128(src1
); // 000g000g000g000g
901 simd4scalari srcLo2
= _mm256_castsi256_si128(src2
); // 000b000b000b000b
903 simd4scalari srcHi0
= _mm256_extractf128_si256(src0
, 1); // 000r000r000r000r
904 simd4scalari srcHi1
= _mm256_extractf128_si256(src1
, 1); // 000g000g000g000g
905 simd4scalari srcHi2
= _mm256_extractf128_si256(src2
, 1); // 000b000b000b000b
// Whole-register byte shifts: component 1 moves up one byte and component 2
// two bytes within each 32-bit lane (see the per-line byte diagrams).
907 srcLo1
= _mm_slli_si128(srcLo1
, 1); // 00g000g000g000g0
908 srcHi1
= _mm_slli_si128(srcHi1
, 1); // 00g000g000g000g0
909 srcLo2
= _mm_slli_si128(srcLo2
, 2); // 0b000b000b000b00
910 srcHi2
= _mm_slli_si128(srcHi2
, 2); // 0b000b000b000b00
912 srcLo0
= SIMD128::or_si(srcLo0
, srcLo1
); // 00gr00gr00gr00gr
914 srcHi0
= SIMD128::or_si(srcHi0
, srcHi1
); // 00gr00gr00gr00gr
916 srcLo0
= SIMD128::or_si(srcLo0
, srcLo2
); // 0bgr0bgr0bgr0bgr
917 srcHi0
= SIMD128::or_si(srcHi0
, srcHi2
); // 0bgr0bgr0bgr0bgr
919 // unpack into rows that get the tiling order correct
920 simd4scalari vRow00
= SIMD128::unpacklo_epi64(srcLo0
, srcHi0
); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
921 simd4scalari vRow10
= SIMD128::unpackhi_epi64(srcLo0
, srcHi0
);
// Recombine the two 128-bit rows: vRow00 in the low half, vRow10 in the high half.
923 simdscalari final
= _mm256_castsi128_si256(vRow00
);
924 final
= _mm256_insertf128_si256(final
, vRow10
, 1);
// NOTE(review): `final` is declared both in the AVX section above and in the
// wider section below, and embedded numbers 925-927 are absent - presumably an
// #else separates the two; the matching #endif is not visible either.
928 // logic is as above, only wider
929 src1
= _mm256_slli_si256(src1
, 1);
930 src2
= _mm256_slli_si256(src2
, 2);
932 src0
= _mm256_or_si256(src0
, src1
);
934 simdscalari final
= _mm256_or_si256(src0
, src2
);
936 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
937 final
= _mm256_permute4x64_epi64(final
, 0xD8);
// Write the packed result through the two destination pointers.
941 _simd_storeu2_si((simd4scalari
*)pDst1
, (simd4scalari
*)pDst
, final
);
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS for R32G32B32A32_FLOAT -> B8G8R8A8_UNORM.
/// Convert() forwards to FlatConvert<B8G8R8A8_UNORM>: the call listed under
/// USE_8x2_TILE_BACKEND passes ppDsts[0..3], the other call passes ppDsts[0..1].
/// NOTE(review): the embedded numbering has gaps here (944, 946, 949, 952,
/// 954+), so the template introducer, the braces and the #else/#endif between
/// the two calls are presumably not shown in this listing - confirm.
945 struct ConvertPixelsSOAtoAOS
<R32G32B32A32_FLOAT
, B8G8R8A8_UNORM
>
// ppDsts is a reference to an array of NumDests destination pointers.
947 template <size_t NumDests
>
948 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
950 #if USE_8x2_TILE_BACKEND
951 FlatConvert
<B8G8R8A8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
953 FlatConvert
<B8G8R8A8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1]);
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS for R32G32B32A32_FLOAT -> B8G8R8X8_UNORM.
/// Convert() forwards to FlatConvertNoAlpha<B8G8R8X8_UNORM>: the call listed
/// under USE_8x2_TILE_BACKEND passes ppDsts[0..3], the other passes ppDsts[0..1].
/// NOTE(review): the embedded numbering has gaps here (958, 960, 963, 966,
/// 968+), so the template introducer, the braces and the #else/#endif between
/// the two calls are presumably not shown in this listing - confirm.
959 struct ConvertPixelsSOAtoAOS
<R32G32B32A32_FLOAT
, B8G8R8X8_UNORM
>
// ppDsts is a reference to an array of NumDests destination pointers.
961 template <size_t NumDests
>
962 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
964 #if USE_8x2_TILE_BACKEND
965 FlatConvertNoAlpha
<B8G8R8X8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
967 FlatConvertNoAlpha
<B8G8R8X8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1]);
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS for R32G32B32A32_FLOAT -> B8G8R8A8_UNORM_SRGB.
/// Convert() forwards to FlatConvert<B8G8R8A8_UNORM_SRGB>: the call listed
/// under USE_8x2_TILE_BACKEND passes ppDsts[0..3], the other passes ppDsts[0..1].
/// NOTE(review): the embedded numbering has gaps here (972, 974, 977, 980,
/// 982+), so the template introducer, the braces and the #else/#endif between
/// the two calls are presumably not shown in this listing - confirm.
973 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, B8G8R8A8_UNORM_SRGB
>
// ppDsts is a reference to an array of NumDests destination pointers.
975 template <size_t NumDests
>
976 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
978 #if USE_8x2_TILE_BACKEND
979 FlatConvert
<B8G8R8A8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
981 FlatConvert
<B8G8R8A8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1]);
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS for R32G32B32A32_FLOAT -> B8G8R8X8_UNORM_SRGB.
/// Convert() forwards to FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>: the call
/// listed under USE_8x2_TILE_BACKEND passes ppDsts[0..3], the other passes
/// ppDsts[0..1].
/// NOTE(review): the embedded numbering has gaps here (986, 988, 991, 994,
/// 996+), so the template introducer, the braces and the #else/#endif between
/// the two calls are presumably not shown in this listing - confirm.
987 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, B8G8R8X8_UNORM_SRGB
>
// ppDsts is a reference to an array of NumDests destination pointers.
989 template <size_t NumDests
>
990 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
992 #if USE_8x2_TILE_BACKEND
993 FlatConvertNoAlpha
<B8G8R8X8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
995 FlatConvertNoAlpha
<B8G8R8X8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1]);
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS for R32G32B32A32_FLOAT -> R8G8B8A8_UNORM.
/// Convert() forwards to FlatConvert<R8G8B8A8_UNORM>: the call listed under
/// USE_8x2_TILE_BACKEND passes ppDsts[0..3], the other call passes ppDsts[0..1].
/// NOTE(review): the embedded numbering has gaps here (1000, 1002, 1005, 1008,
/// 1010+), so the template introducer, the braces and the #else/#endif between
/// the two calls are presumably not shown in this listing - confirm.
1001 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8A8_UNORM
>
// ppDsts is a reference to an array of NumDests destination pointers.
1003 template <size_t NumDests
>
1004 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
1006 #if USE_8x2_TILE_BACKEND
1007 FlatConvert
<R8G8B8A8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
1009 FlatConvert
<R8G8B8A8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1]);
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS for R32G32B32A32_FLOAT -> R8G8B8X8_UNORM.
/// Convert() forwards to FlatConvertNoAlpha<R8G8B8X8_UNORM>: the call listed
/// under USE_8x2_TILE_BACKEND passes ppDsts[0..3], the other passes ppDsts[0..1].
/// NOTE(review): the embedded numbering has gaps here (1014, 1016, 1019, 1022,
/// 1024+), so the template introducer, the braces and the #else/#endif between
/// the two calls are presumably not shown in this listing - confirm.
1015 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8X8_UNORM
>
// ppDsts is a reference to an array of NumDests destination pointers.
1017 template <size_t NumDests
>
1018 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
1020 #if USE_8x2_TILE_BACKEND
1021 FlatConvertNoAlpha
<R8G8B8X8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
1023 FlatConvertNoAlpha
<R8G8B8X8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1]);
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS for R32G32B32A32_FLOAT -> R8G8B8A8_UNORM_SRGB.
/// Convert() forwards to FlatConvert<R8G8B8A8_UNORM_SRGB>: the call listed
/// under USE_8x2_TILE_BACKEND passes ppDsts[0..3], the other passes ppDsts[0..1].
/// NOTE(review): the embedded numbering has gaps here (1028, 1030, 1033, 1036,
/// 1038+), so the template introducer, the braces and the #else/#endif between
/// the two calls are presumably not shown in this listing - confirm.
1029 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8A8_UNORM_SRGB
>
// ppDsts is a reference to an array of NumDests destination pointers.
1031 template <size_t NumDests
>
1032 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
1034 #if USE_8x2_TILE_BACKEND
1035 FlatConvert
<R8G8B8A8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
1037 FlatConvert
<R8G8B8A8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1]);
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS for R32G32B32A32_FLOAT -> R8G8B8X8_UNORM_SRGB.
/// Convert() forwards to FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>: the call
/// listed under USE_8x2_TILE_BACKEND passes ppDsts[0..3], the other passes
/// ppDsts[0..1].
/// NOTE(review): the embedded numbering has gaps here (1042, 1044, 1047, 1050,
/// 1052+), so the template introducer, the braces and the #else/#endif between
/// the two calls are presumably not shown in this listing - confirm.
1043 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8X8_UNORM_SRGB
>
// ppDsts is a reference to an array of NumDests destination pointers.
1045 template <size_t NumDests
>
1046 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
1048 #if USE_8x2_TILE_BACKEND
1049 FlatConvertNoAlpha
<R8G8B8X8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
1051 FlatConvertNoAlpha
<R8G8B8X8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1]);
1056 //////////////////////////////////////////////////////////////////////////
1058 //////////////////////////////////////////////////////////////////////////
/// Generic raster tile store: Store() and Resolve() walk every pixel of the
/// tile, bounds-check it against the LOD-sized surface, fetch the float color
/// through GetSwizzledSrcColor() and write it with ConvertPixelFromFloat.
/// NOTE(review): this listing carries embedded line numbers and skips some of
/// them (for example 1061, 1068, 1086, 1109, 1125, 1146), so braces, #else /
/// #endif lines and a few declarations are not visible - confirm against the
/// real file before changing any code.
1059 template<typename TTraits
, SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1060 struct StoreRasterTile
1062 //////////////////////////////////////////////////////////////////////////
1063 /// @brief Retrieve color from hot tile source which is always float.
1064 /// @param pSrc - Pointer to raster tile.
1065 /// @param x, y - Coordinates to raster tile.
1066 /// @param output - output color
/// (the parameter the code actually declares is `outputColor`, a float[4])
1067 INLINE
static void GetSwizzledSrcColor(
// NOTE(review): embedded number 1068 is absent; the body reads `pSrc`, so its
// parameter declaration is presumably on that line.
1069 uint32_t x
, uint32_t y
,
1070 float outputColor
[4])
1072 #if USE_8x2_TILE_BACKEND
1073 typedef SimdTile_16
<SrcFormat
, DstFormat
> SimdT
;
// View the raw tile memory as an array of SIMD16 tiles.
1075 SimdT
*pSrcSimdTiles
= reinterpret_cast<SimdT
*>(pSrc
);
1077 // Compute which simd tile we're accessing within 8x8 tile.
1078 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1079 uint32_t simdIndex
= (y
/ SIMD16_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD16_TILE_X_DIM
) + (x
/ SIMD16_TILE_X_DIM
);
1081 SimdT
*pSimdTile
= &pSrcSimdTiles
[simdIndex
];
// Row-major position of the pixel inside the selected simd tile.
1083 uint32_t simdOffset
= (y
% SIMD16_TILE_Y_DIM
) * SIMD16_TILE_X_DIM
+ (x
% SIMD16_TILE_X_DIM
);
1085 pSimdTile
->GetSwizzledColor(simdOffset
, outputColor
);
// NOTE(review): SimdT is typedef'd a second time below and embedded number
// 1086 is absent - presumably the #else of USE_8x2_TILE_BACKEND sits there.
1087 typedef SimdTile
<SrcFormat
, DstFormat
> SimdT
;
1089 SimdT
* pSrcSimdTiles
= (SimdT
*)pSrc
;
1091 // Compute which simd tile we're accessing within 8x8 tile.
1092 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1093 uint32_t simdIndex
= (y
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
) + (x
/ SIMD_TILE_X_DIM
);
1095 SimdT
* pSimdTile
= &pSrcSimdTiles
[simdIndex
];
// Row-major position of the pixel inside the selected simd tile.
1097 uint32_t simdOffset
= (y
% SIMD_TILE_Y_DIM
) * SIMD_TILE_X_DIM
+ (x
% SIMD_TILE_X_DIM
);
1099 pSimdTile
->GetSwizzledColor(simdOffset
, outputColor
);
1103 //////////////////////////////////////////////////////////////////////////
1104 /// @brief Stores an 8x8 raster tile to the destination surface.
1105 /// @param pSrc - Pointer to raster tile.
1106 /// @param pDstSurface - Destination surface state
1107 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index handed to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex for both
///        array arguments of ComputeSurfaceAddress.
1108 INLINE
static void Store(
// NOTE(review): embedded number 1109 is absent; `pSrc` is presumably declared there.
1110 SWR_SURFACE_STATE
* pDstSurface
,
1111 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
) // (x, y) pixel coordinate to start of raster tile.
// Surface extent at the current LOD, never smaller than 1.
1113 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1114 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1116 // For each raster tile pixel (rx, ry)
1117 for (uint32_t ry
= 0; ry
< KNOB_TILE_Y_DIM
; ++ry
)
1119 for (uint32_t rx
= 0; rx
< KNOB_TILE_X_DIM
; ++rx
)
1121 // Perform bounds checking.
1122 if (((x
+ rx
) < lodWidth
) &&
1123 ((y
+ ry
) < lodHeight
))
// NOTE(review): embedded number 1125 is absent; `srcColor` is presumably
// declared there.
1126 GetSwizzledSrcColor(pSrc
, rx
, ry
, srcColor
);
1128 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>((x
+ rx
), (y
+ ry
),
1129 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1130 sampleNum
, pDstSurface
->lod
, pDstSurface
);
1132 ConvertPixelFromFloat
<DstFormat
>(pDst
, srcColor
);
1139 //////////////////////////////////////////////////////////////////////////
1140 /// @brief Resolves an 8x8 raster tile to the resolve destination surface.
1141 /// @param pSrc - Pointer to raster tile.
1142 /// @param pDstSurface - Destination surface state
1143 /// @param x, y - Coordinates to raster tile.
1144 /// @param sampleOffset - Offset between adjacent multisamples
/// @param renderTargetArrayIndex - Added to the resolve surface's arrayIndex
///        for both array arguments of ComputeSurfaceAddress.
1145 INLINE
static void Resolve(
// NOTE(review): embedded number 1146 is absent; `pSrc` is presumably declared there.
1147 SWR_SURFACE_STATE
* pDstSurface
,
1148 uint32_t x
, uint32_t y
, uint32_t sampleOffset
, uint32_t renderTargetArrayIndex
) // (x, y) pixel coordinate to start of raster tile.
// Surface extent at the current LOD, never smaller than 1.
1150 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1151 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
// Reciprocal computed once so the per-pixel average is a multiply.
1153 float oneOverNumSamples
= 1.0f
/ pDstSurface
->numSamples
;
1155 // For each raster tile pixel (rx, ry)
1156 for (uint32_t ry
= 0; ry
< KNOB_TILE_Y_DIM
; ++ry
)
1158 for (uint32_t rx
= 0; rx
< KNOB_TILE_X_DIM
; ++rx
)
1160 // Perform bounds checking.
1161 if (((x
+ rx
) < lodWidth
) &&
1162 ((y
+ ry
) < lodHeight
))
1164 // Sum across samples
1165 float resolveColor
[4] = {0};
1166 for (uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
1168 float sampleColor
[4] = {0};
// Each sample's tile data starts sampleOffset further into the source.
1169 uint8_t *pSampleSrc
= pSrc
+ sampleOffset
* sampleNum
;
1170 GetSwizzledSrcColor(pSampleSrc
, rx
, ry
, sampleColor
);
1171 resolveColor
[0] += sampleColor
[0];
1172 resolveColor
[1] += sampleColor
[1];
1173 resolveColor
[2] += sampleColor
[2];
1174 resolveColor
[3] += sampleColor
[3];
1177 // Divide by numSamples to average
1178 resolveColor
[0] *= oneOverNumSamples
;
1179 resolveColor
[1] *= oneOverNumSamples
;
1180 resolveColor
[2] *= oneOverNumSamples
;
1181 resolveColor
[3] *= oneOverNumSamples
;
1183 // Use the resolve surface state
// The resolve surface is taken from pDstSurface->xpAuxBaseAddress, and the
// destination address is computed for sample 0 of that surface.
1184 SWR_SURFACE_STATE
* pResolveSurface
= (SWR_SURFACE_STATE
*)pDstSurface
->xpAuxBaseAddress
;
1185 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>((x
+ rx
), (y
+ ry
),
1186 pResolveSurface
->arrayIndex
+ renderTargetArrayIndex
, pResolveSurface
->arrayIndex
+ renderTargetArrayIndex
,
1187 0, pResolveSurface
->lod
, pResolveSurface
);
1189 ConvertPixelFromFloat
<DstFormat
>(pDst
, resolveColor
);
/// Primary OptStoreRasterTile template: derives from StoreRasterTile with the
/// same template arguments.
/// NOTE(review): embedded numbers 1200-1201 are absent from this listing, so
/// the struct body is not visible here - confirm it adds nothing of its own.
1198 template<typename TTraits
, SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1199 struct OptStoreRasterTile
: StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>
1202 //////////////////////////////////////////////////////////////////////////
1203 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1204 //////////////////////////////////////////////////////////////////////////
/// Store() below sends tiles that do not fit entirely inside the LOD-sized
/// surface to GenericStoreTile::Store; full tiles are converted one simd tile
/// at a time through ConvertPixelsSOAtoAOS.
/// NOTE(review): this listing carries embedded line numbers and skips some of
/// them (1218, 1237-1239, 1253-1265, 1280-1281 among others), so braces, the
/// #else/#endif lines, a parameter and several statements are not visible -
/// confirm against the real file before changing any code.
1205 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1206 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 8>, SrcFormat
, DstFormat
>
1208 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 8>, SrcFormat
, DstFormat
> GenericStoreTile
;
// Pixel sizes in bytes, derived from the bits-per-pixel of each format.
1209 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1210 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1212 //////////////////////////////////////////////////////////////////////////
1213 /// @brief Stores an 8x8 raster tile to the destination surface.
1214 /// @param pSrc - Pointer to raster tile.
1215 /// @param pDstSurface - Destination surface state
1216 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index handed to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex for both
///        array arguments of ComputeSurfaceAddress.
1217 INLINE
static void Store(
// NOTE(review): embedded number 1218 is absent; `pSrc` is presumably declared there.
1219 SWR_SURFACE_STATE
* pDstSurface
,
1220 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1222 // Punt non-full tiles to generic store
1223 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1224 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1226 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1228 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
// Address of the tile's first pixel in the destination surface.
1231 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1232 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1233 #if USE_8x2_TILE_BACKEND
// dx: destination bytes covered by one simd16 tile row.
// dy: simd16-tile height in pitch bytes minus the tile's width in bytes.
1235 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1236 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
// NOTE(review): the four entries below are an initializer list whose
// declaration (presumably `ppDsts`, embedded 1238) is not visible here.
1240 pDst
, // row 0, col 0
1241 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1242 pDst
+ dx
/ 2, // row 0, col 1
1243 pDst
+ pDstSurface
->pitch
+ dx
/ 2 // row 1, col 1
1246 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1248 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1250 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1252 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
// NOTE(review): embedded numbers 1253-1265 are absent; the statements that
// advance ppDsts by dx / dy and the #else are presumably among them.
// The section below keeps two row pointers: the tile's first row and the row
// one pitch below it.
1266 uint8_t* ppRows
[] = { pDst
, pDst
+ pDstSurface
->pitch
};
1268 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
// Remember where this pair of rows started so it can be stepped down later.
1270 uint8_t* ppStartRows
[] = { ppRows
[0], ppRows
[1] };
1272 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
1274 // Format conversion and convert from SOA to AOS, and store the rows.
1275 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppRows
);
// Step both row pointers and the source past the simd tile just written.
1277 ppRows
[0] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1278 ppRows
[1] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1279 pSrc
+= SRC_BYTES_PER_PIXEL
* KNOB_SIMD_WIDTH
;
// Move two surface rows down from where this pair of rows started.
1282 ppRows
[0] = ppStartRows
[0] + 2 * pDstSurface
->pitch
;
1283 ppRows
[1] = ppStartRows
[1] + 2 * pDstSurface
->pitch
;
1289 //////////////////////////////////////////////////////////////////////////
1290 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1291 //////////////////////////////////////////////////////////////////////////
/// Store() below sends tiles that do not fit entirely inside the LOD-sized
/// surface to GenericStoreTile::Store; full tiles are converted one simd tile
/// at a time through ConvertPixelsSOAtoAOS.
/// NOTE(review): this listing carries embedded line numbers and skips some of
/// them (1305, 1324-1326, 1340-1352, 1367-1368 among others), so braces, the
/// #else/#endif lines, a parameter and several statements are not visible -
/// confirm against the real file before changing any code.
1292 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1293 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 16>, SrcFormat
, DstFormat
>
1295 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 16>, SrcFormat
, DstFormat
> GenericStoreTile
;
// Pixel sizes in bytes, derived from the bits-per-pixel of each format.
1296 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1297 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1299 //////////////////////////////////////////////////////////////////////////
1300 /// @brief Stores an 8x8 raster tile to the destination surface.
1301 /// @param pSrc - Pointer to raster tile.
1302 /// @param pDstSurface - Destination surface state
1303 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index handed to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex for both
///        array arguments of ComputeSurfaceAddress.
1304 INLINE
static void Store(
// NOTE(review): embedded number 1305 is absent; `pSrc` is presumably declared there.
1306 SWR_SURFACE_STATE
* pDstSurface
,
1307 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1309 // Punt non-full tiles to generic store
1310 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1311 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1313 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1315 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
// Address of the tile's first pixel in the destination surface.
1318 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1319 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1320 #if USE_8x2_TILE_BACKEND
// dx: destination bytes covered by one simd16 tile row.
// dy: simd16-tile height in pitch bytes minus the tile's width in bytes.
1322 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1323 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
// NOTE(review): the four entries below are an initializer list whose
// declaration (presumably `ppDsts`, embedded 1325) is not visible here.
1327 pDst
, // row 0, col 0
1328 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1329 pDst
+ dx
/ 2, // row 0, col 1
1330 pDst
+ pDstSurface
->pitch
+ dx
/ 2 // row 1, col 1
1333 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1335 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1337 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1339 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
// NOTE(review): embedded numbers 1340-1352 are absent; the statements that
// advance ppDsts by dx / dy and the #else are presumably among them.
// The section below keeps two row pointers: the tile's first row and the row
// one pitch below it.
1353 uint8_t* ppRows
[] = { pDst
, pDst
+ pDstSurface
->pitch
};
1355 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
// Remember where this pair of rows started so it can be stepped down later.
1357 uint8_t* ppStartRows
[] = { ppRows
[0], ppRows
[1] };
1359 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
1361 // Format conversion and convert from SOA to AOS, and store the rows.
1362 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppRows
);
// Step both row pointers and the source past the simd tile just written.
1364 ppRows
[0] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1365 ppRows
[1] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1366 pSrc
+= SRC_BYTES_PER_PIXEL
* KNOB_SIMD_WIDTH
;
// Move two surface rows down from where this pair of rows started.
1369 ppRows
[0] = ppStartRows
[0] + 2 * pDstSurface
->pitch
;
1370 ppRows
[1] = ppStartRows
[1] + 2 * pDstSurface
->pitch
;
1376 //////////////////////////////////////////////////////////////////////////
1377 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1378 //////////////////////////////////////////////////////////////////////////
/// Store() below sends tiles that do not fit entirely inside the LOD-sized
/// surface to GenericStoreTile::Store; full tiles are converted one simd tile
/// at a time through ConvertPixelsSOAtoAOS.
/// NOTE(review): this listing carries embedded line numbers and skips some of
/// them (1392, 1411-1413, 1427-1439, 1454-1455 among others), so braces, the
/// #else/#endif lines, a parameter and several statements are not visible -
/// confirm against the real file before changing any code.
1379 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1380 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 32>, SrcFormat
, DstFormat
>
1382 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 32>, SrcFormat
, DstFormat
> GenericStoreTile
;
// Pixel sizes in bytes, derived from the bits-per-pixel of each format.
1383 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1384 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1386 //////////////////////////////////////////////////////////////////////////
1387 /// @brief Stores an 8x8 raster tile to the destination surface.
1388 /// @param pSrc - Pointer to raster tile.
1389 /// @param pDstSurface - Destination surface state
1390 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index handed to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex for both
///        array arguments of ComputeSurfaceAddress.
1391 INLINE
static void Store(
// NOTE(review): embedded number 1392 is absent; `pSrc` is presumably declared there.
1393 SWR_SURFACE_STATE
* pDstSurface
,
1394 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1396 // Punt non-full tiles to generic store
1397 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1398 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1400 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1402 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
// Address of the tile's first pixel in the destination surface.
1405 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1406 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1407 #if USE_8x2_TILE_BACKEND
// dx: destination bytes covered by one simd16 tile row.
// dy: simd16-tile height in pitch bytes minus the tile's width in bytes.
1409 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1410 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
// NOTE(review): the four entries below are an initializer list whose
// declaration (presumably `ppDsts`, embedded 1412) is not visible here.
1414 pDst
, // row 0, col 0
1415 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1416 pDst
+ dx
/ 2, // row 0, col 1
1417 pDst
+ pDstSurface
->pitch
+ dx
/ 2 // row 1, col 1
1420 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1422 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1424 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1426 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
// NOTE(review): embedded numbers 1427-1439 are absent; the statements that
// advance ppDsts by dx / dy and the #else are presumably among them.
// The section below keeps two row pointers: the tile's first row and the row
// one pitch below it.
1440 uint8_t* ppRows
[] = { pDst
, pDst
+ pDstSurface
->pitch
};
1442 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
// Remember where this pair of rows started so it can be stepped down later.
1444 uint8_t* ppStartRows
[] = { ppRows
[0], ppRows
[1] };
1446 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
1448 // Format conversion and convert from SOA to AOS, and store the rows.
1449 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppRows
);
// Step both row pointers and the source past the simd tile just written.
1451 ppRows
[0] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1452 ppRows
[1] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1453 pSrc
+= SRC_BYTES_PER_PIXEL
* KNOB_SIMD_WIDTH
;
// Move two surface rows down from where this pair of rows started.
1456 ppRows
[0] = ppStartRows
[0] + 2 * pDstSurface
->pitch
;
1457 ppRows
[1] = ppStartRows
[1] + 2 * pDstSurface
->pitch
;
1463 //////////////////////////////////////////////////////////////////////////
1464 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1465 //////////////////////////////////////////////////////////////////////////
/// Store() below sends tiles that do not fit entirely inside the LOD-sized
/// surface to GenericStoreTile::Store; full tiles are converted through
/// ConvertPixelsSOAtoAOS with each destination row split into
/// MAX_DST_COLUMN_BYTES-wide columns.
/// NOTE(review): this listing carries embedded line numbers and skips some of
/// them (1476-1477, 1484, 1506-1508, 1529-1535, 1545-1551 among others), so
/// braces, #else/#endif lines, a parameter, array declarations and several
/// statements are not visible - confirm against the real file before changing
/// any code.
1466 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1467 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 64>, SrcFormat
, DstFormat
>
1469 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 64>, SrcFormat
, DstFormat
> GenericStoreTile
;
// Pixel sizes in bytes, derived from the bits-per-pixel of each format.
1470 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1471 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
// Width in bytes of one destination column span (see the 16B note further down).
1472 static const size_t MAX_DST_COLUMN_BYTES
= 16;
1473 #if !USE_8x2_TILE_BACKEND
// Per-simd-tile strides used by the non-8x2 loop at the end of Store().
1474 static const size_t SRC_COLUMN_BYTES
= KNOB_SIMD_WIDTH
* SRC_BYTES_PER_PIXEL
;
1475 static const size_t DST_COLUMN_BYTES_PER_SRC
= KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1478 //////////////////////////////////////////////////////////////////////////
1479 /// @brief Stores an 8x8 raster tile to the destination surface.
1480 /// @param pSrc - Pointer to raster tile.
1481 /// @param pDstSurface - Destination surface state
1482 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index handed to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex for both
///        array arguments of ComputeSurfaceAddress.
1483 INLINE
static void Store(
// NOTE(review): embedded number 1484 is absent; `pSrc` is presumably declared there.
1485 SWR_SURFACE_STATE
* pDstSurface
,
1486 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1488 // Punt non-full tiles to generic store
1489 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1490 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1492 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1494 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
// Address of the tile's first pixel in the destination surface.
1497 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1498 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1499 #if USE_8x2_TILE_BACKEND
// dx: destination bytes covered by one simd16 tile row.
// dy: simd16-tile height expressed in pitch bytes.
1501 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1502 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
;
1504 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1505 static_assert(dx
== MAX_DST_COLUMN_BYTES
* 4, "Invalid column offsets");
// NOTE(review): the eight entries below (two rows x four 16-byte columns) are
// an initializer list whose declaration (presumably `ppDsts`, embedded 1507)
// is not visible here.
1509 pDst
, // row 0, col 0
1510 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1511 pDst
+ MAX_DST_COLUMN_BYTES
, // row 0, col 1
1512 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
, // row 1, col 1
1513 pDst
+ MAX_DST_COLUMN_BYTES
* 2, // row 0, col 2
1514 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 2, // row 1, col 2
1515 pDst
+ MAX_DST_COLUMN_BYTES
* 3, // row 0, col 3
1516 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 3 // row 1, col 3
1519 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1521 // Raster tile width is same as simd16 tile width
1522 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1524 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1526 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
// NOTE(review): the body of the loop below (embedded 1529-1531) is absent; it
// presumably advances every ppDsts entry by dy.
1528 for (uint32_t i
= 0; i
< ARRAY_SIZE(ppDsts
); i
+= 1)
// NOTE(review): embedded numbers 1532-1535 are absent; the #else and the
// declaration that owns the four entries below are presumably among them.
1536 pDst
, // row 0, col 0
1537 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1538 pDst
+ MAX_DST_COLUMN_BYTES
, // row 0, col 1
1539 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
, // row 1, col 1
1542 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
// NOTE(review): the initializer of ppStartRows (embedded 1545-1551) is absent;
// entries [0..3] are read back after the column loop.
1544 uint8_t* ppStartRows
[] =
1552 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
1554 // Format conversion and convert from SOA to AOS, and store the rows.
1555 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// Step all four destination pointers and the source past the simd tile just written.
1557 ppDsts
[0] += DST_COLUMN_BYTES_PER_SRC
;
1558 ppDsts
[1] += DST_COLUMN_BYTES_PER_SRC
;
1559 ppDsts
[2] += DST_COLUMN_BYTES_PER_SRC
;
1560 ppDsts
[3] += DST_COLUMN_BYTES_PER_SRC
;
1561 pSrc
+= SRC_COLUMN_BYTES
;
// Move two surface rows down from where this group of rows started.
1564 ppDsts
[0] = ppStartRows
[0] + 2 * pDstSurface
->pitch
;
1565 ppDsts
[1] = ppStartRows
[1] + 2 * pDstSurface
->pitch
;
1566 ppDsts
[2] = ppStartRows
[2] + 2 * pDstSurface
->pitch
;
1567 ppDsts
[3] = ppStartRows
[3] + 2 * pDstSurface
->pitch
;
1573 //////////////////////////////////////////////////////////////////////////
1574 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1575 //////////////////////////////////////////////////////////////////////////
1576 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1577 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 128>, SrcFormat
, DstFormat
>
1579 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 128>, SrcFormat
, DstFormat
> GenericStoreTile
;
1580 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1581 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1582 static const size_t MAX_DST_COLUMN_BYTES
= 16;
1583 #if !USE_8x2_TILE_BACKEND
1584 static const size_t SRC_COLUMN_BYTES
= KNOB_SIMD_WIDTH
* SRC_BYTES_PER_PIXEL
;
1585 static const size_t DST_COLUMN_BYTES_PER_SRC
= KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1588 //////////////////////////////////////////////////////////////////////////
1589 /// @brief Stores an 8x8 raster tile to the destination surface.
1590 /// @param pSrc - Pointer to raster tile.
1591 /// @param pDstSurface - Destination surface state
1592 /// @param x, y - Coordinates to raster tile.
1593 INLINE
static void Store(
1595 SWR_SURFACE_STATE
* pDstSurface
,
1596 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1598 // Punt non-full tiles to generic store
1599 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1600 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1602 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1604 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1607 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1608 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1609 #if USE_8x2_TILE_BACKEND
1611 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1612 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
;
1614 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1615 static_assert(dx
== MAX_DST_COLUMN_BYTES
* 8, "Invalid column offsets");
1619 pDst
, // row 0, col 0
1620 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1621 pDst
+ MAX_DST_COLUMN_BYTES
, // row 0, col 1
1622 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
, // row 1, col 1
1623 pDst
+ MAX_DST_COLUMN_BYTES
* 2, // row 0, col 2
1624 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 2, // row 1, col 2
1625 pDst
+ MAX_DST_COLUMN_BYTES
* 3, // row 0, col 3
1626 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 3, // row 1, col 3
1627 pDst
+ MAX_DST_COLUMN_BYTES
* 4, // row 0, col 4
1628 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 4, // row 1, col 4
1629 pDst
+ MAX_DST_COLUMN_BYTES
* 5, // row 0, col 5
1630 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 5, // row 1, col 5
1631 pDst
+ MAX_DST_COLUMN_BYTES
* 6, // row 0, col 6
1632 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 6, // row 1, col 6
1633 pDst
+ MAX_DST_COLUMN_BYTES
* 7, // row 0, col 7
1634 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 7, // row 1, col 7
1637 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1639 // Raster tile width is same as simd16 tile width
1640 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1642 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1644 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1646 for (uint32_t i
= 0; i
< ARRAY_SIZE(ppDsts
); i
+= 1)
1657 // Need 8 pointers, 4 columns of 2 rows each
1658 for (uint32_t y
= 0; y
< 2; ++y
)
1660 for (uint32_t x
= 0; x
< 4; ++x
)
1662 ptrs
.ppDsts
[x
* 2 + y
] = pDst
+ y
* pDstSurface
->pitch
+ x
* MAX_DST_COLUMN_BYTES
;
1666 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
1668 DstPtrs startPtrs
= ptrs
;
1670 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
1672 // Format conversion and convert from SOA to AOS, and store the rows.
1673 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ptrs
.ppDsts
);
1675 ptrs
.ppDsts
[0] += DST_COLUMN_BYTES_PER_SRC
;
1676 ptrs
.ppDsts
[1] += DST_COLUMN_BYTES_PER_SRC
;
1677 ptrs
.ppDsts
[2] += DST_COLUMN_BYTES_PER_SRC
;
1678 ptrs
.ppDsts
[3] += DST_COLUMN_BYTES_PER_SRC
;
1679 ptrs
.ppDsts
[4] += DST_COLUMN_BYTES_PER_SRC
;
1680 ptrs
.ppDsts
[5] += DST_COLUMN_BYTES_PER_SRC
;
1681 ptrs
.ppDsts
[6] += DST_COLUMN_BYTES_PER_SRC
;
1682 ptrs
.ppDsts
[7] += DST_COLUMN_BYTES_PER_SRC
;
1683 pSrc
+= SRC_COLUMN_BYTES
;
1686 ptrs
.ppDsts
[0] = startPtrs
.ppDsts
[0] + 2 * pDstSurface
->pitch
;
1687 ptrs
.ppDsts
[1] = startPtrs
.ppDsts
[1] + 2 * pDstSurface
->pitch
;
1688 ptrs
.ppDsts
[2] = startPtrs
.ppDsts
[2] + 2 * pDstSurface
->pitch
;
1689 ptrs
.ppDsts
[3] = startPtrs
.ppDsts
[3] + 2 * pDstSurface
->pitch
;
1690 ptrs
.ppDsts
[4] = startPtrs
.ppDsts
[4] + 2 * pDstSurface
->pitch
;
1691 ptrs
.ppDsts
[5] = startPtrs
.ppDsts
[5] + 2 * pDstSurface
->pitch
;
1692 ptrs
.ppDsts
[6] = startPtrs
.ppDsts
[6] + 2 * pDstSurface
->pitch
;
1693 ptrs
.ppDsts
[7] = startPtrs
.ppDsts
[7] + 2 * pDstSurface
->pitch
;
1699 //////////////////////////////////////////////////////////////////////////
1700 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1701 //////////////////////////////////////////////////////////////////////////
1702 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1703 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 8>, SrcFormat
, DstFormat
>
1705 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 8>, SrcFormat
, DstFormat
> GenericStoreTile
;
1706 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1708 //////////////////////////////////////////////////////////////////////////
1709 /// @brief Stores an 8x8 raster tile to the destination surface.
1710 /// @param pSrc - Pointer to raster tile.
1711 /// @param pDstSurface - Destination surface state
1712 /// @param x, y - Coordinates to raster tile.
1713 INLINE
static void Store(
1715 SWR_SURFACE_STATE
* pDstSurface
,
1716 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1718 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
1720 // Punt non-full tiles to generic store
1721 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1722 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1724 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1726 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1729 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1730 // We can compute the offsets to each column within the raster tile once and increment from these.
1731 #if USE_8x2_TILE_BACKEND
1732 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1733 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1734 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1736 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
1738 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1742 pDst
+ DestRowWidthBytes
,
1743 pDst
+ DestRowWidthBytes
/ 4,
1744 pDst
+ DestRowWidthBytes
+ DestRowWidthBytes
/ 4
1747 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1749 // Raster tile width is same as simd16 tile width
1750 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1752 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1754 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1762 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1763 uint8_t* pCol0
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1764 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1766 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1767 uint32_t pSrcInc
= (FormatTraits
<SrcFormat
>::bpp
* KNOB_SIMD_WIDTH
) / 8;
1769 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1770 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
; row
+= SIMD_TILE_Y_DIM
)
1772 uint32_t rowOffset
= row
* DestRowWidthBytes
;
1774 uint8_t* pRow
= pCol0
+ rowOffset
;
1775 uint8_t* ppDsts
[] = { pRow
, pRow
+ DestRowWidthBytes
};
1777 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1780 ppDsts
[0] += DestRowWidthBytes
/ 4;
1781 ppDsts
[1] += DestRowWidthBytes
/ 4;
1783 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1790 //////////////////////////////////////////////////////////////////////////
1791 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1792 //////////////////////////////////////////////////////////////////////////
1793 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1794 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 16>, SrcFormat
, DstFormat
>
1796 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 16>, SrcFormat
, DstFormat
> GenericStoreTile
;
1797 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1799 //////////////////////////////////////////////////////////////////////////
1800 /// @brief Stores an 8x8 raster tile to the destination surface.
1801 /// @param pSrc - Pointer to raster tile.
1802 /// @param pDstSurface - Destination surface state
1803 /// @param x, y - Coordinates to raster tile.
1804 INLINE
static void Store(
1806 SWR_SURFACE_STATE
* pDstSurface
,
1807 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1809 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
1811 // Punt non-full tiles to generic store
1812 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1813 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1815 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1817 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1820 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1821 // We can compute the offsets to each column within the raster tile once and increment from these.
1822 #if USE_8x2_TILE_BACKEND
1823 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1824 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1825 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1827 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
1829 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1833 pDst
+ DestRowWidthBytes
,
1834 pDst
+ DestRowWidthBytes
/ 2,
1835 pDst
+ DestRowWidthBytes
+ DestRowWidthBytes
/ 2
1838 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1840 // Raster tile width is same as simd16 tile width
1841 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1843 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1845 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1853 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1854 uint8_t* pCol0
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1855 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1857 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1858 uint32_t pSrcInc
= (FormatTraits
<SrcFormat
>::bpp
* KNOB_SIMD_WIDTH
) / 8;
1860 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1861 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
; row
+= SIMD_TILE_Y_DIM
)
1863 uint32_t rowOffset
= row
* DestRowWidthBytes
;
1865 uint8_t* pRow
= pCol0
+ rowOffset
;
1866 uint8_t* ppDsts
[] = { pRow
, pRow
+ DestRowWidthBytes
};
1868 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1871 ppDsts
[0] += DestRowWidthBytes
/ 2;
1872 ppDsts
[1] += DestRowWidthBytes
/ 2;
1874 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1881 //////////////////////////////////////////////////////////////////////////
1882 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1883 //////////////////////////////////////////////////////////////////////////
1884 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1885 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_XMAJOR
, 32>, SrcFormat
, DstFormat
>
1887 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_XMAJOR
, 32>, SrcFormat
, DstFormat
> GenericStoreTile
;
1888 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1889 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1891 //////////////////////////////////////////////////////////////////////////
1892 /// @brief Stores an 8x8 raster tile to the destination surface.
1893 /// @param pSrc - Pointer to raster tile.
1894 /// @param pDstSurface - Destination surface state
1895 /// @param x, y - Coordinates to raster tile.
1896 INLINE
static void Store(
1898 SWR_SURFACE_STATE
* pDstSurface
,
1899 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1901 static const uint32_t DestRowWidthBytes
= 512; // 512B rows
1903 // Punt non-full tiles to generic store
1904 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1905 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1907 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1909 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1912 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1913 // We can compute the offsets to each column within the raster tile once and increment from these.
1914 #if USE_8x2_TILE_BACKEND
1915 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1916 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1918 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1919 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1923 pDst
, // row 0, col 0
1924 pDst
+ DestRowWidthBytes
, // row 1, col 0
1925 pDst
+ dx
/ 2, // row 0, col 1
1926 pDst
+ DestRowWidthBytes
+ dx
/ 2 // row 1, col 1
1929 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1931 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1933 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1935 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1949 uint8_t *pRow0
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1950 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1951 uint8_t* pRow1
= pRow0
+ DestRowWidthBytes
;
1953 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
; row
+= SIMD_TILE_Y_DIM
)
1955 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
; col
+= SIMD_TILE_X_DIM
)
1957 uint32_t xRowOffset
= col
* (FormatTraits
<DstFormat
>::bpp
/ 8);
1959 uint8_t* ppDsts
[] = { pRow0
+ xRowOffset
, pRow1
+ xRowOffset
};
1960 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1962 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1963 pSrc
+= (FormatTraits
<SrcFormat
>::bpp
* KNOB_SIMD_WIDTH
) / 8;
1966 pRow0
+= (DestRowWidthBytes
* 2);
1967 pRow1
+= (DestRowWidthBytes
* 2);
1973 //////////////////////////////////////////////////////////////////////////
1974 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1975 //////////////////////////////////////////////////////////////////////////
1976 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1977 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 32>, SrcFormat
, DstFormat
>
1979 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 32>, SrcFormat
, DstFormat
> GenericStoreTile
;
1980 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1982 //////////////////////////////////////////////////////////////////////////
1983 /// @brief Stores an 8x8 raster tile to the destination surface.
1984 /// @param pSrc - Pointer to raster tile.
1985 /// @param pDstSurface - Destination surface state
1986 /// @param x, y - Coordinates to raster tile.
1987 INLINE
static void Store(
1989 SWR_SURFACE_STATE
* pDstSurface
,
1990 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1992 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
1993 static const uint32_t DestColumnBytes
= DestRowWidthBytes
* 32; // 16B x 32 rows.
1995 // Punt non-full tiles to generic store
1996 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1997 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1999 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
2001 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
2004 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2005 // We can compute the offsets to each column within the raster tile once and increment from these.
2006 #if USE_8x2_TILE_BACKEND
2007 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2008 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2009 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
2011 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2012 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
2014 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2017 pDst
, // row 0, col 0
2018 pDst
+ DestRowWidthBytes
, // row 1, col 0
2019 pDst
+ DestColumnBytes
, // row 0, col 1
2020 pDst
+ DestRowWidthBytes
+ DestColumnBytes
// row 1, col 1
2023 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
2025 // Raster tile width is same as simd16 tile width
2026 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
2028 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
2030 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
2038 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2039 uint8_t* pCol0
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2040 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
2042 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2043 uint32_t pSrcInc
= (FormatTraits
<SrcFormat
>::bpp
* KNOB_SIMD_WIDTH
) / 8;
2045 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2046 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
; row
+= SIMD_TILE_Y_DIM
)
2048 uint32_t rowOffset
= row
* DestRowWidthBytes
;
2050 uint8_t* pRow
= pCol0
+ rowOffset
;
2051 uint8_t* ppDsts
[] = { pRow
, pRow
+ DestRowWidthBytes
};
2053 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
2056 ppDsts
[0] += DestColumnBytes
;
2057 ppDsts
[1] += DestColumnBytes
;
2059 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
2066 //////////////////////////////////////////////////////////////////////////
2067 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
2068 //////////////////////////////////////////////////////////////////////////
2069 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
2070 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 64>, SrcFormat
, DstFormat
>
2072 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 64>, SrcFormat
, DstFormat
> GenericStoreTile
;
2073 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
2075 //////////////////////////////////////////////////////////////////////////
2076 /// @brief Stores an 8x8 raster tile to the destination surface.
2077 /// @param pSrc - Pointer to raster tile.
2078 /// @param pDstSurface - Destination surface state
2079 /// @param x, y - Coordinates to raster tile.
2080 INLINE
static void Store(
2082 SWR_SURFACE_STATE
* pDstSurface
,
2083 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
2085 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
2086 static const uint32_t DestColumnBytes
= DestRowWidthBytes
* 32; // 16B x 32 rows.
2088 // Punt non-full tiles to generic store
2089 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
2090 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
2092 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
2094 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
2097 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2098 // We can compute the offsets to each column within the raster tile once and increment from these.
2099 #if USE_8x2_TILE_BACKEND
2100 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2101 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2102 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
2104 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2105 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
2107 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2110 pDst
, // row 0, col 0
2111 pDst
+ DestRowWidthBytes
, // row 1, col 0
2112 pDst
+ DestColumnBytes
, // row 0, col 1
2113 pDst
+ DestRowWidthBytes
+ DestColumnBytes
, // row 1, col 1
2114 pDst
+ DestColumnBytes
* 2, // row 0, col 2
2115 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 2, // row 1, col 2
2116 pDst
+ DestColumnBytes
* 3, // row 0, col 3
2117 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 3 // row 1, col 3
2120 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
2122 // Raster tile width is same as simd16 tile width
2123 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
2125 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
2127 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
2129 for (uint32_t i
= 0; i
< ARRAY_SIZE(ppDsts
); i
+= 1)
2135 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2136 uint8_t* pCol0
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2137 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
2138 uint8_t* pCol1
= pCol0
+ DestColumnBytes
;
2140 // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
2141 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2142 uint32_t pSrcInc
= (FormatTraits
<SrcFormat
>::bpp
* KNOB_SIMD_WIDTH
) / 8;
2144 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2145 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
; row
+= SIMD_TILE_Y_DIM
)
2147 uint32_t rowOffset
= row
* DestRowWidthBytes
;
2151 pCol0
+ rowOffset
+ DestRowWidthBytes
,
2153 pCol1
+ rowOffset
+ DestRowWidthBytes
,
2156 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
2159 ppDsts
[0] += DestColumnBytes
* 2;
2160 ppDsts
[1] += DestColumnBytes
* 2;
2161 ppDsts
[2] += DestColumnBytes
* 2;
2162 ppDsts
[3] += DestColumnBytes
* 2;
2164 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
2171 //////////////////////////////////////////////////////////////////////////
2172 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
2173 //////////////////////////////////////////////////////////////////////////
2174 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
2175 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 128>, SrcFormat
, DstFormat
>
2177 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 128>, SrcFormat
, DstFormat
> GenericStoreTile
;
2178 #if USE_8x2_TILE_BACKEND
2179 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
2182 static const size_t TILE_Y_COL_WIDTH_BYTES
= 16;
2183 static const size_t TILE_Y_ROWS
= 32;
2184 static const size_t TILE_Y_COL_BYTES
= TILE_Y_ROWS
* TILE_Y_COL_WIDTH_BYTES
;
2186 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
2187 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
2188 static const size_t MAX_DST_COLUMN_BYTES
= 16;
2190 static const size_t SRC_COLUMN_BYTES
= KNOB_SIMD_WIDTH
* SRC_BYTES_PER_PIXEL
;
2191 static const size_t DST_COLUMN_BYTES_PER_SRC
= TILE_Y_COL_BYTES
* 4;
2194 //////////////////////////////////////////////////////////////////////////
2195 /// @brief Stores an 8x8 raster tile to the destination surface.
2196 /// @param pSrc - Pointer to raster tile.
2197 /// @param pDstSurface - Destination surface state
2198 /// @param x, y - Coordinates to raster tile.
2199 INLINE
static void Store(
2201 SWR_SURFACE_STATE
* pDstSurface
,
2202 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
2204 #if USE_8x2_TILE_BACKEND
2205 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
2206 static const uint32_t DestColumnBytes
= DestRowWidthBytes
* 32; // 16B x 32 rows.
2209 // Punt non-full tiles to generic store
2210 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
2211 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
2213 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
2215 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
2218 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2219 // We can compute the offsets to each column within the raster tile once and increment from these.
2220 #if USE_8x2_TILE_BACKEND
2221 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2222 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2223 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
2225 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2226 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
2228 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2231 pDst
, // row 0, col 0
2232 pDst
+ DestRowWidthBytes
, // row 1, col 0
2233 pDst
+ DestColumnBytes
, // row 0, col 1
2234 pDst
+ DestRowWidthBytes
+ DestColumnBytes
, // row 1, col 1
2235 pDst
+ DestColumnBytes
* 2, // row 0, col 2
2236 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 2, // row 1, col 2
2237 pDst
+ DestColumnBytes
* 3, // row 0, col 3
2238 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 3, // row 1, col 3
2239 pDst
+ DestColumnBytes
* 4, // row 0, col 4
2240 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 4, // row 1, col 4
2241 pDst
+ DestColumnBytes
* 5, // row 0, col 5
2242 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 5, // row 1, col 5
2243 pDst
+ DestColumnBytes
* 6, // row 0, col 6
2244 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 6, // row 1, col 6
2245 pDst
+ DestColumnBytes
* 7, // row 0, col 7
2246 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 7 // row 1, col 7
2249 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
2251 // Raster tile width is same as simd16 tile width
2252 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
2254 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
2256 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
2258 for (uint32_t i
= 0; i
< ARRAY_SIZE(ppDsts
); i
+= 1)
2264 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2265 uint8_t* pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2266 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
2272 // Need 8 pointers, 4 columns of 2 rows each
2273 for (uint32_t y
= 0; y
< 2; ++y
)
2275 for (uint32_t x
= 0; x
< 4; ++x
)
2277 ptrs
.ppDsts
[x
* 2 + y
] = pDst
+ y
* TILE_Y_COL_WIDTH_BYTES
+ x
* TILE_Y_COL_BYTES
;
2281 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
2283 DstPtrs startPtrs
= ptrs
;
2285 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
2287 // Format conversion and convert from SOA to AOS, and store the rows.
2288 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ptrs
.ppDsts
);
2290 ptrs
.ppDsts
[0] += DST_COLUMN_BYTES_PER_SRC
;
2291 ptrs
.ppDsts
[1] += DST_COLUMN_BYTES_PER_SRC
;
2292 ptrs
.ppDsts
[2] += DST_COLUMN_BYTES_PER_SRC
;
2293 ptrs
.ppDsts
[3] += DST_COLUMN_BYTES_PER_SRC
;
2294 ptrs
.ppDsts
[4] += DST_COLUMN_BYTES_PER_SRC
;
2295 ptrs
.ppDsts
[5] += DST_COLUMN_BYTES_PER_SRC
;
2296 ptrs
.ppDsts
[6] += DST_COLUMN_BYTES_PER_SRC
;
2297 ptrs
.ppDsts
[7] += DST_COLUMN_BYTES_PER_SRC
;
2298 pSrc
+= SRC_COLUMN_BYTES
;
2301 ptrs
.ppDsts
[0] = startPtrs
.ppDsts
[0] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2302 ptrs
.ppDsts
[1] = startPtrs
.ppDsts
[1] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2303 ptrs
.ppDsts
[2] = startPtrs
.ppDsts
[2] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2304 ptrs
.ppDsts
[3] = startPtrs
.ppDsts
[3] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2305 ptrs
.ppDsts
[4] = startPtrs
.ppDsts
[4] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2306 ptrs
.ppDsts
[5] = startPtrs
.ppDsts
[5] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2307 ptrs
.ppDsts
[6] = startPtrs
.ppDsts
[6] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2308 ptrs
.ppDsts
[7] = startPtrs
.ppDsts
[7] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2314 //////////////////////////////////////////////////////////////////////////
2315 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
2316 //////////////////////////////////////////////////////////////////////////
// StoreMacroTile walks a macro tile in raster-tile sized steps and hands each
// raster tile, once per sample, to a raster-tile store routine instantiated for
// <TTraits, SrcFormat, DstFormat>. The template arguments are forwarded
// unchanged to StoreRasterTile / OptStoreRasterTile.
2317 template<typename TTraits
, SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
2318 struct StoreMacroTile
2320 //////////////////////////////////////////////////////////////////////////
2321 /// @brief Stores a macrotile to the destination surface using safe implementation.
///        Every raster tile and sample goes through StoreRasterTile<...>::Store;
///        OptStoreRasterTile is never used on this path.
2322 /// @param pSrcHotTile - Pointer to the macro tile source data.
2323 /// @param pDstSurface - Destination surface state
2324 /// @param x, y - Coordinates to macro tile
/// @param renderTargetArrayIndex - Passed through to the raster tile store call.
2325 static void StoreGeneric(
2326 uint8_t *pSrcHotTile
,
2327 SWR_SURFACE_STATE
* pDstSurface
,
2328 uint32_t x
, uint32_t y
, uint32_t renderTargetArrayIndex
)
2330 PFN_STORE_TILES_INTERNAL pfnStore
;
2331 pfnStore
= StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Store
;
2333 // Store each raster tile from the hot tile to the destination surface.
// Loop nesting is row -> column -> sample; the source pointer is consumed in
// exactly that order.
2334 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
2336 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
2338 for (uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
2340 pfnStore(pSrcHotTile
, pDstSurface
, (x
+ col
), (y
+ row
), sampleNum
, renderTargetArrayIndex
);
// Step past the block just stored: tile width * tile height * (bpp / 8) bytes
// of SrcFormat data (presumably bpp is in bits, so bpp / 8 is bytes per pixel).
2341 pSrcHotTile
+= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<SrcFormat
>::bpp
/ 8);
// Signature shared by the raster-tile store routines used in this struct:
// (source pointer, destination surface, x, y, sample number, render target array index).
2348 typedef void(*PFN_STORE_TILES_INTERNAL
)(uint8_t*, SWR_SURFACE_STATE
*, uint32_t, uint32_t, uint32_t, uint32_t);
2349 //////////////////////////////////////////////////////////////////////////
2350 /// @brief Stores a macrotile to the destination surface.
///        A store routine is chosen per sample up front (generic or optimized),
///        then every raster tile is stored; if the surface has a non-zero
///        xpAuxBaseAddress, a Resolve pass over the same source data follows.
2351 /// @param pSrcHotTile - Pointer to the macro tile source data.
2352 /// @param pDstSurface - Destination surface state
2353 /// @param x, y - Coordinates to macro tile
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex for the
///        address check and passed through to the raster tile store/resolve calls.
2355 uint8_t *pSrcHotTile
,
2356 SWR_SURFACE_STATE
* pDstSurface
,
2357 uint32_t x
, uint32_t y
, uint32_t renderTargetArrayIndex
)
// One store routine per sample.
// NOTE(review): indexed by sampleNum < pDstSurface->numSamples below; assumes
// numSamples never exceeds SWR_MAX_NUM_MULTISAMPLES - TODO confirm.
2359 PFN_STORE_TILES_INTERNAL pfnStore
[SWR_MAX_NUM_MULTISAMPLES
];
2361 for (uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
// The address is only used for the alignment test that follows.
2363 size_t dstSurfAddress
= (size_t)ComputeSurfaceAddress
<false, false>(
2366 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, // z for 3D surfaces
2367 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, // array index for 2D arrays
2372 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
// i.e. force the generic routine when the surface is tiled and the low 12
// address bits are non-zero, or when samples are interleaved.
2373 bool bForceGeneric
= ((pDstSurface
->tileMode
!= SWR_TILE_NONE
) && (0 != (dstSurfAddress
& 0xfff))) ||
2374 (pDstSurface
->bInterleavedSamples
);
// KNOB_USE_GENERIC_STORETILE also selects the generic routine; otherwise the
// optimized OptStoreRasterTile routine is used for this sample.
2376 pfnStore
[sampleNum
] = (bForceGeneric
|| KNOB_USE_GENERIC_STORETILE
) ? StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Store
: OptStoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Store
;
2379 // Save original for pSrcHotTile resolve.
// (pSrcHotTile itself is advanced by the store loop below.)
2380 uint8_t *pResolveSrcHotTile
= pSrcHotTile
;
2382 // Store each raster tile from the hot tile to the destination surface.
2383 for(uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
2385 for(uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
2387 for(uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
// Dispatch through the routine selected for this sample above.
2389 pfnStore
[sampleNum
](pSrcHotTile
, pDstSurface
, (x
+ col
), (y
+ row
), sampleNum
, renderTargetArrayIndex
);
// Same per-tile, per-sample stride as in StoreGeneric.
2390 pSrcHotTile
+= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<SrcFormat
>::bpp
/ 8);
// Optional second pass, taken only when xpAuxBaseAddress is non-zero.
2395 if (pDstSurface
->xpAuxBaseAddress
)
// Byte size of one raster tile of SrcFormat data for a single sample.
2397 uint32_t sampleOffset
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<SrcFormat
>::bpp
/ 8);
2398 // Resolve each raster tile, starting again from the saved source pointer.
2399 for(uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
2401 for(uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
2403 StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Resolve(pResolveSrcHotTile
, pDstSurface
, (x
+ col
), (y
+ row
), sampleOffset
, renderTargetArrayIndex
);
// Resolve is called once per raster tile, so skip all of that tile's samples.
2404 pResolveSrcHotTile
+= sampleOffset
* pDstSurface
->numSamples
;
2411 //////////////////////////////////////////////////////////////////////////
2412 /// InitStoreTilesTable - Helper for setting up the tables.
2413 template <SWR_TILE_MODE TTileMode
, size_t NumTileModesT
, size_t ArraySizeT
>
2414 void InitStoreTilesTableColor_Half1(
2415 PFN_STORE_TILES (&table
)[NumTileModesT
][ArraySizeT
])
2417 table
[TTileMode
][R32G32B32A32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_FLOAT
>::Store
;
2418 table
[TTileMode
][R32G32B32A32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_SINT
>::Store
;
2419 table
[TTileMode
][R32G32B32A32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_UINT
>::Store
;
2420 table
[TTileMode
][R32G32B32X32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32X32_FLOAT
>::Store
;
2421 table
[TTileMode
][R32G32B32A32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_SSCALED
>::Store
;
2422 table
[TTileMode
][R32G32B32A32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_USCALED
>::Store
;
2423 table
[TTileMode
][R32G32B32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_FLOAT
>::Store
;
2424 table
[TTileMode
][R32G32B32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_SINT
>::Store
;
2425 table
[TTileMode
][R32G32B32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_UINT
>::Store
;
2426 table
[TTileMode
][R32G32B32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_SSCALED
>::Store
;
2427 table
[TTileMode
][R32G32B32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_USCALED
>::Store
;
2428 table
[TTileMode
][R16G16B16A16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_UNORM
>::Store
;
2429 table
[TTileMode
][R16G16B16A16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_SNORM
>::Store
;
2430 table
[TTileMode
][R16G16B16A16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_SINT
>::Store
;
2431 table
[TTileMode
][R16G16B16A16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_UINT
>::Store
;
2432 table
[TTileMode
][R16G16B16A16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_FLOAT
>::Store
;
2433 table
[TTileMode
][R32G32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_FLOAT
>::Store
;
2434 table
[TTileMode
][R32G32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_SINT
>::Store
;
2435 table
[TTileMode
][R32G32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_UINT
>::Store
;
2436 table
[TTileMode
][R32_FLOAT_X8X24_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32_FLOAT_X8X24_TYPELESS
>::Store
;
2437 table
[TTileMode
][X32_TYPELESS_G8X24_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, X32_TYPELESS_G8X24_UINT
>::Store
;
2438 table
[TTileMode
][R16G16B16X16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16X16_UNORM
>::Store
;
2439 table
[TTileMode
][R16G16B16X16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16X16_FLOAT
>::Store
;
2440 table
[TTileMode
][R16G16B16A16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_SSCALED
>::Store
;
2441 table
[TTileMode
][R16G16B16A16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_USCALED
>::Store
;
2442 table
[TTileMode
][R32G32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_SSCALED
>::Store
;
2443 table
[TTileMode
][R32G32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_USCALED
>::Store
;
2444 table
[TTileMode
][B8G8R8A8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8A8_UNORM
>::Store
;
2445 table
[TTileMode
][B8G8R8A8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8A8_UNORM_SRGB
>::Store
;
2446 table
[TTileMode
][R10G10B10A2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_UNORM
>::StoreGeneric
;
2447 table
[TTileMode
][R10G10B10A2_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_UNORM_SRGB
>::StoreGeneric
;
2448 table
[TTileMode
][R10G10B10A2_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_UINT
>::StoreGeneric
;
2449 table
[TTileMode
][R8G8B8A8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_UNORM
>::Store
;
2450 table
[TTileMode
][R8G8B8A8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_UNORM_SRGB
>::Store
;
2451 table
[TTileMode
][R8G8B8A8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_SNORM
>::Store
;
2452 table
[TTileMode
][R8G8B8A8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_SINT
>::Store
;
2453 table
[TTileMode
][R8G8B8A8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_UINT
>::Store
;
2454 table
[TTileMode
][R16G16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_UNORM
>::Store
;
2455 table
[TTileMode
][R16G16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_SNORM
>::Store
;
2456 table
[TTileMode
][R16G16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_SINT
>::Store
;
2457 table
[TTileMode
][R16G16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_UINT
>::Store
;
2458 table
[TTileMode
][R16G16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_FLOAT
>::Store
;
2459 table
[TTileMode
][B10G10R10A2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_UNORM
>::StoreGeneric
;
2460 table
[TTileMode
][B10G10R10A2_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_UNORM_SRGB
>::StoreGeneric
;
2461 table
[TTileMode
][R11G11B10_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R11G11B10_FLOAT
>::StoreGeneric
;
2462 table
[TTileMode
][R10G10B10_FLOAT_A2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10_FLOAT_A2_UNORM
>::StoreGeneric
;
2463 table
[TTileMode
][R32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_SINT
>::Store
;
2464 table
[TTileMode
][R32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_UINT
>::Store
;
2465 table
[TTileMode
][R32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_FLOAT
>::Store
;
2466 table
[TTileMode
][R24_UNORM_X8_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R24_UNORM_X8_TYPELESS
>::StoreGeneric
;
2467 table
[TTileMode
][X24_TYPELESS_G8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, X24_TYPELESS_G8_UINT
>::StoreGeneric
;
2468 table
[TTileMode
][A32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, A32_FLOAT
>::Store
;
2469 table
[TTileMode
][B8G8R8X8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8X8_UNORM
>::Store
;
2470 table
[TTileMode
][B8G8R8X8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8X8_UNORM_SRGB
>::Store
;
2471 table
[TTileMode
][R8G8B8X8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8X8_UNORM
>::Store
;
2472 table
[TTileMode
][R8G8B8X8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8X8_UNORM_SRGB
>::Store
;
2475 template <SWR_TILE_MODE TTileMode
, size_t NumTileModesT
, size_t ArraySizeT
>
2476 void InitStoreTilesTableColor_Half2(
2477 PFN_STORE_TILES(&table
)[NumTileModesT
][ArraySizeT
])
2479 table
[TTileMode
][R9G9B9E5_SHAREDEXP
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R9G9B9E5_SHAREDEXP
>::StoreGeneric
;
2480 table
[TTileMode
][B10G10R10X2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10X2_UNORM
>::StoreGeneric
;
2481 table
[TTileMode
][R10G10B10X2_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10X2_USCALED
>::StoreGeneric
;
2482 table
[TTileMode
][R8G8B8A8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_SSCALED
>::Store
;
2483 table
[TTileMode
][R8G8B8A8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_USCALED
>::Store
;
2484 table
[TTileMode
][R16G16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_SSCALED
>::Store
;
2485 table
[TTileMode
][R16G16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_USCALED
>::Store
;
2486 table
[TTileMode
][R32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_SSCALED
>::Store
;
2487 table
[TTileMode
][R32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_USCALED
>::Store
;
2488 table
[TTileMode
][B5G6R5_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G6R5_UNORM
>::Store
;
2489 table
[TTileMode
][B5G6R5_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G6R5_UNORM_SRGB
>::StoreGeneric
;
2490 table
[TTileMode
][B5G5R5A1_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5A1_UNORM
>::StoreGeneric
;
2491 table
[TTileMode
][B5G5R5A1_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5A1_UNORM_SRGB
>::StoreGeneric
;
2492 table
[TTileMode
][B4G4R4A4_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B4G4R4A4_UNORM
>::StoreGeneric
;
2493 table
[TTileMode
][B4G4R4A4_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B4G4R4A4_UNORM_SRGB
>::StoreGeneric
;
2494 table
[TTileMode
][R8G8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_UNORM
>::Store
;
2495 table
[TTileMode
][R8G8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_SNORM
>::Store
;
2496 table
[TTileMode
][R8G8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_SINT
>::Store
;
2497 table
[TTileMode
][R8G8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_UINT
>::Store
;
2498 table
[TTileMode
][R16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_UNORM
>::Store
;
2499 table
[TTileMode
][R16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_SNORM
>::Store
;
2500 table
[TTileMode
][R16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_SINT
>::Store
;
2501 table
[TTileMode
][R16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_UINT
>::Store
;
2502 table
[TTileMode
][R16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_FLOAT
>::Store
;
2503 table
[TTileMode
][A16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A16_UNORM
>::Store
;
2504 table
[TTileMode
][A16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A16_FLOAT
>::Store
;
2505 table
[TTileMode
][B5G5R5X1_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5X1_UNORM
>::StoreGeneric
;
2506 table
[TTileMode
][B5G5R5X1_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5X1_UNORM_SRGB
>::StoreGeneric
;
2507 table
[TTileMode
][R8G8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_SSCALED
>::Store
;
2508 table
[TTileMode
][R8G8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_USCALED
>::Store
;
2509 table
[TTileMode
][R16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_SSCALED
>::Store
;
2510 table
[TTileMode
][R16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_USCALED
>::Store
;
2511 table
[TTileMode
][A1B5G5R5_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A1B5G5R5_UNORM
>::StoreGeneric
;
2512 table
[TTileMode
][A4B4G4R4_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A4B4G4R4_UNORM
>::StoreGeneric
;
2513 table
[TTileMode
][R8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_UNORM
>::Store
;
2514 table
[TTileMode
][R8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_SNORM
>::Store
;
2515 table
[TTileMode
][R8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_SINT
>::Store
;
2516 table
[TTileMode
][R8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_UINT
>::Store
;
2517 table
[TTileMode
][A8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, A8_UNORM
>::Store
;
2518 table
[TTileMode
][R8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_SSCALED
>::Store
;
2519 table
[TTileMode
][R8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_USCALED
>::Store
;
2520 table
[TTileMode
][R8G8B8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_UNORM
>::Store
;
2521 table
[TTileMode
][R8G8B8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_SNORM
>::Store
;
2522 table
[TTileMode
][R8G8B8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_SSCALED
>::Store
;
2523 table
[TTileMode
][R8G8B8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_USCALED
>::Store
;
2524 table
[TTileMode
][R16G16B16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_FLOAT
>::Store
;
2525 table
[TTileMode
][R16G16B16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_UNORM
>::Store
;
2526 table
[TTileMode
][R16G16B16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_SNORM
>::Store
;
2527 table
[TTileMode
][R16G16B16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_SSCALED
>::Store
;
2528 table
[TTileMode
][R16G16B16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_USCALED
>::Store
;
2529 table
[TTileMode
][R8G8B8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_UNORM_SRGB
>::Store
;
2530 table
[TTileMode
][R16G16B16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_UINT
>::Store
;
2531 table
[TTileMode
][R16G16B16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_SINT
>::Store
;
2532 table
[TTileMode
][R10G10B10A2_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_SNORM
>::StoreGeneric
;
2533 table
[TTileMode
][R10G10B10A2_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_USCALED
>::StoreGeneric
;
2534 table
[TTileMode
][R10G10B10A2_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_SSCALED
>::StoreGeneric
;
2535 table
[TTileMode
][R10G10B10A2_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_SINT
>::StoreGeneric
;
2536 table
[TTileMode
][B10G10R10A2_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_SNORM
>::StoreGeneric
;
2537 table
[TTileMode
][B10G10R10A2_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_USCALED
>::StoreGeneric
;
2538 table
[TTileMode
][B10G10R10A2_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_SSCALED
>::StoreGeneric
;
2539 table
[TTileMode
][B10G10R10A2_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_UINT
>::StoreGeneric
;
2540 table
[TTileMode
][B10G10R10A2_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_SINT
>::StoreGeneric
;
2541 table
[TTileMode
][R8G8B8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_UINT
>::Store
;
2542 table
[TTileMode
][R8G8B8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_SINT
>::Store
;
2545 //////////////////////////////////////////////////////////////////////////
2546 /// InitStoreTilesTableDepth - Helper for setting up the depth store-tile tables.
2547 template <SWR_TILE_MODE TTileMode
, size_t NumTileModes
, size_t ArraySizeT
>
2548 void InitStoreTilesTableDepth(
2549 PFN_STORE_TILES(&table
)[NumTileModes
][ArraySizeT
])
2551 table
[TTileMode
][R32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32_FLOAT
, R32_FLOAT
>::Store
;
2552 table
[TTileMode
][R32_FLOAT_X8X24_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32_FLOAT
, R32_FLOAT_X8X24_TYPELESS
>::Store
;
2553 table
[TTileMode
][R24_UNORM_X8_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32_FLOAT
, R24_UNORM_X8_TYPELESS
>::Store
;
2554 table
[TTileMode
][R16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32_FLOAT
, R16_UNORM
>::Store
;
2557 template <SWR_TILE_MODE TTileMode
, size_t NumTileModes
, size_t ArraySizeT
>
2558 void InitStoreTilesTableStencil(
2559 PFN_STORE_TILES(&table
)[NumTileModes
][ArraySizeT
])
2561 table
[TTileMode
][R8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R8_UINT
, R8_UINT
>::Store
;