1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Functionality for Store.
27 ******************************************************************************/
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "core/multisample.h"
// Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
// NOTE(review): the parameters are unnamed in this declaration; what the byte pointer, the
// surface state and the three 32-bit values mean must be confirmed against the store functions.
typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);

//////////////////////////////////////////////////////////////////////////
/// Store Raster Tile Function Tables.
/// One table each for color, depth and stencil, dimensioned
/// [SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS].  Only declared here (extern);
/// the definitions are outside this section.
//////////////////////////////////////////////////////////////////////////
extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];

// Table initialization entry points, defined elsewhere.
// NOTE(review): presumably each one fills the table entries for the tile mode in its name and
// InitStoreTilesTable() drives the others -- confirm against the definitions.
void InitStoreTilesTable_Linear_1();
void InitStoreTilesTable_Linear_2();
void InitStoreTilesTable_TileX_1();
void InitStoreTilesTable_TileX_2();
void InitStoreTilesTable_TileY_1();
void InitStoreTilesTable_TileY_2();
void InitStoreTilesTable_TileW();
void InitStoreTilesTable();
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for storing one SIMD raster tile to destination
///        rows.  Store() is deleted here, so only the explicit
///        specializations are usable; any other (PixelSize, NumDests)
///        combination is rejected at compile time.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of destination pointers.  Each pointer is
///        to a single row of at most 16B.
/// @tparam PixelSize - Pixel size in bits (callers pass FormatTraits<>::bpp).
/// @tparam NumDests - Number of destination pointers.  Each pair of
///        pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
77 //////////////////////////////////////////////////////////////////////////
78 /// StorePixels (32-bit pixel specialization)
79 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
80 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
81 /// @param ppDsts - Array of destination pointers. Each pointer is
82 /// to a single row of at most 16B.
83 /// @tparam NumDests - Number of destination pointers. Each pair of
84 /// pointers is for a 16-byte column of two rows.
85 //////////////////////////////////////////////////////////////////////////
87 struct StorePixels
<8, 2>
89 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[2])
91 // Each 4-pixel row is 4 bytes.
92 const uint16_t* pPixSrc
= (const uint16_t*)pSrc
;
94 // Unswizzle from SWR-Z order
95 uint16_t* pRow
= (uint16_t*)ppDsts
[0];
99 pRow
= (uint16_t*)ppDsts
[1];
100 pRow
[0] = pPixSrc
[1];
101 pRow
[1] = pPixSrc
[3];
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (8-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel raster tile through four destination pointers.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of four destination pointers; each receives two
///        16-bit pixel pairs.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<8, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 8 x 2 bytes = 16 bytes, 16 pixels
        const uint16_t* pPairs = reinterpret_cast<const uint16_t*>(pSrc);
        uint16_t**      ppRows = reinterpret_cast<uint16_t**>(ppDsts);

        // Unswizzle from SWR-Z order.  Each iteration handles four source
        // pairs and the two destination pointers that receive them.
        for (uint32_t half = 0; half < 2; ++half)
        {
            const uint16_t* pQuad = pPairs + 4 * half;
            uint16_t*       pEven = ppRows[2 * half + 0];
            uint16_t*       pOdd  = ppRows[2 * half + 1];

            pEven[0] = pQuad[0]; // 0 1   /  8 9
            pEven[1] = pQuad[2]; // 4 5   /  C D

            pOdd[0] = pQuad[1];  // 2 3   /  A B
            pOdd[1] = pQuad[3];  // 6 7   /  E F
        }
    }
};
#endif
132 //////////////////////////////////////////////////////////////////////////
133 /// StorePixels (32-bit pixel specialization)
134 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
135 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
136 /// @param ppDsts - Array of destination pointers. Each pointer is
137 /// to a single row of at most 16B.
138 /// @tparam NumDests - Number of destination pointers. Each pair of
139 /// pointers is for a 16-byte column of two rows.
140 //////////////////////////////////////////////////////////////////////////
142 struct StorePixels
<16, 2>
144 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[2])
146 // Each 4-pixel row is 8 bytes.
147 const uint32_t* pPixSrc
= (const uint32_t*)pSrc
;
149 // Unswizzle from SWR-Z order
150 uint32_t* pRow
= (uint32_t*)ppDsts
[0];
151 pRow
[0] = pPixSrc
[0];
152 pRow
[1] = pPixSrc
[2];
154 pRow
= (uint32_t*)ppDsts
[1];
155 pRow
[0] = pPixSrc
[1];
156 pRow
[1] = pPixSrc
[3];
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (16-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel raster tile through four destination pointers.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of four destination pointers; each receives two
///        32-bit pixel pairs.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<16, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 8 x 4 bytes = 32 bytes, 16 pixels
        const uint32_t* pPairs = reinterpret_cast<const uint32_t*>(pSrc);
        uint32_t**      ppRows = reinterpret_cast<uint32_t**>(ppDsts);

        // Unswizzle from SWR-Z order.  Each iteration handles four source
        // pairs and the two destination pointers that receive them.
        for (uint32_t half = 0; half < 2; ++half)
        {
            const uint32_t* pQuad = pPairs + 4 * half;
            uint32_t*       pEven = ppRows[2 * half + 0];
            uint32_t*       pOdd  = ppRows[2 * half + 1];

            pEven[0] = pQuad[0]; // 0 1   /  8 9
            pEven[1] = pQuad[2]; // 4 5   /  C D

            pOdd[0] = pQuad[1];  // 2 3   /  A B
            pOdd[1] = pQuad[3];  // 6 7   /  E F
        }
    }
};
#endif
187 //////////////////////////////////////////////////////////////////////////
188 /// StorePixels (32-bit pixel specialization)
189 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
190 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
191 /// @param ppDsts - Array of destination pointers. Each pointer is
192 /// to a single row of at most 16B.
193 /// @tparam NumDests - Number of destination pointers. Each pair of
194 /// pointers is for a 16-byte column of two rows.
195 //////////////////////////////////////////////////////////////////////////
197 struct StorePixels
<32, 2>
199 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[2])
201 // Each 4-pixel row is 16-bytes
202 __m128i
*pZRow01
= (__m128i
*)pSrc
;
203 __m128i vQuad00
= _mm_load_si128(pZRow01
);
204 __m128i vQuad01
= _mm_load_si128(pZRow01
+ 1);
206 __m128i vRow00
= _mm_unpacklo_epi64(vQuad00
, vQuad01
);
207 __m128i vRow10
= _mm_unpackhi_epi64(vQuad00
, vQuad01
);
209 _mm_storeu_si128((__m128i
*)ppDsts
[0], vRow00
);
210 _mm_storeu_si128((__m128i
*)ppDsts
[1], vRow10
);
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel raster tile through four destination pointers.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order.
///        Read with aligned 128-bit loads.
/// @param ppDsts - Array of four destination pointers, each receiving
///        16 bytes via an unaligned store.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<32, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 4 x 16 bytes = 64 bytes, 16 pixels
        const __m128i* pQuads = reinterpret_cast<const __m128i*>(pSrc);
        __m128i**      ppRows = reinterpret_cast<__m128i**>(ppDsts);

        // Load the whole tile first ...
        const __m128i q0 = _mm_load_si128(pQuads + 0); // 0 1 2 3
        const __m128i q1 = _mm_load_si128(pQuads + 1); // 4 5 6 7
        const __m128i q2 = _mm_load_si128(pQuads + 2); // 8 9 A B
        const __m128i q3 = _mm_load_si128(pQuads + 3); // C D E F

        // ... then unswizzle from SWR-Z order while storing.
        const __m128i out0 = _mm_unpacklo_epi64(q0, q1); // 0 1 4 5
        const __m128i out1 = _mm_unpackhi_epi64(q0, q1); // 2 3 6 7
        const __m128i out2 = _mm_unpacklo_epi64(q2, q3); // 8 9 C D
        const __m128i out3 = _mm_unpackhi_epi64(q2, q3); // A B E F

        _mm_storeu_si128(ppRows[0], out0);
        _mm_storeu_si128(ppRows[1], out1);
        _mm_storeu_si128(ppRows[2], out2);
        _mm_storeu_si128(ppRows[3], out3);
    }
};
#endif
239 //////////////////////////////////////////////////////////////////////////
240 /// StorePixels (32-bit pixel specialization)
241 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
242 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
243 /// @param ppDsts - Array of destination pointers. Each pointer is
244 /// to a single row of at most 16B.
245 /// @tparam NumDests - Number of destination pointers. Each pair of
246 /// pointers is for a 16-byte column of two rows.
247 //////////////////////////////////////////////////////////////////////////
249 struct StorePixels
<64, 4>
251 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[4])
253 // Each 4-pixel row is 32 bytes.
254 const __m128i
* pPixSrc
= (const __m128i
*)pSrc
;
256 // order of pointers match SWR-Z layout
257 __m128i
** pvDsts
= (__m128i
**)&ppDsts
[0];
258 *pvDsts
[0] = pPixSrc
[0];
259 *pvDsts
[1] = pPixSrc
[1];
260 *pvDsts
[2] = pPixSrc
[2];
261 *pvDsts
[3] = pPixSrc
[3];
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (64-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel raster tile through eight destination pointers.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order.
///        Must be 16-byte aligned (read with aligned loads).
/// @param ppDsts - Array of eight destination pointers, each receiving
///        16 bytes.  No alignment is required: unaligned stores are used,
///        matching the 32-bit specialization.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<64, 8>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    {
        // 8 x 16 bytes = 128 bytes, 16 pixels
        const __m128i* pSrc128 = reinterpret_cast<const __m128i*>(pSrc);

        // order of pointers match SWR-Z layout: element i (pixels 2i, 2i+1)
        // goes to destination i unchanged.
        for (uint32_t i = 0; i < 8; ++i)
        {
            _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[i]), _mm_load_si128(&pSrc128[i]));
        }
    }
};
#endif
289 //////////////////////////////////////////////////////////////////////////
290 /// StorePixels (32-bit pixel specialization)
291 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
292 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
293 /// @param ppDsts - Array of destination pointers. Each pointer is
294 /// to a single row of at most 16B.
295 /// @tparam NumDests - Number of destination pointers. Each pair of
296 /// pointers is for a 16-byte column of two rows.
297 //////////////////////////////////////////////////////////////////////////
299 struct StorePixels
<128, 8>
301 static void Store(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[8])
303 // Each 4-pixel row is 64 bytes.
304 const __m128i
* pPixSrc
= (const __m128i
*)pSrc
;
306 // Unswizzle from SWR-Z order
307 __m128i
** pvDsts
= (__m128i
**)&ppDsts
[0];
308 *pvDsts
[0] = pPixSrc
[0];
309 *pvDsts
[1] = pPixSrc
[2];
310 *pvDsts
[2] = pPixSrc
[1];
311 *pvDsts
[3] = pPixSrc
[3];
312 *pvDsts
[4] = pPixSrc
[4];
313 *pvDsts
[5] = pPixSrc
[6];
314 *pvDsts
[6] = pPixSrc
[5];
315 *pvDsts
[7] = pPixSrc
[7];
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (128-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel raster tile through sixteen destination pointers.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order.
///        Must be 16-byte aligned (read with aligned loads).
/// @param ppDsts - Array of sixteen destination pointers, each receiving
///        one 16-byte pixel.  No alignment is required: unaligned stores
///        are used, matching the 32-bit specialization.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<128, 16>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
    {
        // 16 x 16 bytes = 256 bytes, 16 pixels
        const __m128i* pSrc128 = reinterpret_cast<const __m128i*>(pSrc);

        // Within every group of four pixels the two middle ones trade
        // places (0 2 1 3).
        for (uint32_t i = 0; i < 16; i += 4)
        {
            _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[i + 0]), _mm_load_si128(&pSrc128[i + 0]));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[i + 1]), _mm_load_si128(&pSrc128[i + 2]));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[i + 2]), _mm_load_si128(&pSrc128[i + 1]));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[i + 3]), _mm_load_si128(&pSrc128[i + 3]));
        }
    }
};
#endif
341 //////////////////////////////////////////////////////////////////////////
342 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
343 //////////////////////////////////////////////////////////////////////////
344 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
345 struct ConvertPixelsSOAtoAOS
347 //////////////////////////////////////////////////////////////////////////
348 /// @brief Converts a SIMD from the Hot Tile to the destination format
349 /// and converts from SOA to AOS.
350 /// @param pSrc - Pointer to raster tile.
351 /// @param pDst - Pointer to destination surface or deswizzling buffer.
352 template <size_t NumDests
>
353 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
355 #if USE_8x2_TILE_BACKEND
356 static const uint32_t MAX_RASTER_TILE_BYTES
= 16 * 16; // 16 pixels * 16 bytes per pixel
358 OSALIGNSIMD16(uint8_t) soaTile
[MAX_RASTER_TILE_BYTES
];
359 OSALIGNSIMD16(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
361 // Convert from SrcFormat --> DstFormat
363 LoadSOA
<SrcFormat
>(pSrc
, src
);
364 StoreSOA
<DstFormat
>(src
, soaTile
);
366 // Convert from SOA --> AOS
367 FormatTraits
<DstFormat
>::TransposeT::Transpose_16(soaTile
, aosTile
);
370 static const uint32_t MAX_RASTER_TILE_BYTES
= 128; // 8 pixels * 16 bytes per pixel
372 OSALIGNSIMD(uint8_t) soaTile
[MAX_RASTER_TILE_BYTES
];
373 OSALIGNSIMD(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
375 // Convert from SrcFormat --> DstFormat
377 LoadSOA
<SrcFormat
>(pSrc
, src
);
378 StoreSOA
<DstFormat
>(src
, soaTile
);
380 // Convert from SOA --> AOS
381 FormatTraits
<DstFormat
>::TransposeT::Transpose(soaTile
, aosTile
);
384 // Store data into destination
385 StorePixels
<FormatTraits
<DstFormat
>::bpp
, NumDests
>::Store(aosTile
, ppDsts
);
389 //////////////////////////////////////////////////////////////////////////
390 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
391 /// Specialization for no format conversion
392 //////////////////////////////////////////////////////////////////////////
393 template<SWR_FORMAT Format
>
394 struct ConvertPixelsSOAtoAOS
<Format
, Format
>
396 //////////////////////////////////////////////////////////////////////////
397 /// @brief Converts a SIMD from the Hot Tile to the destination format
398 /// and converts from SOA to AOS.
399 /// @param pSrc - Pointer to raster tile.
400 /// @param pDst - Pointer to destination surface or deswizzling buffer.
401 template <size_t NumDests
>
402 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
404 #if USE_8x2_TILE_BACKEND
405 static const uint32_t MAX_RASTER_TILE_BYTES
= 16 * 16; // 16 pixels * 16 bytes per pixel
407 OSALIGNSIMD16(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
409 // Convert from SOA --> AOS
410 FormatTraits
<Format
>::TransposeT::Transpose_16(pSrc
, aosTile
);
413 static const uint32_t MAX_RASTER_TILE_BYTES
= 128; // 8 pixels * 16 bytes per pixel
415 OSALIGNSIMD(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
417 // Convert from SOA --> AOS
418 FormatTraits
<Format
>::TransposeT::Transpose(pSrc
, aosTile
);
421 // Store data into destination
422 StorePixels
<FormatTraits
<Format
>::bpp
, NumDests
>::Store(aosTile
, ppDsts
);
426 //////////////////////////////////////////////////////////////////////////
427 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
428 //////////////////////////////////////////////////////////////////////////
430 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, B5G6R5_UNORM
>
432 //////////////////////////////////////////////////////////////////////////
433 /// @brief Converts a SIMD from the Hot Tile to the destination format
434 /// and converts from SOA to AOS.
435 /// @param pSrc - Pointer to raster tile.
436 /// @param pDst - Pointer to destination surface or deswizzling buffer.
437 template <size_t NumDests
>
438 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
440 #if USE_8x2_TILE_BACKEND
441 static const SWR_FORMAT SrcFormat
= R32G32B32A32_FLOAT
;
442 static const SWR_FORMAT DstFormat
= B5G6R5_UNORM
;
444 static const uint32_t MAX_RASTER_TILE_BYTES
= 16 * 16; // 16 pixels * 16 bytes per pixel
446 OSALIGNSIMD16(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
449 simd16vector src
, dst
;
450 LoadSOA
<SrcFormat
>(pSrc
, src
);
453 dst
.x
= src
[FormatTraits
<DstFormat
>::swizzle(0)];
454 dst
.y
= src
[FormatTraits
<DstFormat
>::swizzle(1)];
455 dst
.z
= src
[FormatTraits
<DstFormat
>::swizzle(2)];
458 dst
.x
= Clamp
<DstFormat
>(dst
.x
, 0);
459 dst
.y
= Clamp
<DstFormat
>(dst
.y
, 1);
460 dst
.z
= Clamp
<DstFormat
>(dst
.z
, 2);
463 dst
.x
= Normalize
<DstFormat
>(dst
.x
, 0);
464 dst
.y
= Normalize
<DstFormat
>(dst
.y
, 1);
465 dst
.z
= Normalize
<DstFormat
>(dst
.z
, 2);
468 simd16scalari packed
= _simd16_castps_si(dst
.x
);
470 SWR_ASSERT(FormatTraits
<DstFormat
>::GetBPC(0) == 5);
471 SWR_ASSERT(FormatTraits
<DstFormat
>::GetBPC(1) == 6);
473 packed
= _simd16_or_si(packed
, _simd16_slli_epi32(_simd16_castps_si(dst
.y
), 5));
474 packed
= _simd16_or_si(packed
, _simd16_slli_epi32(_simd16_castps_si(dst
.z
), 5 + 6));
476 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
477 uint32_t *pPacked
= (uint32_t*)&packed
;
478 uint16_t *pAosTile
= (uint16_t*)&aosTile
[0];
479 for (uint32_t t
= 0; t
< KNOB_SIMD16_WIDTH
; ++t
)
481 *pAosTile
++ = *pPacked
++;
485 static const SWR_FORMAT SrcFormat
= R32G32B32A32_FLOAT
;
486 static const SWR_FORMAT DstFormat
= B5G6R5_UNORM
;
487 static const uint32_t MAX_RASTER_TILE_BYTES
= 128; // 8 pixels * 16 bytes per pixel
489 OSALIGNSIMD(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
493 LoadSOA
<SrcFormat
>(pSrc
, src
);
496 dst
.x
= src
[FormatTraits
<DstFormat
>::swizzle(0)];
497 dst
.y
= src
[FormatTraits
<DstFormat
>::swizzle(1)];
498 dst
.z
= src
[FormatTraits
<DstFormat
>::swizzle(2)];
501 dst
.x
= Clamp
<DstFormat
>(dst
.x
, 0);
502 dst
.y
= Clamp
<DstFormat
>(dst
.y
, 1);
503 dst
.z
= Clamp
<DstFormat
>(dst
.z
, 2);
506 dst
.x
= Normalize
<DstFormat
>(dst
.x
, 0);
507 dst
.y
= Normalize
<DstFormat
>(dst
.y
, 1);
508 dst
.z
= Normalize
<DstFormat
>(dst
.z
, 2);
511 simdscalari packed
= _simd_castps_si(dst
.x
);
512 packed
= _simd_or_si(packed
, _simd_slli_epi32(_simd_castps_si(dst
.y
), FormatTraits
<DstFormat
>::GetConstBPC(0)));
513 packed
= _simd_or_si(packed
, _simd_slli_epi32(_simd_castps_si(dst
.z
), FormatTraits
<DstFormat
>::GetConstBPC(0) +
514 FormatTraits
<DstFormat
>::GetConstBPC(1)));
516 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
517 uint32_t *pPacked
= (uint32_t*)&packed
;
518 uint16_t *pAosTile
= (uint16_t*)&aosTile
[0];
519 for (uint32_t t
= 0; t
< KNOB_SIMD_WIDTH
; ++t
)
521 *pAosTile
++ = *pPacked
++;
525 // Store data into destination
526 StorePixels
<FormatTraits
<DstFormat
>::bpp
, NumDests
>::Store(aosTile
, ppDsts
);
530 //////////////////////////////////////////////////////////////////////////
531 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
532 //////////////////////////////////////////////////////////////////////////
534 struct ConvertPixelsSOAtoAOS
<R32_FLOAT
, R24_UNORM_X8_TYPELESS
>
536 static const SWR_FORMAT SrcFormat
= R32_FLOAT
;
537 static const SWR_FORMAT DstFormat
= R24_UNORM_X8_TYPELESS
;
539 //////////////////////////////////////////////////////////////////////////
540 /// @brief Converts a SIMD from the Hot Tile to the destination format
541 /// and converts from SOA to AOS.
542 /// @param pSrc - Pointer to raster tile.
543 /// @param pDst - Pointer to destination surface or deswizzling buffer.
544 template <size_t NumDests
>
545 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
547 #if USE_8x2_TILE_BACKEND
548 simd16scalar comp
= _simd16_load_ps(reinterpret_cast<const float *>(pSrc
));
551 const simd16scalar zero
= _simd16_setzero_ps();
552 const simd16scalar ones
= _simd16_set1_ps(1.0f
);
554 comp
= _simd16_max_ps(comp
, zero
);
555 comp
= _simd16_min_ps(comp
, ones
);
558 comp
= _simd16_mul_ps(comp
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
560 simd16scalari temp
= _simd16_cvtps_epi32(comp
);
563 temp
= _simd16_permute_epi32(temp
, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
565 // merge/store data into destination but don't overwrite the X8 bits
566 simdscalari destlo
= _simd_loadu2_si(reinterpret_cast<__m128i
*>(ppDsts
[1]), reinterpret_cast<__m128i
*>(ppDsts
[0]));
567 simdscalari desthi
= _simd_loadu2_si(reinterpret_cast<__m128i
*>(ppDsts
[3]), reinterpret_cast<__m128i
*>(ppDsts
[2]));
569 simd16scalari dest
= _simd16_setzero_si();
571 dest
= _simd16_insert_si(dest
, destlo
, 0);
572 dest
= _simd16_insert_si(dest
, desthi
, 1);
574 simd16scalari mask
= _simd16_set1_epi32(0x00FFFFFF);
576 dest
= _simd16_or_si(_simd16_andnot_si(mask
, dest
), _simd16_and_si(mask
, temp
));
578 _simd_storeu2_si(reinterpret_cast<__m128i
*>(ppDsts
[1]), reinterpret_cast<__m128i
*>(ppDsts
[0]), _simd16_extract_si(dest
, 0));
579 _simd_storeu2_si(reinterpret_cast<__m128i
*>(ppDsts
[3]), reinterpret_cast<__m128i
*>(ppDsts
[2]), _simd16_extract_si(dest
, 1));
581 static const uint32_t MAX_RASTER_TILE_BYTES
= 128; // 8 pixels * 16 bytes per pixel
583 OSALIGNSIMD(uint8_t) soaTile
[MAX_RASTER_TILE_BYTES
];
584 OSALIGNSIMD(uint8_t) aosTile
[MAX_RASTER_TILE_BYTES
];
586 // Convert from SrcFormat --> DstFormat
588 LoadSOA
<SrcFormat
>(pSrc
, src
);
589 StoreSOA
<DstFormat
>(src
, soaTile
);
591 // Convert from SOA --> AOS
592 FormatTraits
<DstFormat
>::TransposeT::Transpose(soaTile
, aosTile
);
594 // Store data into destination but don't overwrite the X8 bits
595 // Each 4-pixel row is 16-bytes
596 __m128i
*pZRow01
= (__m128i
*)aosTile
;
597 __m128i vQuad00
= _mm_load_si128(pZRow01
);
598 __m128i vQuad01
= _mm_load_si128(pZRow01
+ 1);
600 __m128i vRow00
= _mm_unpacklo_epi64(vQuad00
, vQuad01
);
601 __m128i vRow10
= _mm_unpackhi_epi64(vQuad00
, vQuad01
);
603 __m128i vDst0
= _mm_loadu_si128((const __m128i
*)ppDsts
[0]);
604 __m128i vDst1
= _mm_loadu_si128((const __m128i
*)ppDsts
[1]);
606 __m128i vMask
= _mm_set1_epi32(0xFFFFFF);
608 vDst0
= _mm_andnot_si128(vMask
, vDst0
);
609 vDst0
= _mm_or_si128(vDst0
, _mm_and_si128(vRow00
, vMask
));
610 vDst1
= _mm_andnot_si128(vMask
, vDst1
);
611 vDst1
= _mm_or_si128(vDst1
, _mm_and_si128(vRow10
, vMask
));
613 _mm_storeu_si128((__m128i
*)ppDsts
[0], vDst0
);
614 _mm_storeu_si128((__m128i
*)ppDsts
[1], vDst1
);
619 #if USE_8x2_TILE_BACKEND
620 template<SWR_FORMAT DstFormat
>
621 INLINE
static void FlatConvert(const uint8_t* pSrc
, uint8_t* pDst0
, uint8_t* pDst1
, uint8_t* pDst2
, uint8_t* pDst3
)
623 // swizzle rgba -> bgra while we load
624 simd16scalar comp0
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(0) * sizeof(simd16scalar
))); // float32 rrrrrrrrrrrrrrrr
625 simd16scalar comp1
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(1) * sizeof(simd16scalar
))); // float32 gggggggggggggggg
626 simd16scalar comp2
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(2) * sizeof(simd16scalar
))); // float32 bbbbbbbbbbbbbbbb
627 simd16scalar comp3
= _simd16_load_ps(reinterpret_cast<const float*>(pSrc
+ FormatTraits
<DstFormat
>::swizzle(3) * sizeof(simd16scalar
))); // float32 aaaaaaaaaaaaaaaa
630 const simd16scalar zero
= _simd16_setzero_ps();
631 const simd16scalar ones
= _simd16_set1_ps(1.0f
);
633 comp0
= _simd16_max_ps(comp0
, zero
);
634 comp0
= _simd16_min_ps(comp0
, ones
);
636 comp1
= _simd16_max_ps(comp1
, zero
);
637 comp1
= _simd16_min_ps(comp1
, ones
);
639 comp2
= _simd16_max_ps(comp2
, zero
);
640 comp2
= _simd16_min_ps(comp2
, ones
);
642 comp3
= _simd16_max_ps(comp3
, zero
);
643 comp3
= _simd16_min_ps(comp3
, ones
);
645 // gamma-correct only rgb
646 if (FormatTraits
<DstFormat
>::isSRGB
)
648 comp0
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(0, comp0
);
649 comp1
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(1, comp1
);
650 comp2
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(2, comp2
);
653 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
654 comp0
= _simd16_mul_ps(comp0
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
655 comp1
= _simd16_mul_ps(comp1
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(1)));
656 comp2
= _simd16_mul_ps(comp2
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(2)));
657 comp3
= _simd16_mul_ps(comp3
, _simd16_set1_ps(FormatTraits
<DstFormat
>::fromFloat(3)));
659 // moving to 16 wide integer vector types
660 simd16scalari src0
= _simd16_cvtps_epi32(comp0
); // padded byte rrrrrrrrrrrrrrrr
661 simd16scalari src1
= _simd16_cvtps_epi32(comp1
); // padded byte gggggggggggggggg
662 simd16scalari src2
= _simd16_cvtps_epi32(comp2
); // padded byte bbbbbbbbbbbbbbbb
663 simd16scalari src3
= _simd16_cvtps_epi32(comp3
); // padded byte aaaaaaaaaaaaaaaa
665 // SOA to AOS conversion
666 src1
= _simd16_slli_epi32(src1
, 8);
667 src2
= _simd16_slli_epi32(src2
, 16);
668 src3
= _simd16_slli_epi32(src3
, 24);
670 simd16scalari final
= _simd16_or_si(_simd16_or_si(src0
, src1
), _simd16_or_si(src2
, src3
)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
672 // de-swizzle conversion
674 simd16scalari final0
= _simd16_permute2f128_si(final
, final
, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
675 simd16scalari final1
= _simd16_permute2f128_si(final
, final
, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
677 final
= _simd16_shuffle_epi64(final0
, final1
, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
680 final
= _simd16_permute_epi32(final
, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
683 // store 8x2 memory order:
684 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
685 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
686 _simd_storeu2_si(reinterpret_cast<__m128i
*>(pDst1
), reinterpret_cast<__m128i
*>(pDst0
), _simd16_extract_si(final
, 0));
687 _simd_storeu2_si(reinterpret_cast<__m128i
*>(pDst3
), reinterpret_cast<__m128i
*>(pDst2
), _simd16_extract_si(final
, 1));
691 template<SWR_FORMAT DstFormat
>
692 INLINE
static void FlatConvert(const uint8_t* pSrc
, uint8_t* pDst
, uint8_t* pDst1
)
694 static const uint32_t offset
= sizeof(simdscalar
);
696 // swizzle rgba -> bgra while we load
697 simdscalar vComp0
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(0))*offset
)); // float32 rrrrrrrr
698 simdscalar vComp1
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(1))*offset
)); // float32 gggggggg
699 simdscalar vComp2
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(2))*offset
)); // float32 bbbbbbbb
700 simdscalar vComp3
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(3))*offset
)); // float32 aaaaaaaa
703 vComp0
= _simd_max_ps(vComp0
, _simd_setzero_ps());
704 vComp0
= _simd_min_ps(vComp0
, _simd_set1_ps(1.0f
));
706 vComp1
= _simd_max_ps(vComp1
, _simd_setzero_ps());
707 vComp1
= _simd_min_ps(vComp1
, _simd_set1_ps(1.0f
));
709 vComp2
= _simd_max_ps(vComp2
, _simd_setzero_ps());
710 vComp2
= _simd_min_ps(vComp2
, _simd_set1_ps(1.0f
));
712 vComp3
= _simd_max_ps(vComp3
, _simd_setzero_ps());
713 vComp3
= _simd_min_ps(vComp3
, _simd_set1_ps(1.0f
));
715 if (FormatTraits
<DstFormat
>::isSRGB
)
717 // Gamma-correct only rgb
718 vComp0
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(0, vComp0
);
719 vComp1
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(1, vComp1
);
720 vComp2
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(2, vComp2
);
723 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
724 vComp0
= _simd_mul_ps(vComp0
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
725 vComp1
= _simd_mul_ps(vComp1
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(1)));
726 vComp2
= _simd_mul_ps(vComp2
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(2)));
727 vComp3
= _simd_mul_ps(vComp3
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(3)));
729 // moving to 8 wide integer vector types
730 simdscalari src0
= _simd_cvtps_epi32(vComp0
); // padded byte rrrrrrrr
731 simdscalari src1
= _simd_cvtps_epi32(vComp1
); // padded byte gggggggg
732 simdscalari src2
= _simd_cvtps_epi32(vComp2
); // padded byte bbbbbbbb
733 simdscalari src3
= _simd_cvtps_epi32(vComp3
); // padded byte aaaaaaaa
735 #if KNOB_ARCH <= KNOB_ARCH_AVX
737 // splitting into two sets of 4 wide integer vector types
738 // because AVX doesn't have instructions to support this operation at 8 wide
739 __m128i srcLo0
= _mm256_castsi256_si128(src0
); // 000r000r000r000r
740 __m128i srcLo1
= _mm256_castsi256_si128(src1
); // 000g000g000g000g
741 __m128i srcLo2
= _mm256_castsi256_si128(src2
); // 000b000b000b000b
742 __m128i srcLo3
= _mm256_castsi256_si128(src3
); // 000a000a000a000a
744 __m128i srcHi0
= _mm256_extractf128_si256(src0
, 1); // 000r000r000r000r
745 __m128i srcHi1
= _mm256_extractf128_si256(src1
, 1); // 000g000g000g000g
746 __m128i srcHi2
= _mm256_extractf128_si256(src2
, 1); // 000b000b000b000b
747 __m128i srcHi3
= _mm256_extractf128_si256(src3
, 1); // 000a000a000a000a
749 srcLo1
= _mm_slli_si128(srcLo1
, 1); // 00g000g000g000g0
750 srcHi1
= _mm_slli_si128(srcHi1
, 1); // 00g000g000g000g0
751 srcLo2
= _mm_slli_si128(srcLo2
, 2); // 0b000b000b000b00
752 srcHi2
= _mm_slli_si128(srcHi2
, 2); // 0b000b000b000b00
753 srcLo3
= _mm_slli_si128(srcLo3
, 3); // a000a000a000a000
754 srcHi3
= _mm_slli_si128(srcHi3
, 3); // a000a000a000a000
756 srcLo0
= _mm_or_si128(srcLo0
, srcLo1
); // 00gr00gr00gr00gr
757 srcLo2
= _mm_or_si128(srcLo2
, srcLo3
); // ab00ab00ab00ab00
759 srcHi0
= _mm_or_si128(srcHi0
, srcHi1
); // 00gr00gr00gr00gr
760 srcHi2
= _mm_or_si128(srcHi2
, srcHi3
); // ab00ab00ab00ab00
762 srcLo0
= _mm_or_si128(srcLo0
, srcLo2
); // abgrabgrabgrabgr
763 srcHi0
= _mm_or_si128(srcHi0
, srcHi2
); // abgrabgrabgrabgr
765 // unpack into rows that get the tiling order correct
766 __m128i vRow00
= _mm_unpacklo_epi64(srcLo0
, srcHi0
); // abgrabgrabgrabgrabgrabgrabgrabgr
767 __m128i vRow10
= _mm_unpackhi_epi64(srcLo0
, srcHi0
);
769 simdscalari final
= _mm256_castsi128_si256(vRow00
);
770 final
= _mm256_insertf128_si256(final
, vRow10
, 1);
774 // logic is as above, only wider
775 src1
= _mm256_slli_si256(src1
, 1);
776 src2
= _mm256_slli_si256(src2
, 2);
777 src3
= _mm256_slli_si256(src3
, 3);
779 src0
= _mm256_or_si256(src0
, src1
);
780 src2
= _mm256_or_si256(src2
, src3
);
782 simdscalari final
= _mm256_or_si256(src0
, src2
);
784 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
785 final
= _mm256_permute4x64_epi64(final
, 0xD8);
788 _simd_storeu2_si((__m128i
*)pDst1
, (__m128i
*)pDst
, final
);
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief Converts one SIMD16 batch of SOA float color into packed 32-bit
///        AOS pixels that carry no alpha, and stores it as an 8x2 tile.
/// @param pSrc  - SOA source; component planes are sizeof(simd16scalar) bytes
///                apart and are picked via FormatTraits<DstFormat>::swizzle().
/// @param pDst0 - row 0, first 128-bit span
/// @param pDst1 - row 1, first 128-bit span
/// @param pDst2 - row 0, second 128-bit span
/// @param pDst3 - row 1, second 128-bit span
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
{
    // swizzle rgba -> bgra while we load
    simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
    simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
    simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb

    // clamp every component to [0.0f, 1.0f]
    const simd16scalar zero = _simd16_setzero_ps();
    const simd16scalar ones = _simd16_set1_ps(1.0f);

    comp0 = _simd16_max_ps(comp0, zero);
    comp0 = _simd16_min_ps(comp0, ones);

    comp1 = _simd16_max_ps(comp1, zero);
    comp1 = _simd16_min_ps(comp1, ones);

    comp2 = _simd16_max_ps(comp2, zero);
    comp2 = _simd16_min_ps(comp2, ones);

    // gamma-correct only rgb
    if (FormatTraits<DstFormat>::isSRGB)
    {
        comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
        comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
        comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
    }

    // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
    comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));

    // moving to 16 wide integer vector types
    simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
    simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
    simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb

    // SOA to AOS conversion: shift g and b into their byte lanes and merge
    src1 = _simd16_slli_epi32(src1, 8);
    src2 = _simd16_slli_epi32(src2, 16);

    simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2);                       // 0 1 2 3 4 5 6 7 8 9 A B C D E F

    // de-swizzle conversion
    // The two variants below produce the same lane order; exactly one may run.
#if 1
    simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
    simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F

    final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F

#else
    final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));

#endif
    // store 8x2 memory order:
    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
}

#endif
856 template<SWR_FORMAT DstFormat
>
857 INLINE
static void FlatConvertNoAlpha(const uint8_t* pSrc
, uint8_t* pDst
, uint8_t* pDst1
)
859 static const uint32_t offset
= sizeof(simdscalar
);
861 // swizzle rgba -> bgra while we load
862 simdscalar vComp0
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(0))*offset
)); // float32 rrrrrrrr
863 simdscalar vComp1
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(1))*offset
)); // float32 gggggggg
864 simdscalar vComp2
= _simd_load_ps((const float*)(pSrc
+ (FormatTraits
<DstFormat
>::swizzle(2))*offset
)); // float32 bbbbbbbb
866 vComp0
= _simd_max_ps(vComp0
, _simd_setzero_ps());
867 vComp0
= _simd_min_ps(vComp0
, _simd_set1_ps(1.0f
));
869 vComp1
= _simd_max_ps(vComp1
, _simd_setzero_ps());
870 vComp1
= _simd_min_ps(vComp1
, _simd_set1_ps(1.0f
));
872 vComp2
= _simd_max_ps(vComp2
, _simd_setzero_ps());
873 vComp2
= _simd_min_ps(vComp2
, _simd_set1_ps(1.0f
));
875 if (FormatTraits
<DstFormat
>::isSRGB
)
877 // Gamma-correct only rgb
878 vComp0
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(0, vComp0
);
879 vComp1
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(1, vComp1
);
880 vComp2
= FormatTraits
<R32G32B32A32_FLOAT
>::convertSrgb(2, vComp2
);
883 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
884 vComp0
= _simd_mul_ps(vComp0
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(0)));
885 vComp1
= _simd_mul_ps(vComp1
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(1)));
886 vComp2
= _simd_mul_ps(vComp2
, _simd_set1_ps(FormatTraits
<DstFormat
>::fromFloat(2)));
888 // moving to 8 wide integer vector types
889 simdscalari src0
= _simd_cvtps_epi32(vComp0
); // padded byte rrrrrrrr
890 simdscalari src1
= _simd_cvtps_epi32(vComp1
); // padded byte gggggggg
891 simdscalari src2
= _simd_cvtps_epi32(vComp2
); // padded byte bbbbbbbb
893 #if KNOB_ARCH <= KNOB_ARCH_AVX
895 // splitting into two sets of 4 wide integer vector types
896 // because AVX doesn't have instructions to support this operation at 8 wide
897 __m128i srcLo0
= _mm256_castsi256_si128(src0
); // 000r000r000r000r
898 __m128i srcLo1
= _mm256_castsi256_si128(src1
); // 000g000g000g000g
899 __m128i srcLo2
= _mm256_castsi256_si128(src2
); // 000b000b000b000b
901 __m128i srcHi0
= _mm256_extractf128_si256(src0
, 1); // 000r000r000r000r
902 __m128i srcHi1
= _mm256_extractf128_si256(src1
, 1); // 000g000g000g000g
903 __m128i srcHi2
= _mm256_extractf128_si256(src2
, 1); // 000b000b000b000b
905 srcLo1
= _mm_slli_si128(srcLo1
, 1); // 00g000g000g000g0
906 srcHi1
= _mm_slli_si128(srcHi1
, 1); // 00g000g000g000g0
907 srcLo2
= _mm_slli_si128(srcLo2
, 2); // 0b000b000b000b00
908 srcHi2
= _mm_slli_si128(srcHi2
, 2); // 0b000b000b000b00
910 srcLo0
= _mm_or_si128(srcLo0
, srcLo1
); // 00gr00gr00gr00gr
912 srcHi0
= _mm_or_si128(srcHi0
, srcHi1
); // 00gr00gr00gr00gr
914 srcLo0
= _mm_or_si128(srcLo0
, srcLo2
); // 0bgr0bgr0bgr0bgr
915 srcHi0
= _mm_or_si128(srcHi0
, srcHi2
); // 0bgr0bgr0bgr0bgr
917 // unpack into rows that get the tiling order correct
918 __m128i vRow00
= _mm_unpacklo_epi64(srcLo0
, srcHi0
); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
919 __m128i vRow10
= _mm_unpackhi_epi64(srcLo0
, srcHi0
);
921 simdscalari final
= _mm256_castsi128_si256(vRow00
);
922 final
= _mm256_insertf128_si256(final
, vRow10
, 1);
926 // logic is as above, only wider
927 src1
= _mm256_slli_si256(src1
, 1);
928 src2
= _mm256_slli_si256(src2
, 2);
930 src0
= _mm256_or_si256(src0
, src1
);
932 simdscalari final
= _mm256_or_si256(src0
, src2
);
934 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
935 final
= _mm256_permute4x64_epi64(final
, 0xD8);
939 _simd_storeu2_si((__m128i
*)pDst1
, (__m128i
*)pDst
, final
);
943 struct ConvertPixelsSOAtoAOS
<R32G32B32A32_FLOAT
, B8G8R8A8_UNORM
>
945 template <size_t NumDests
>
946 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
948 #if USE_8x2_TILE_BACKEND
949 FlatConvert
<B8G8R8A8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
951 FlatConvert
<B8G8R8A8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1]);
957 struct ConvertPixelsSOAtoAOS
<R32G32B32A32_FLOAT
, B8G8R8X8_UNORM
>
959 template <size_t NumDests
>
960 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
962 #if USE_8x2_TILE_BACKEND
963 FlatConvertNoAlpha
<B8G8R8X8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
965 FlatConvertNoAlpha
<B8G8R8X8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1]);
971 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, B8G8R8A8_UNORM_SRGB
>
973 template <size_t NumDests
>
974 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
976 #if USE_8x2_TILE_BACKEND
977 FlatConvert
<B8G8R8A8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
979 FlatConvert
<B8G8R8A8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1]);
985 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, B8G8R8X8_UNORM_SRGB
>
987 template <size_t NumDests
>
988 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
990 #if USE_8x2_TILE_BACKEND
991 FlatConvertNoAlpha
<B8G8R8X8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
993 FlatConvertNoAlpha
<B8G8R8X8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1]);
999 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8A8_UNORM
>
1001 template <size_t NumDests
>
1002 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
1004 #if USE_8x2_TILE_BACKEND
1005 FlatConvert
<R8G8B8A8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
1007 FlatConvert
<R8G8B8A8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1]);
1013 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8X8_UNORM
>
1015 template <size_t NumDests
>
1016 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
1018 #if USE_8x2_TILE_BACKEND
1019 FlatConvertNoAlpha
<R8G8B8X8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
1021 FlatConvertNoAlpha
<R8G8B8X8_UNORM
>(pSrc
, ppDsts
[0], ppDsts
[1]);
1027 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8A8_UNORM_SRGB
>
1029 template <size_t NumDests
>
1030 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
1032 #if USE_8x2_TILE_BACKEND
1033 FlatConvert
<R8G8B8A8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
1035 FlatConvert
<R8G8B8A8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1]);
1041 struct ConvertPixelsSOAtoAOS
< R32G32B32A32_FLOAT
, R8G8B8X8_UNORM_SRGB
>
1043 template <size_t NumDests
>
1044 INLINE
static void Convert(const uint8_t* pSrc
, uint8_t* (&ppDsts
)[NumDests
])
1046 #if USE_8x2_TILE_BACKEND
1047 FlatConvertNoAlpha
<R8G8B8X8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1], ppDsts
[2], ppDsts
[3]);
1049 FlatConvertNoAlpha
<R8G8B8X8_UNORM_SRGB
>(pSrc
, ppDsts
[0], ppDsts
[1]);
1054 //////////////////////////////////////////////////////////////////////////
1056 //////////////////////////////////////////////////////////////////////////
1057 template<typename TTraits
, SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1058 struct StoreRasterTile
1060 //////////////////////////////////////////////////////////////////////////
1061 /// @brief Retrieve color from hot tile source which is always float.
1062 /// @param pSrc - Pointer to raster tile.
1063 /// @param x, y - Coordinates to raster tile.
1064 /// @param output - output color
1065 INLINE
static void GetSwizzledSrcColor(
1067 uint32_t x
, uint32_t y
,
1068 float outputColor
[4])
1070 #if USE_8x2_TILE_BACKEND
1071 typedef SimdTile_16
<SrcFormat
, DstFormat
> SimdT
;
1073 SimdT
*pSrcSimdTiles
= reinterpret_cast<SimdT
*>(pSrc
);
1075 // Compute which simd tile we're accessing within 8x8 tile.
1076 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1077 uint32_t simdIndex
= (y
/ SIMD16_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD16_TILE_X_DIM
) + (x
/ SIMD16_TILE_X_DIM
);
1079 SimdT
*pSimdTile
= &pSrcSimdTiles
[simdIndex
];
1081 uint32_t simdOffset
= (y
% SIMD16_TILE_Y_DIM
) * SIMD16_TILE_X_DIM
+ (x
% SIMD16_TILE_X_DIM
);
1083 pSimdTile
->GetSwizzledColor(simdOffset
, outputColor
);
1085 typedef SimdTile
<SrcFormat
, DstFormat
> SimdT
;
1087 SimdT
* pSrcSimdTiles
= (SimdT
*)pSrc
;
1089 // Compute which simd tile we're accessing within 8x8 tile.
1090 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1091 uint32_t simdIndex
= (y
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
) + (x
/ SIMD_TILE_X_DIM
);
1093 SimdT
* pSimdTile
= &pSrcSimdTiles
[simdIndex
];
1095 uint32_t simdOffset
= (y
% SIMD_TILE_Y_DIM
) * SIMD_TILE_X_DIM
+ (x
% SIMD_TILE_X_DIM
);
1097 pSimdTile
->GetSwizzledColor(simdOffset
, outputColor
);
1101 //////////////////////////////////////////////////////////////////////////
1102 /// @brief Stores an 8x8 raster tile to the destination surface.
1103 /// @param pSrc - Pointer to raster tile.
1104 /// @param pDstSurface - Destination surface state
1105 /// @param x, y - Coordinates to raster tile.
1106 INLINE
static void Store(
1108 SWR_SURFACE_STATE
* pDstSurface
,
1109 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
) // (x, y) pixel coordinate to start of raster tile.
1111 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1112 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1114 // For each raster tile pixel (rx, ry)
1115 for (uint32_t ry
= 0; ry
< KNOB_TILE_Y_DIM
; ++ry
)
1117 for (uint32_t rx
= 0; rx
< KNOB_TILE_X_DIM
; ++rx
)
1119 // Perform bounds checking.
1120 if (((x
+ rx
) < lodWidth
) &&
1121 ((y
+ ry
) < lodHeight
))
1124 GetSwizzledSrcColor(pSrc
, rx
, ry
, srcColor
);
1126 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>((x
+ rx
), (y
+ ry
),
1127 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1128 sampleNum
, pDstSurface
->lod
, pDstSurface
);
1130 ConvertPixelFromFloat
<DstFormat
>(pDst
, srcColor
);
1137 //////////////////////////////////////////////////////////////////////////
1138 /// @brief Resolves an 8x8 raster tile to the resolve destination surface.
1139 /// @param pSrc - Pointer to raster tile.
1140 /// @param pDstSurface - Destination surface state
1141 /// @param x, y - Coordinates to raster tile.
1142 /// @param sampleOffset - Offset between adjacent multisamples
1143 INLINE
static void Resolve(
1145 SWR_SURFACE_STATE
* pDstSurface
,
1146 uint32_t x
, uint32_t y
, uint32_t sampleOffset
, uint32_t renderTargetArrayIndex
) // (x, y) pixel coordinate to start of raster tile.
1148 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1149 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1151 float oneOverNumSamples
= 1.0f
/ pDstSurface
->numSamples
;
1153 // For each raster tile pixel (rx, ry)
1154 for (uint32_t ry
= 0; ry
< KNOB_TILE_Y_DIM
; ++ry
)
1156 for (uint32_t rx
= 0; rx
< KNOB_TILE_X_DIM
; ++rx
)
1158 // Perform bounds checking.
1159 if (((x
+ rx
) < lodWidth
) &&
1160 ((y
+ ry
) < lodHeight
))
1162 // Sum across samples
1163 float resolveColor
[4] = {0};
1164 for (uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
1166 float sampleColor
[4] = {0};
1167 uint8_t *pSampleSrc
= pSrc
+ sampleOffset
* sampleNum
;
1168 GetSwizzledSrcColor(pSampleSrc
, rx
, ry
, sampleColor
);
1169 resolveColor
[0] += sampleColor
[0];
1170 resolveColor
[1] += sampleColor
[1];
1171 resolveColor
[2] += sampleColor
[2];
1172 resolveColor
[3] += sampleColor
[3];
1175 // Divide by numSamples to average
1176 resolveColor
[0] *= oneOverNumSamples
;
1177 resolveColor
[1] *= oneOverNumSamples
;
1178 resolveColor
[2] *= oneOverNumSamples
;
1179 resolveColor
[3] *= oneOverNumSamples
;
1181 // Use the resolve surface state
1182 SWR_SURFACE_STATE
* pResolveSurface
= (SWR_SURFACE_STATE
*)pDstSurface
->pAuxBaseAddress
;
1183 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>((x
+ rx
), (y
+ ry
),
1184 pResolveSurface
->arrayIndex
+ renderTargetArrayIndex
, pResolveSurface
->arrayIndex
+ renderTargetArrayIndex
,
1185 0, pResolveSurface
->lod
, pResolveSurface
);
1187 ConvertPixelFromFloat
<DstFormat
>(pDst
, resolveColor
);
1196 template<typename TTraits
, SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1197 struct OptStoreRasterTile
: StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>
1200 //////////////////////////////////////////////////////////////////////////
1201 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1202 //////////////////////////////////////////////////////////////////////////
1203 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1204 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 8>, SrcFormat
, DstFormat
>
1206 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 8>, SrcFormat
, DstFormat
> GenericStoreTile
;
1207 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1208 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1210 //////////////////////////////////////////////////////////////////////////
1211 /// @brief Stores an 8x8 raster tile to the destination surface.
1212 /// @param pSrc - Pointer to raster tile.
1213 /// @param pDstSurface - Destination surface state
1214 /// @param x, y - Coordinates to raster tile.
1215 INLINE
static void Store(
1217 SWR_SURFACE_STATE
* pDstSurface
,
1218 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1220 // Punt non-full tiles to generic store
1221 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1222 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1224 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1226 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1229 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1230 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1231 #if USE_8x2_TILE_BACKEND
1233 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1234 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1238 pDst
, // row 0, col 0
1239 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1240 pDst
+ dx
/ 2, // row 0, col 1
1241 pDst
+ pDstSurface
->pitch
+ dx
/ 2 // row 1, col 1
1244 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1246 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1248 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1250 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1264 uint8_t* ppRows
[] = { pDst
, pDst
+ pDstSurface
->pitch
};
1266 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
1268 uint8_t* ppStartRows
[] = { ppRows
[0], ppRows
[1] };
1270 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
1272 // Format conversion and convert from SOA to AOS, and store the rows.
1273 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppRows
);
1275 ppRows
[0] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1276 ppRows
[1] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1277 pSrc
+= SRC_BYTES_PER_PIXEL
* KNOB_SIMD_WIDTH
;
1280 ppRows
[0] = ppStartRows
[0] + 2 * pDstSurface
->pitch
;
1281 ppRows
[1] = ppStartRows
[1] + 2 * pDstSurface
->pitch
;
1287 //////////////////////////////////////////////////////////////////////////
1288 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1289 //////////////////////////////////////////////////////////////////////////
1290 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1291 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 16>, SrcFormat
, DstFormat
>
1293 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 16>, SrcFormat
, DstFormat
> GenericStoreTile
;
1294 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1295 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1297 //////////////////////////////////////////////////////////////////////////
1298 /// @brief Stores an 8x8 raster tile to the destination surface.
1299 /// @param pSrc - Pointer to raster tile.
1300 /// @param pDstSurface - Destination surface state
1301 /// @param x, y - Coordinates to raster tile.
1302 INLINE
static void Store(
1304 SWR_SURFACE_STATE
* pDstSurface
,
1305 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1307 // Punt non-full tiles to generic store
1308 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1309 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1311 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1313 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1316 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1317 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1318 #if USE_8x2_TILE_BACKEND
1320 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1321 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1325 pDst
, // row 0, col 0
1326 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1327 pDst
+ dx
/ 2, // row 0, col 1
1328 pDst
+ pDstSurface
->pitch
+ dx
/ 2 // row 1, col 1
1331 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1333 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1335 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1337 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1351 uint8_t* ppRows
[] = { pDst
, pDst
+ pDstSurface
->pitch
};
1353 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
1355 uint8_t* ppStartRows
[] = { ppRows
[0], ppRows
[1] };
1357 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
1359 // Format conversion and convert from SOA to AOS, and store the rows.
1360 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppRows
);
1362 ppRows
[0] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1363 ppRows
[1] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1364 pSrc
+= SRC_BYTES_PER_PIXEL
* KNOB_SIMD_WIDTH
;
1367 ppRows
[0] = ppStartRows
[0] + 2 * pDstSurface
->pitch
;
1368 ppRows
[1] = ppStartRows
[1] + 2 * pDstSurface
->pitch
;
1374 //////////////////////////////////////////////////////////////////////////
1375 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1376 //////////////////////////////////////////////////////////////////////////
1377 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1378 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 32>, SrcFormat
, DstFormat
>
1380 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 32>, SrcFormat
, DstFormat
> GenericStoreTile
;
1381 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1382 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1384 //////////////////////////////////////////////////////////////////////////
1385 /// @brief Stores an 8x8 raster tile to the destination surface.
1386 /// @param pSrc - Pointer to raster tile.
1387 /// @param pDstSurface - Destination surface state
1388 /// @param x, y - Coordinates to raster tile.
1389 INLINE
static void Store(
1391 SWR_SURFACE_STATE
* pDstSurface
,
1392 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1394 // Punt non-full tiles to generic store
1395 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1396 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1398 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1400 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1403 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1404 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1405 #if USE_8x2_TILE_BACKEND
1407 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1408 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1412 pDst
, // row 0, col 0
1413 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1414 pDst
+ dx
/ 2, // row 0, col 1
1415 pDst
+ pDstSurface
->pitch
+ dx
/ 2 // row 1, col 1
1418 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1420 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1422 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1424 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1438 uint8_t* ppRows
[] = { pDst
, pDst
+ pDstSurface
->pitch
};
1440 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
1442 uint8_t* ppStartRows
[] = { ppRows
[0], ppRows
[1] };
1444 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
1446 // Format conversion and convert from SOA to AOS, and store the rows.
1447 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppRows
);
1449 ppRows
[0] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1450 ppRows
[1] += KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1451 pSrc
+= SRC_BYTES_PER_PIXEL
* KNOB_SIMD_WIDTH
;
1454 ppRows
[0] = ppStartRows
[0] + 2 * pDstSurface
->pitch
;
1455 ppRows
[1] = ppStartRows
[1] + 2 * pDstSurface
->pitch
;
1461 //////////////////////////////////////////////////////////////////////////
1462 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1463 //////////////////////////////////////////////////////////////////////////
1464 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1465 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 64>, SrcFormat
, DstFormat
>
1467 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 64>, SrcFormat
, DstFormat
> GenericStoreTile
;
1468 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1469 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1470 static const size_t MAX_DST_COLUMN_BYTES
= 16;
1471 #if !USE_8x2_TILE_BACKEND
1472 static const size_t SRC_COLUMN_BYTES
= KNOB_SIMD_WIDTH
* SRC_BYTES_PER_PIXEL
;
1473 static const size_t DST_COLUMN_BYTES_PER_SRC
= KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1476 //////////////////////////////////////////////////////////////////////////
1477 /// @brief Stores an 8x8 raster tile to the destination surface.
1478 /// @param pSrc - Pointer to raster tile.
1479 /// @param pDstSurface - Destination surface state
1480 /// @param x, y - Coordinates to raster tile.
1481 INLINE
static void Store(
1483 SWR_SURFACE_STATE
* pDstSurface
,
1484 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1486 // Punt non-full tiles to generic store
1487 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1488 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
1490 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1492 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1495 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1496 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1497 #if USE_8x2_TILE_BACKEND
1499 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
1500 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
;
1502 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1503 static_assert(dx
== MAX_DST_COLUMN_BYTES
* 4, "Invalid column offsets");
1507 pDst
, // row 0, col 0
1508 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1509 pDst
+ MAX_DST_COLUMN_BYTES
, // row 0, col 1
1510 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
, // row 1, col 1
1511 pDst
+ MAX_DST_COLUMN_BYTES
* 2, // row 0, col 2
1512 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 2, // row 1, col 2
1513 pDst
+ MAX_DST_COLUMN_BYTES
* 3, // row 0, col 3
1514 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 3 // row 1, col 3
1517 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1519 // Raster tile width is same as simd16 tile width
1520 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1522 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1524 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1526 for (uint32_t i
= 0; i
< sizeof(ppDsts
) / sizeof(ppDsts
[0]); i
+= 1)
1534 pDst
, // row 0, col 0
1535 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1536 pDst
+ MAX_DST_COLUMN_BYTES
, // row 0, col 1
1537 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
, // row 1, col 1
1540 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
1542 uint8_t* ppStartRows
[] =
1550 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
1552 // Format conversion and convert from SOA to AOS, and store the rows.
1553 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1555 ppDsts
[0] += DST_COLUMN_BYTES_PER_SRC
;
1556 ppDsts
[1] += DST_COLUMN_BYTES_PER_SRC
;
1557 ppDsts
[2] += DST_COLUMN_BYTES_PER_SRC
;
1558 ppDsts
[3] += DST_COLUMN_BYTES_PER_SRC
;
1559 pSrc
+= SRC_COLUMN_BYTES
;
1562 ppDsts
[0] = ppStartRows
[0] + 2 * pDstSurface
->pitch
;
1563 ppDsts
[1] = ppStartRows
[1] + 2 * pDstSurface
->pitch
;
1564 ppDsts
[2] = ppStartRows
[2] + 2 * pDstSurface
->pitch
;
1565 ppDsts
[3] = ppStartRows
[3] + 2 * pDstSurface
->pitch
;
1571 //////////////////////////////////////////////////////////////////////////
1572 /// OptStoreRasterTile - SWR_TILE_NONE specialization for 128bpp
///
/// Store() forwards tiles that extend past the LOD's width or height to the
/// generic StoreRasterTile path. Full tiles are converted from SOA to AOS by
/// ConvertPixelsSOAtoAOS and written through a table of destination row
/// pointers that are spaced by pDstSurface->pitch.
1573 //////////////////////////////////////////////////////////////////////////
1574 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1575 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_NONE
, 128>, SrcFormat
, DstFormat
>
// Fallback store used by Store() for tiles that are not fully inside the surface.
1577 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_NONE
, 128>, SrcFormat
, DstFormat
> GenericStoreTile
;
1578 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1579 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
// Byte distance between adjacent destination column spans (see the 16B-span note in Store).
1580 static const size_t MAX_DST_COLUMN_BYTES
= 16;
// The two constants below are only needed by the non-8x2 (per-SIMD-tile) path.
1581 #if !USE_8x2_TILE_BACKEND
// Source bytes consumed by one ConvertPixelsSOAtoAOS call in the non-8x2 path.
1582 static const size_t SRC_COLUMN_BYTES
= KNOB_SIMD_WIDTH
* SRC_BYTES_PER_PIXEL
;
// Destination bytes each row pointer advances after one conversion in the non-8x2 path.
1583 static const size_t DST_COLUMN_BYTES_PER_SRC
= KNOB_SIMD_WIDTH
* DST_BYTES_PER_PIXEL
/ 2;
1586 //////////////////////////////////////////////////////////////////////////
1587 /// @brief Stores an 8x8 raster tile to the destination surface.
1588 /// @param pSrc - Pointer to raster tile.
1589 /// @param pDstSurface - Destination surface state
1590 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index forwarded to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex to form
///        the z / array-slice arguments of ComputeSurfaceAddress.
1591 INLINE
static void Store(
1593 SWR_SURFACE_STATE
* pDstSurface
,
1594 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1596 // Punt non-full tiles to generic store
// lodWidth/lodHeight are the surface dimensions shifted down by the LOD, clamped to at least 1.
1597 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1598 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
// A tile whose right or bottom edge lies beyond the LOD extent is partial.
1600 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1602 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
// Address of the tile's first destination pixel; shared by both backend paths below.
1605 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1606 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1607 #if USE_8x2_TILE_BACKEND
// dx: destination bytes spanned by one simd16 tile row.
1609 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
// dy: destination bytes spanned by SIMD16_TILE_Y_DIM rows of pitch.
1610 const uint32_t dy
= SIMD16_TILE_Y_DIM
* pDstSurface
->pitch
;
1612 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1613 static_assert(dx
== MAX_DST_COLUMN_BYTES
* 8, "Invalid column offsets");
// Pointer table entries (presumably the initializer of ppDsts): for each of the
// eight 16-byte column spans, row 0 followed by row 1 (one pitch further on).
1617 pDst
, // row 0, col 0
1618 pDst
+ pDstSurface
->pitch
, // row 1, col 0
1619 pDst
+ MAX_DST_COLUMN_BYTES
, // row 0, col 1
1620 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
, // row 1, col 1
1621 pDst
+ MAX_DST_COLUMN_BYTES
* 2, // row 0, col 2
1622 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 2, // row 1, col 2
1623 pDst
+ MAX_DST_COLUMN_BYTES
* 3, // row 0, col 3
1624 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 3, // row 1, col 3
1625 pDst
+ MAX_DST_COLUMN_BYTES
* 4, // row 0, col 4
1626 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 4, // row 1, col 4
1627 pDst
+ MAX_DST_COLUMN_BYTES
* 5, // row 0, col 5
1628 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 5, // row 1, col 5
1629 pDst
+ MAX_DST_COLUMN_BYTES
* 6, // row 0, col 6
1630 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 6, // row 1, col 6
1631 pDst
+ MAX_DST_COLUMN_BYTES
* 7, // row 0, col 7
1632 pDst
+ pDstSurface
->pitch
+ MAX_DST_COLUMN_BYTES
* 7, // row 1, col 7
// One iteration per simd16 tile down the raster tile.
1635 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1637 // Raster tile width is same as simd16 tile width
1638 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1640 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// Step the source past one simd16 worth of pixels.
1642 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
// Visits every ppDsts entry; NOTE(review): the loop body is not visible here,
// presumably each pointer is advanced by dy - confirm.
1644 for (uint32_t i
= 0; i
< sizeof(ppDsts
) / sizeof(ppDsts
[0]); i
+= 1)
1655 // Need 8 pointers, 4 columns of 2 rows each
// Table index is column * 2 + row, so the two rows of a column are adjacent entries.
1656 for (uint32_t y
= 0; y
< 2; ++y
)
1658 for (uint32_t x
= 0; x
< 4; ++x
)
1660 ptrs
.ppDsts
[x
* 2 + y
] = pDst
+ y
* pDstSurface
->pitch
+ x
* MAX_DST_COLUMN_BYTES
;
// One iteration per row of SIMD tiles in the raster tile.
1664 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
// Snapshot of the row-start pointers, used after the column loop to step down two rows.
1666 DstPtrs startPtrs
= ptrs
;
1668 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
1670 // Format conversion and convert from SOA to AOS, and store the rows.
1671 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ptrs
.ppDsts
);
// Advance all eight destination pointers and the source to the next SIMD tile in x.
1673 ptrs
.ppDsts
[0] += DST_COLUMN_BYTES_PER_SRC
;
1674 ptrs
.ppDsts
[1] += DST_COLUMN_BYTES_PER_SRC
;
1675 ptrs
.ppDsts
[2] += DST_COLUMN_BYTES_PER_SRC
;
1676 ptrs
.ppDsts
[3] += DST_COLUMN_BYTES_PER_SRC
;
1677 ptrs
.ppDsts
[4] += DST_COLUMN_BYTES_PER_SRC
;
1678 ptrs
.ppDsts
[5] += DST_COLUMN_BYTES_PER_SRC
;
1679 ptrs
.ppDsts
[6] += DST_COLUMN_BYTES_PER_SRC
;
1680 ptrs
.ppDsts
[7] += DST_COLUMN_BYTES_PER_SRC
;
1681 pSrc
+= SRC_COLUMN_BYTES
;
// Rewind to the saved row start and move down two rows (2 * pitch).
1684 ptrs
.ppDsts
[0] = startPtrs
.ppDsts
[0] + 2 * pDstSurface
->pitch
;
1685 ptrs
.ppDsts
[1] = startPtrs
.ppDsts
[1] + 2 * pDstSurface
->pitch
;
1686 ptrs
.ppDsts
[2] = startPtrs
.ppDsts
[2] + 2 * pDstSurface
->pitch
;
1687 ptrs
.ppDsts
[3] = startPtrs
.ppDsts
[3] + 2 * pDstSurface
->pitch
;
1688 ptrs
.ppDsts
[4] = startPtrs
.ppDsts
[4] + 2 * pDstSurface
->pitch
;
1689 ptrs
.ppDsts
[5] = startPtrs
.ppDsts
[5] + 2 * pDstSurface
->pitch
;
1690 ptrs
.ppDsts
[6] = startPtrs
.ppDsts
[6] + 2 * pDstSurface
->pitch
;
1691 ptrs
.ppDsts
[7] = startPtrs
.ppDsts
[7] + 2 * pDstSurface
->pitch
;
1697 //////////////////////////////////////////////////////////////////////////
1698 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 8bpp
///
/// Store() forwards tiles that extend past the LOD's width or height to the
/// generic StoreRasterTile path. Full tiles are converted from SOA to AOS by
/// ConvertPixelsSOAtoAOS into 16-byte TileY rows.
1699 //////////////////////////////////////////////////////////////////////////
1700 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1701 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 8>, SrcFormat
, DstFormat
>
// Fallback store used by Store() for tiles that are not fully inside the surface.
1703 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 8>, SrcFormat
, DstFormat
> GenericStoreTile
;
1704 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1706 //////////////////////////////////////////////////////////////////////////
1707 /// @brief Stores an 8x8 raster tile to the destination surface.
1708 /// @param pSrc - Pointer to raster tile.
1709 /// @param pDstSurface - Destination surface state
1710 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index forwarded to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex to form
///        the z / array-slice arguments of ComputeSurfaceAddress.
1711 INLINE
static void Store(
1713 SWR_SURFACE_STATE
* pDstSurface
,
1714 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1716 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
1718 // Punt non-full tiles to generic store
// lodWidth/lodHeight are the surface dimensions shifted down by the LOD, clamped to at least 1.
1719 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1720 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
// A tile whose right or bottom edge lies beyond the LOD extent is partial.
1722 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1724 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1727 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1728 // We can compute the offsets to each column within the raster tile once and increment from these.
1729 #if USE_8x2_TILE_BACKEND
1730 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1731 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1732 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
// dy: destination bytes spanned by SIMD16_TILE_Y_DIM 16-byte rows.
1734 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
1736 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
// Visible pointer-table entries: second row (+16B), then both rows again
// DestRowWidthBytes / 4 (4 bytes, i.e. four 8bpp pixels) further along.
// NOTE(review): the table's declaration and first entry are not visible here.
1740 pDst
+ DestRowWidthBytes
,
1741 pDst
+ DestRowWidthBytes
/ 4,
1742 pDst
+ DestRowWidthBytes
+ DestRowWidthBytes
/ 4
// One iteration per simd16 tile down the raster tile.
1745 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1747 // Raster tile width is same as simd16 tile width
1748 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1750 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// Step the source past one simd16 worth of pixels.
// NOTE(review): the ppDsts advance (presumably by dy) is not visible here - confirm.
1752 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1760 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1761 uint8_t* pCol0
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1762 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1764 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
// NOTE(review): pSrcInc is not used in the lines visible here; presumably pSrc
// is advanced by it after each Convert call - confirm.
1765 uint32_t pSrcInc
= (FormatTraits
<SrcFormat
>::bpp
* KNOB_SIMD_WIDTH
) / 8;
1767 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1768 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
; row
+= SIMD_TILE_Y_DIM
)
// Byte offset of this pair of rows within the 16-byte-wide column.
1770 uint32_t rowOffset
= row
* DestRowWidthBytes
;
1772 uint8_t* pRow
= pCol0
+ rowOffset
;
// Two consecutive 16-byte rows receive each SIMD tile.
1773 uint8_t* ppDsts
[] = { pRow
, pRow
+ DestRowWidthBytes
};
1775 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// The second conversion targets the same two rows, DestRowWidthBytes / 4 bytes further along.
1778 ppDsts
[0] += DestRowWidthBytes
/ 4;
1779 ppDsts
[1] += DestRowWidthBytes
/ 4;
1781 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1788 //////////////////////////////////////////////////////////////////////////
1789 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 16bpp
///
/// Store() forwards tiles that extend past the LOD's width or height to the
/// generic StoreRasterTile path. Full tiles are converted from SOA to AOS by
/// ConvertPixelsSOAtoAOS into 16-byte TileY rows.
1790 //////////////////////////////////////////////////////////////////////////
1791 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1792 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 16>, SrcFormat
, DstFormat
>
// Fallback store used by Store() for tiles that are not fully inside the surface.
1794 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 16>, SrcFormat
, DstFormat
> GenericStoreTile
;
1795 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1797 //////////////////////////////////////////////////////////////////////////
1798 /// @brief Stores an 8x8 raster tile to the destination surface.
1799 /// @param pSrc - Pointer to raster tile.
1800 /// @param pDstSurface - Destination surface state
1801 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index forwarded to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex to form
///        the z / array-slice arguments of ComputeSurfaceAddress.
1802 INLINE
static void Store(
1804 SWR_SURFACE_STATE
* pDstSurface
,
1805 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1807 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
1809 // Punt non-full tiles to generic store
// lodWidth/lodHeight are the surface dimensions shifted down by the LOD, clamped to at least 1.
1810 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1811 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
// A tile whose right or bottom edge lies beyond the LOD extent is partial.
1813 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1815 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1818 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1819 // We can compute the offsets to each column within the raster tile once and increment from these.
1820 #if USE_8x2_TILE_BACKEND
1821 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1822 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1823 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
// dy: destination bytes spanned by SIMD16_TILE_Y_DIM 16-byte rows.
1825 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
1827 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
// Visible pointer-table entries: second row (+16B), then both rows again
// DestRowWidthBytes / 2 (8 bytes, i.e. four 16bpp pixels) further along.
// NOTE(review): the table's declaration and first entry are not visible here.
1831 pDst
+ DestRowWidthBytes
,
1832 pDst
+ DestRowWidthBytes
/ 2,
1833 pDst
+ DestRowWidthBytes
+ DestRowWidthBytes
/ 2
// One iteration per simd16 tile down the raster tile.
1836 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1838 // Raster tile width is same as simd16 tile width
1839 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
1841 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// Step the source past one simd16 worth of pixels.
// NOTE(review): the ppDsts advance (presumably by dy) is not visible here - confirm.
1843 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
1851 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1852 uint8_t* pCol0
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1853 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1855 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
// NOTE(review): pSrcInc is not used in the lines visible here; presumably pSrc
// is advanced by it after each Convert call - confirm.
1856 uint32_t pSrcInc
= (FormatTraits
<SrcFormat
>::bpp
* KNOB_SIMD_WIDTH
) / 8;
1858 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1859 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
; row
+= SIMD_TILE_Y_DIM
)
// Byte offset of this pair of rows within the 16-byte-wide column.
1861 uint32_t rowOffset
= row
* DestRowWidthBytes
;
1863 uint8_t* pRow
= pCol0
+ rowOffset
;
// Two consecutive 16-byte rows receive each SIMD tile.
1864 uint8_t* ppDsts
[] = { pRow
, pRow
+ DestRowWidthBytes
};
1866 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// The second conversion targets the same two rows, DestRowWidthBytes / 2 bytes further along.
1869 ppDsts
[0] += DestRowWidthBytes
/ 2;
1870 ppDsts
[1] += DestRowWidthBytes
/ 2;
1872 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1879 //////////////////////////////////////////////////////////////////////////
1880 /// OptStoreRasterTile - SWR_TILE_MODE_XMAJOR specialization for 32bpp
///
/// Store() forwards tiles that extend past the LOD's width or height to the
/// generic StoreRasterTile path. Full tiles are converted from SOA to AOS by
/// ConvertPixelsSOAtoAOS into 512-byte TileX rows.
1881 //////////////////////////////////////////////////////////////////////////
1882 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1883 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_XMAJOR
, 32>, SrcFormat
, DstFormat
>
// Fallback store used by Store() for tiles that are not fully inside the surface.
1885 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_XMAJOR
, 32>, SrcFormat
, DstFormat
> GenericStoreTile
;
1886 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1887 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
1889 //////////////////////////////////////////////////////////////////////////
1890 /// @brief Stores an 8x8 raster tile to the destination surface.
1891 /// @param pSrc - Pointer to raster tile.
1892 /// @param pDstSurface - Destination surface state
1893 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index forwarded to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex to form
///        the z / array-slice arguments of ComputeSurfaceAddress.
1894 INLINE
static void Store(
1896 SWR_SURFACE_STATE
* pDstSurface
,
1897 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1899 static const uint32_t DestRowWidthBytes
= 512; // 512B rows
1901 // Punt non-full tiles to generic store
// lodWidth/lodHeight are the surface dimensions shifted down by the LOD, clamped to at least 1.
1902 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1903 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
// A tile whose right or bottom edge lies beyond the LOD extent is partial.
1905 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1907 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
1910 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1911 // We can compute the offsets to each column within the raster tile once and increment from these.
1912 #if USE_8x2_TILE_BACKEND
1913 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1914 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
// dx: destination bytes spanned by one simd16 tile row.
1916 const uint32_t dx
= SIMD16_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
// dy: SIMD16_TILE_Y_DIM rows of 512B, less the tile-width bytes already stepped in x.
// NOTE(review): the statements that apply dx/dy to ppDsts are not visible here - confirm.
1917 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
- KNOB_TILE_X_DIM
* DST_BYTES_PER_PIXEL
;
// Pointer table entries (presumably the initializer of ppDsts): two rows for
// each half (dx / 2 bytes) of the simd16 tile width.
1921 pDst
, // row 0, col 0
1922 pDst
+ DestRowWidthBytes
, // row 1, col 0
1923 pDst
+ dx
/ 2, // row 0, col 1
1924 pDst
+ DestRowWidthBytes
+ dx
/ 2 // row 1, col 1
// Walk the raster tile one simd16 tile at a time, rows outer, columns inner.
1927 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
1929 for (uint32_t xx
= 0; xx
< KNOB_TILE_X_DIM
; xx
+= SIMD16_TILE_X_DIM
)
1931 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// Step the source past one simd16 worth of pixels.
1933 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
// Non-8x2 path: pRow0/pRow1 track the two destination rows written per SIMD tile.
1947 uint8_t *pRow0
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
1948 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
1949 uint8_t* pRow1
= pRow0
+ DestRowWidthBytes
;
1951 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
; row
+= SIMD_TILE_Y_DIM
)
1953 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
; col
+= SIMD_TILE_X_DIM
)
// Byte offset of this SIMD tile's first pixel within the destination row.
1955 uint32_t xRowOffset
= col
* (FormatTraits
<DstFormat
>::bpp
/ 8);
1957 uint8_t* ppDsts
[] = { pRow0
+ xRowOffset
, pRow1
+ xRowOffset
};
1958 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
1960 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1961 pSrc
+= (FormatTraits
<SrcFormat
>::bpp
* KNOB_SIMD_WIDTH
) / 8;
// Move both row pointers down two 512B rows for the next row of SIMD tiles.
1964 pRow0
+= (DestRowWidthBytes
* 2);
1965 pRow1
+= (DestRowWidthBytes
* 2);
1971 //////////////////////////////////////////////////////////////////////////
1972 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 32bpp
///
/// Store() forwards tiles that extend past the LOD's width or height to the
/// generic StoreRasterTile path. Full tiles are converted from SOA to AOS by
/// ConvertPixelsSOAtoAOS into 16-byte TileY rows, using two TileY columns
/// that are DestColumnBytes apart.
1973 //////////////////////////////////////////////////////////////////////////
1974 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
1975 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 32>, SrcFormat
, DstFormat
>
// Fallback store used by Store() for tiles that are not fully inside the surface.
1977 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 32>, SrcFormat
, DstFormat
> GenericStoreTile
;
1978 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
1980 //////////////////////////////////////////////////////////////////////////
1981 /// @brief Stores an 8x8 raster tile to the destination surface.
1982 /// @param pSrc - Pointer to raster tile.
1983 /// @param pDstSurface - Destination surface state
1984 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index forwarded to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex to form
///        the z / array-slice arguments of ComputeSurfaceAddress.
1985 INLINE
static void Store(
1987 SWR_SURFACE_STATE
* pDstSurface
,
1988 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
1990 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
1991 static const uint32_t DestColumnBytes
= DestRowWidthBytes
* 32; // 16B x 32 rows.
1993 // Punt non-full tiles to generic store
// lodWidth/lodHeight are the surface dimensions shifted down by the LOD, clamped to at least 1.
1994 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
1995 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
// A tile whose right or bottom edge lies beyond the LOD extent is partial.
1997 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
1999 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
2002 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2003 // We can compute the offsets to each column within the raster tile once and increment from these.
2004 #if USE_8x2_TILE_BACKEND
2005 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2006 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2007 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
2009 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
// dy: destination bytes spanned by SIMD16_TILE_Y_DIM 16-byte rows.
2010 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
2012 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
// Pointer table entries (presumably the initializer of ppDsts): two rows in
// the first TileY column, then the same two rows one column (DestColumnBytes) over.
2015 pDst
, // row 0, col 0
2016 pDst
+ DestRowWidthBytes
, // row 1, col 0
2017 pDst
+ DestColumnBytes
, // row 0, col 1
2018 pDst
+ DestRowWidthBytes
+ DestColumnBytes
// row 1, col 1
// One iteration per simd16 tile down the raster tile.
2021 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
2023 // Raster tile width is same as simd16 tile width
2024 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
2026 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// Step the source past one simd16 worth of pixels.
// NOTE(review): the ppDsts advance (presumably by dy) is not visible here - confirm.
2028 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
2036 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2037 uint8_t* pCol0
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2038 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
2040 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
// NOTE(review): pSrcInc is not used in the lines visible here; presumably pSrc
// is advanced by it after each Convert call - confirm.
2041 uint32_t pSrcInc
= (FormatTraits
<SrcFormat
>::bpp
* KNOB_SIMD_WIDTH
) / 8;
2043 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2044 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
; row
+= SIMD_TILE_Y_DIM
)
// Byte offset of this pair of rows within the 16-byte-wide column.
2046 uint32_t rowOffset
= row
* DestRowWidthBytes
;
2048 uint8_t* pRow
= pCol0
+ rowOffset
;
// Two consecutive 16-byte rows receive each SIMD tile.
2049 uint8_t* ppDsts
[] = { pRow
, pRow
+ DestRowWidthBytes
};
2051 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// The second conversion targets the same two rows in the next TileY column.
2054 ppDsts
[0] += DestColumnBytes
;
2055 ppDsts
[1] += DestColumnBytes
;
2057 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
2064 //////////////////////////////////////////////////////////////////////////
2065 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 64bpp
///
/// Store() forwards tiles that extend past the LOD's width or height to the
/// generic StoreRasterTile path. Full tiles are converted from SOA to AOS by
/// ConvertPixelsSOAtoAOS into 16-byte TileY rows, using four TileY columns
/// that are DestColumnBytes apart.
2066 //////////////////////////////////////////////////////////////////////////
2067 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
2068 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 64>, SrcFormat
, DstFormat
>
// Fallback store used by Store() for tiles that are not fully inside the surface.
2070 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 64>, SrcFormat
, DstFormat
> GenericStoreTile
;
2071 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
2073 //////////////////////////////////////////////////////////////////////////
2074 /// @brief Stores an 8x8 raster tile to the destination surface.
2075 /// @param pSrc - Pointer to raster tile.
2076 /// @param pDstSurface - Destination surface state
2077 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index forwarded to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex to form
///        the z / array-slice arguments of ComputeSurfaceAddress.
2078 INLINE
static void Store(
2080 SWR_SURFACE_STATE
* pDstSurface
,
2081 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
2083 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
2084 static const uint32_t DestColumnBytes
= DestRowWidthBytes
* 32; // 16B x 32 rows.
2086 // Punt non-full tiles to generic store
// lodWidth/lodHeight are the surface dimensions shifted down by the LOD, clamped to at least 1.
2087 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
2088 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
// A tile whose right or bottom edge lies beyond the LOD extent is partial.
2090 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
2092 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
2095 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2096 // We can compute the offsets to each column within the raster tile once and increment from these.
2097 #if USE_8x2_TILE_BACKEND
2098 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2099 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2100 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
2102 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
// dy: destination bytes spanned by SIMD16_TILE_Y_DIM 16-byte rows.
2103 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
2105 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
// Pointer table entries (presumably the initializer of ppDsts): row 0 then
// row 1 for each of four TileY columns, DestColumnBytes apart.
2108 pDst
, // row 0, col 0
2109 pDst
+ DestRowWidthBytes
, // row 1, col 0
2110 pDst
+ DestColumnBytes
, // row 0, col 1
2111 pDst
+ DestRowWidthBytes
+ DestColumnBytes
, // row 1, col 1
2112 pDst
+ DestColumnBytes
* 2, // row 0, col 2
2113 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 2, // row 1, col 2
2114 pDst
+ DestColumnBytes
* 3, // row 0, col 3
2115 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 3 // row 1, col 3
// One iteration per simd16 tile down the raster tile.
2118 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
2120 // Raster tile width is same as simd16 tile width
2121 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
2123 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// Step the source past one simd16 worth of pixels.
2125 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
// Visits every ppDsts entry; NOTE(review): the loop body is not visible here,
// presumably each pointer is advanced by dy - confirm.
2127 for (uint32_t i
= 0; i
< sizeof(ppDsts
) / sizeof(ppDsts
[0]); i
+= 1)
2133 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2134 uint8_t* pCol0
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2135 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
// Second TileY column, one full column (DestColumnBytes) after the first.
2136 uint8_t* pCol1
= pCol0
+ DestColumnBytes
;
2138 // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
2139 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
// NOTE(review): pSrcInc is not used in the lines visible here; presumably pSrc
// is advanced by it after each Convert call - confirm.
2140 uint32_t pSrcInc
= (FormatTraits
<SrcFormat
>::bpp
* KNOB_SIMD_WIDTH
) / 8;
2142 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2143 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
; row
+= SIMD_TILE_Y_DIM
)
// Byte offset of this pair of rows within a 16-byte-wide column.
2145 uint32_t rowOffset
= row
* DestRowWidthBytes
;
// Visible pointer-table entries: the second row of this pair in pCol0 and in pCol1.
// NOTE(review): the table's declaration and first-row entries are not visible here.
2149 pCol0
+ rowOffset
+ DestRowWidthBytes
,
2151 pCol1
+ rowOffset
+ DestRowWidthBytes
,
2154 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// The second conversion targets the same rows two TileY columns further on.
2157 ppDsts
[0] += DestColumnBytes
* 2;
2158 ppDsts
[1] += DestColumnBytes
* 2;
2159 ppDsts
[2] += DestColumnBytes
* 2;
2160 ppDsts
[3] += DestColumnBytes
* 2;
2162 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
2169 //////////////////////////////////////////////////////////////////////////
2170 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
///
/// Store() forwards tiles that extend past the LOD's width or height to the
/// generic StoreRasterTile path. Full tiles are converted from SOA to AOS by
/// ConvertPixelsSOAtoAOS into 16-byte TileY rows, using eight TileY columns.
2171 //////////////////////////////////////////////////////////////////////////
2172 template<SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
2173 struct OptStoreRasterTile
< TilingTraits
<SWR_TILE_MODE_YMAJOR
, 128>, SrcFormat
, DstFormat
>
// Fallback store used by Store() for tiles that are not fully inside the surface.
2175 typedef StoreRasterTile
<TilingTraits
<SWR_TILE_MODE_YMAJOR
, 128>, SrcFormat
, DstFormat
> GenericStoreTile
;
// NOTE(review): SRC_BYTES_PER_PIXEL is defined twice below; presumably the
// constants after this one sit in the #else branch of this #if (the
// #else/#endif lines are not visible here) - confirm.
2176 #if USE_8x2_TILE_BACKEND
2177 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
// TileY geometry: a column is TILE_Y_ROWS rows of TILE_Y_COL_WIDTH_BYTES bytes.
2180 static const size_t TILE_Y_COL_WIDTH_BYTES
= 16;
2181 static const size_t TILE_Y_ROWS
= 32;
2182 static const size_t TILE_Y_COL_BYTES
= TILE_Y_ROWS
* TILE_Y_COL_WIDTH_BYTES
;
2184 static const size_t DST_BYTES_PER_PIXEL
= FormatTraits
<DstFormat
>::bpp
/ 8;
2185 static const size_t SRC_BYTES_PER_PIXEL
= FormatTraits
<SrcFormat
>::bpp
/ 8;
2186 static const size_t MAX_DST_COLUMN_BYTES
= 16;
// Source bytes consumed by one ConvertPixelsSOAtoAOS call in the per-SIMD-tile path.
2188 static const size_t SRC_COLUMN_BYTES
= KNOB_SIMD_WIDTH
* SRC_BYTES_PER_PIXEL
;
// Destination bytes each pointer advances after one conversion: four TileY columns.
2189 static const size_t DST_COLUMN_BYTES_PER_SRC
= TILE_Y_COL_BYTES
* 4;
2192 //////////////////////////////////////////////////////////////////////////
2193 /// @brief Stores an 8x8 raster tile to the destination surface.
2194 /// @param pSrc - Pointer to raster tile.
2195 /// @param pDstSurface - Destination surface state
2196 /// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample index forwarded to ComputeSurfaceAddress.
/// @param renderTargetArrayIndex - Added to pDstSurface->arrayIndex to form
///        the z / array-slice arguments of ComputeSurfaceAddress.
2197 INLINE
static void Store(
2199 SWR_SURFACE_STATE
* pDstSurface
,
2200 uint32_t x
, uint32_t y
, uint32_t sampleNum
, uint32_t renderTargetArrayIndex
)
2202 #if USE_8x2_TILE_BACKEND
2203 static const uint32_t DestRowWidthBytes
= 16; // 16B rows
2204 static const uint32_t DestColumnBytes
= DestRowWidthBytes
* 32; // 16B x 32 rows.
2207 // Punt non-full tiles to generic store
// lodWidth/lodHeight are the surface dimensions shifted down by the LOD, clamped to at least 1.
2208 uint32_t lodWidth
= std::max(pDstSurface
->width
>> pDstSurface
->lod
, 1U);
2209 uint32_t lodHeight
= std::max(pDstSurface
->height
>> pDstSurface
->lod
, 1U);
// A tile whose right or bottom edge lies beyond the LOD extent is partial.
2211 if (x
+ KNOB_TILE_X_DIM
> lodWidth
|| y
+ KNOB_TILE_Y_DIM
> lodHeight
)
2213 return GenericStoreTile::Store(pSrc
, pDstSurface
, x
, y
, sampleNum
, renderTargetArrayIndex
);
2216 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2217 // We can compute the offsets to each column within the raster tile once and increment from these.
2218 #if USE_8x2_TILE_BACKEND
2219 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2220 uint8_t *pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2221 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
2223 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
// dy: destination bytes spanned by SIMD16_TILE_Y_DIM 16-byte rows.
2224 const uint32_t dy
= SIMD16_TILE_Y_DIM
* DestRowWidthBytes
;
2226 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
// Pointer table entries (presumably the initializer of ppDsts): row 0 then
// row 1 for each of eight TileY columns, DestColumnBytes apart.
2229 pDst
, // row 0, col 0
2230 pDst
+ DestRowWidthBytes
, // row 1, col 0
2231 pDst
+ DestColumnBytes
, // row 0, col 1
2232 pDst
+ DestRowWidthBytes
+ DestColumnBytes
, // row 1, col 1
2233 pDst
+ DestColumnBytes
* 2, // row 0, col 2
2234 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 2, // row 1, col 2
2235 pDst
+ DestColumnBytes
* 3, // row 0, col 3
2236 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 3, // row 1, col 3
2237 pDst
+ DestColumnBytes
* 4, // row 0, col 4
2238 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 4, // row 1, col 4
2239 pDst
+ DestColumnBytes
* 5, // row 0, col 5
2240 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 5, // row 1, col 5
2241 pDst
+ DestColumnBytes
* 6, // row 0, col 6
2242 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 6, // row 1, col 6
2243 pDst
+ DestColumnBytes
* 7, // row 0, col 7
2244 pDst
+ DestRowWidthBytes
+ DestColumnBytes
* 7 // row 1, col 7
// One iteration per simd16 tile down the raster tile.
2247 for (uint32_t yy
= 0; yy
< KNOB_TILE_Y_DIM
; yy
+= SIMD16_TILE_Y_DIM
)
2249 // Raster tile width is same as simd16 tile width
2250 static_assert(KNOB_TILE_X_DIM
== SIMD16_TILE_X_DIM
, "Invalid tile x dim");
2252 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ppDsts
);
// Step the source past one simd16 worth of pixels.
2254 pSrc
+= KNOB_SIMD16_WIDTH
* SRC_BYTES_PER_PIXEL
;
// Visits every ppDsts entry; NOTE(review): the loop body is not visible here,
// presumably each pointer is advanced by dy - confirm.
2256 for (uint32_t i
= 0; i
< sizeof(ppDsts
) / sizeof(ppDsts
[0]); i
+= 1)
2262 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2263 uint8_t* pDst
= (uint8_t*)ComputeSurfaceAddress
<false, false>(x
, y
, pDstSurface
->arrayIndex
+ renderTargetArrayIndex
,
2264 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, sampleNum
, pDstSurface
->lod
, pDstSurface
);
2270 // Need 8 pointers, 4 columns of 2 rows each
// Table index is column * 2 + row; rows are one 16-byte TileY row apart and
// columns one full TileY column (TILE_Y_COL_BYTES) apart.
2271 for (uint32_t y
= 0; y
< 2; ++y
)
2273 for (uint32_t x
= 0; x
< 4; ++x
)
2275 ptrs
.ppDsts
[x
* 2 + y
] = pDst
+ y
* TILE_Y_COL_WIDTH_BYTES
+ x
* TILE_Y_COL_BYTES
;
// One iteration per row of SIMD tiles in the raster tile.
2279 for (uint32_t row
= 0; row
< KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
; ++row
)
// Snapshot of the row-start pointers, used after the column loop to step down two rows.
2281 DstPtrs startPtrs
= ptrs
;
2283 for (uint32_t col
= 0; col
< KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
; ++col
)
2285 // Format conversion and convert from SOA to AOS, and store the rows.
2286 ConvertPixelsSOAtoAOS
<SrcFormat
, DstFormat
>::Convert(pSrc
, ptrs
.ppDsts
);
// Advance all eight destination pointers by four TileY columns and the source by one SIMD tile.
2288 ptrs
.ppDsts
[0] += DST_COLUMN_BYTES_PER_SRC
;
2289 ptrs
.ppDsts
[1] += DST_COLUMN_BYTES_PER_SRC
;
2290 ptrs
.ppDsts
[2] += DST_COLUMN_BYTES_PER_SRC
;
2291 ptrs
.ppDsts
[3] += DST_COLUMN_BYTES_PER_SRC
;
2292 ptrs
.ppDsts
[4] += DST_COLUMN_BYTES_PER_SRC
;
2293 ptrs
.ppDsts
[5] += DST_COLUMN_BYTES_PER_SRC
;
2294 ptrs
.ppDsts
[6] += DST_COLUMN_BYTES_PER_SRC
;
2295 ptrs
.ppDsts
[7] += DST_COLUMN_BYTES_PER_SRC
;
2296 pSrc
+= SRC_COLUMN_BYTES
;
// Rewind to the saved row start and move down two 16-byte TileY rows.
2299 ptrs
.ppDsts
[0] = startPtrs
.ppDsts
[0] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2300 ptrs
.ppDsts
[1] = startPtrs
.ppDsts
[1] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2301 ptrs
.ppDsts
[2] = startPtrs
.ppDsts
[2] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2302 ptrs
.ppDsts
[3] = startPtrs
.ppDsts
[3] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2303 ptrs
.ppDsts
[4] = startPtrs
.ppDsts
[4] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2304 ptrs
.ppDsts
[5] = startPtrs
.ppDsts
[5] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2305 ptrs
.ppDsts
[6] = startPtrs
.ppDsts
[6] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2306 ptrs
.ppDsts
[7] = startPtrs
.ppDsts
[7] + 2 * TILE_Y_COL_WIDTH_BYTES
;
2312 //////////////////////////////////////////////////////////////////////////
2313 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
2314 //////////////////////////////////////////////////////////////////////////
2315 template<typename TTraits
, SWR_FORMAT SrcFormat
, SWR_FORMAT DstFormat
>
2316 struct StoreMacroTile
2318 //////////////////////////////////////////////////////////////////////////
2319 /// @brief Stores a macrotile to the destination surface using safe implementation.
///        Every raster tile and sample goes through StoreRasterTile::Store.
2320 /// @param pSrcHotTile - Pointer to macro tile.
2321 /// @param pDstSurface - Destination surface state
2322 /// @param x, y - Coordinates to macro tile
/// @param renderTargetArrayIndex - Forwarded unchanged to each raster tile store.
2323 static void StoreGeneric(
2324 uint8_t *pSrcHotTile
,
2325 SWR_SURFACE_STATE
* pDstSurface
,
2326 uint32_t x
, uint32_t y
, uint32_t renderTargetArrayIndex
)
// Always use the generic raster tile store for this path.
2328 PFN_STORE_TILES_INTERNAL pfnStore
;
2329 pfnStore
= StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Store
;
2331 // Store each raster tile from the hot tile to the destination surface.
// Iteration order is row, then column, then sample; the source pointer advances linearly in that order.
2332 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
2334 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
2336 for (uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
2338 pfnStore(pSrcHotTile
, pDstSurface
, (x
+ col
), (y
+ row
), sampleNum
, renderTargetArrayIndex
);
// Skip one raster tile's worth of source pixels (tile width * height * source bytes per pixel).
2339 pSrcHotTile
+= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<SrcFormat
>::bpp
/ 8);
// Pointer to a per-raster-tile store routine. As called through pfnStore in
// this struct, the arguments are: source pointer, destination surface,
// x, y, sampleNum, renderTargetArrayIndex.
2346 typedef void(*PFN_STORE_TILES_INTERNAL
)(uint8_t*, SWR_SURFACE_STATE
*, uint32_t, uint32_t, uint32_t, uint32_t);
2347 //////////////////////////////////////////////////////////////////////////
2348 /// @brief Stores a macrotile to the destination surface.
2349 /// @param pSrc - Pointer to macro tile.
2350 /// @param pDstSurface - Destination surface state
2351 /// @param x, y - Coordinates to macro tile
2353 uint8_t *pSrcHotTile
,
2354 SWR_SURFACE_STATE
* pDstSurface
,
2355 uint32_t x
, uint32_t y
, uint32_t renderTargetArrayIndex
)
2357 PFN_STORE_TILES_INTERNAL pfnStore
[SWR_MAX_NUM_MULTISAMPLES
];
2359 for (uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
2361 size_t dstSurfAddress
= (size_t)ComputeSurfaceAddress
<false, false>(
2364 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, // z for 3D surfaces
2365 pDstSurface
->arrayIndex
+ renderTargetArrayIndex
, // array index for 2D arrays
2370 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
2371 bool bForceGeneric
= ((pDstSurface
->tileMode
!= SWR_TILE_NONE
) && (0 != (dstSurfAddress
& 0xfff))) ||
2372 (pDstSurface
->bInterleavedSamples
);
2374 pfnStore
[sampleNum
] = (bForceGeneric
|| KNOB_USE_GENERIC_STORETILE
) ? StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Store
: OptStoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Store
;
2377 // Save original for pSrcHotTile resolve.
2378 uint8_t *pResolveSrcHotTile
= pSrcHotTile
;
2380 // Store each raster tile from the hot tile to the destination surface.
2381 for(uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
2383 for(uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
2385 for(uint32_t sampleNum
= 0; sampleNum
< pDstSurface
->numSamples
; sampleNum
++)
2387 pfnStore
[sampleNum
](pSrcHotTile
, pDstSurface
, (x
+ col
), (y
+ row
), sampleNum
, renderTargetArrayIndex
);
2388 pSrcHotTile
+= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<SrcFormat
>::bpp
/ 8);
2393 if (pDstSurface
->pAuxBaseAddress
)
2395 uint32_t sampleOffset
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<SrcFormat
>::bpp
/ 8);
2396 // Store each raster tile from the hot tile to the destination surface.
2397 for(uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
2399 for(uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
2401 StoreRasterTile
<TTraits
, SrcFormat
, DstFormat
>::Resolve(pResolveSrcHotTile
, pDstSurface
, (x
+ col
), (y
+ row
), sampleOffset
, renderTargetArrayIndex
);
2402 pResolveSrcHotTile
+= sampleOffset
* pDstSurface
->numSamples
;
2409 //////////////////////////////////////////////////////////////////////////
2410 /// InitStoreTilesTable - Helper for setting up the tables.
2411 template <SWR_TILE_MODE TTileMode
, size_t NumTileModesT
, size_t ArraySizeT
>
2412 void InitStoreTilesTableColor_Half1(
2413 PFN_STORE_TILES (&table
)[NumTileModesT
][ArraySizeT
])
2415 table
[TTileMode
][R32G32B32A32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_FLOAT
>::Store
;
2416 table
[TTileMode
][R32G32B32A32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_SINT
>::Store
;
2417 table
[TTileMode
][R32G32B32A32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_UINT
>::Store
;
2418 table
[TTileMode
][R32G32B32X32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32X32_FLOAT
>::Store
;
2419 table
[TTileMode
][R32G32B32A32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_SSCALED
>::Store
;
2420 table
[TTileMode
][R32G32B32A32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 128>, R32G32B32A32_FLOAT
, R32G32B32A32_USCALED
>::Store
;
2421 table
[TTileMode
][R32G32B32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_FLOAT
>::Store
;
2422 table
[TTileMode
][R32G32B32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_SINT
>::Store
;
2423 table
[TTileMode
][R32G32B32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_UINT
>::Store
;
2424 table
[TTileMode
][R32G32B32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_SSCALED
>::Store
;
2425 table
[TTileMode
][R32G32B32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 96>, R32G32B32A32_FLOAT
, R32G32B32_USCALED
>::Store
;
2426 table
[TTileMode
][R16G16B16A16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_UNORM
>::Store
;
2427 table
[TTileMode
][R16G16B16A16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_SNORM
>::Store
;
2428 table
[TTileMode
][R16G16B16A16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_SINT
>::Store
;
2429 table
[TTileMode
][R16G16B16A16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_UINT
>::Store
;
2430 table
[TTileMode
][R16G16B16A16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_FLOAT
>::Store
;
2431 table
[TTileMode
][R32G32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_FLOAT
>::Store
;
2432 table
[TTileMode
][R32G32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_SINT
>::Store
;
2433 table
[TTileMode
][R32G32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_UINT
>::Store
;
2434 table
[TTileMode
][R32_FLOAT_X8X24_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32_FLOAT_X8X24_TYPELESS
>::Store
;
2435 table
[TTileMode
][X32_TYPELESS_G8X24_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, X32_TYPELESS_G8X24_UINT
>::Store
;
2436 table
[TTileMode
][R16G16B16X16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16X16_UNORM
>::Store
;
2437 table
[TTileMode
][R16G16B16X16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16X16_FLOAT
>::Store
;
2438 table
[TTileMode
][R16G16B16A16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_SSCALED
>::Store
;
2439 table
[TTileMode
][R16G16B16A16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R16G16B16A16_USCALED
>::Store
;
2440 table
[TTileMode
][R32G32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_SSCALED
>::Store
;
2441 table
[TTileMode
][R32G32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32G32B32A32_FLOAT
, R32G32_USCALED
>::Store
;
2442 table
[TTileMode
][B8G8R8A8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8A8_UNORM
>::Store
;
2443 table
[TTileMode
][B8G8R8A8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8A8_UNORM_SRGB
>::Store
;
2444 table
[TTileMode
][R10G10B10A2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_UNORM
>::StoreGeneric
;
2445 table
[TTileMode
][R10G10B10A2_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_UNORM_SRGB
>::StoreGeneric
;
2446 table
[TTileMode
][R10G10B10A2_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_UINT
>::StoreGeneric
;
2447 table
[TTileMode
][R8G8B8A8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_UNORM
>::Store
;
2448 table
[TTileMode
][R8G8B8A8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_UNORM_SRGB
>::Store
;
2449 table
[TTileMode
][R8G8B8A8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_SNORM
>::Store
;
2450 table
[TTileMode
][R8G8B8A8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_SINT
>::Store
;
2451 table
[TTileMode
][R8G8B8A8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_UINT
>::Store
;
2452 table
[TTileMode
][R16G16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_UNORM
>::Store
;
2453 table
[TTileMode
][R16G16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_SNORM
>::Store
;
2454 table
[TTileMode
][R16G16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_SINT
>::Store
;
2455 table
[TTileMode
][R16G16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_UINT
>::Store
;
2456 table
[TTileMode
][R16G16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_FLOAT
>::Store
;
2457 table
[TTileMode
][B10G10R10A2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_UNORM
>::StoreGeneric
;
2458 table
[TTileMode
][B10G10R10A2_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_UNORM_SRGB
>::StoreGeneric
;
2459 table
[TTileMode
][R11G11B10_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R11G11B10_FLOAT
>::StoreGeneric
;
2460 table
[TTileMode
][R10G10B10_FLOAT_A2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10_FLOAT_A2_UNORM
>::StoreGeneric
;
2461 table
[TTileMode
][R32_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_SINT
>::Store
;
2462 table
[TTileMode
][R32_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_UINT
>::Store
;
2463 table
[TTileMode
][R32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_FLOAT
>::Store
;
2464 table
[TTileMode
][R24_UNORM_X8_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R24_UNORM_X8_TYPELESS
>::StoreGeneric
;
2465 table
[TTileMode
][X24_TYPELESS_G8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, X24_TYPELESS_G8_UINT
>::StoreGeneric
;
2466 table
[TTileMode
][A32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, A32_FLOAT
>::Store
;
2467 table
[TTileMode
][B8G8R8X8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8X8_UNORM
>::Store
;
2468 table
[TTileMode
][B8G8R8X8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B8G8R8X8_UNORM_SRGB
>::Store
;
2469 table
[TTileMode
][R8G8B8X8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8X8_UNORM
>::Store
;
2470 table
[TTileMode
][R8G8B8X8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8X8_UNORM_SRGB
>::Store
;
2473 template <SWR_TILE_MODE TTileMode
, size_t NumTileModesT
, size_t ArraySizeT
>
2474 void InitStoreTilesTableColor_Half2(
2475 PFN_STORE_TILES(&table
)[NumTileModesT
][ArraySizeT
])
2477 table
[TTileMode
][R9G9B9E5_SHAREDEXP
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R9G9B9E5_SHAREDEXP
>::StoreGeneric
;
2478 table
[TTileMode
][B10G10R10X2_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10X2_UNORM
>::StoreGeneric
;
2479 table
[TTileMode
][R10G10B10X2_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10X2_USCALED
>::StoreGeneric
;
2480 table
[TTileMode
][R8G8B8A8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_SSCALED
>::Store
;
2481 table
[TTileMode
][R8G8B8A8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R8G8B8A8_USCALED
>::Store
;
2482 table
[TTileMode
][R16G16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_SSCALED
>::Store
;
2483 table
[TTileMode
][R16G16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R16G16_USCALED
>::Store
;
2484 table
[TTileMode
][R32_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_SSCALED
>::Store
;
2485 table
[TTileMode
][R32_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R32_USCALED
>::Store
;
2486 table
[TTileMode
][B5G6R5_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G6R5_UNORM
>::Store
;
2487 table
[TTileMode
][B5G6R5_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G6R5_UNORM_SRGB
>::StoreGeneric
;
2488 table
[TTileMode
][B5G5R5A1_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5A1_UNORM
>::StoreGeneric
;
2489 table
[TTileMode
][B5G5R5A1_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5A1_UNORM_SRGB
>::StoreGeneric
;
2490 table
[TTileMode
][B4G4R4A4_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B4G4R4A4_UNORM
>::StoreGeneric
;
2491 table
[TTileMode
][B4G4R4A4_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B4G4R4A4_UNORM_SRGB
>::StoreGeneric
;
2492 table
[TTileMode
][R8G8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_UNORM
>::Store
;
2493 table
[TTileMode
][R8G8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_SNORM
>::Store
;
2494 table
[TTileMode
][R8G8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_SINT
>::Store
;
2495 table
[TTileMode
][R8G8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_UINT
>::Store
;
2496 table
[TTileMode
][R16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_UNORM
>::Store
;
2497 table
[TTileMode
][R16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_SNORM
>::Store
;
2498 table
[TTileMode
][R16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_SINT
>::Store
;
2499 table
[TTileMode
][R16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_UINT
>::Store
;
2500 table
[TTileMode
][R16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_FLOAT
>::Store
;
2501 table
[TTileMode
][A16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A16_UNORM
>::Store
;
2502 table
[TTileMode
][A16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A16_FLOAT
>::Store
;
2503 table
[TTileMode
][B5G5R5X1_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5X1_UNORM
>::StoreGeneric
;
2504 table
[TTileMode
][B5G5R5X1_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, B5G5R5X1_UNORM_SRGB
>::StoreGeneric
;
2505 table
[TTileMode
][R8G8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_SSCALED
>::Store
;
2506 table
[TTileMode
][R8G8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R8G8_USCALED
>::Store
;
2507 table
[TTileMode
][R16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_SSCALED
>::Store
;
2508 table
[TTileMode
][R16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, R16_USCALED
>::Store
;
2509 table
[TTileMode
][A1B5G5R5_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A1B5G5R5_UNORM
>::StoreGeneric
;
2510 table
[TTileMode
][A4B4G4R4_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32G32B32A32_FLOAT
, A4B4G4R4_UNORM
>::StoreGeneric
;
2511 table
[TTileMode
][R8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_UNORM
>::Store
;
2512 table
[TTileMode
][R8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_SNORM
>::Store
;
2513 table
[TTileMode
][R8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_SINT
>::Store
;
2514 table
[TTileMode
][R8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_UINT
>::Store
;
2515 table
[TTileMode
][A8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, A8_UNORM
>::Store
;
2516 table
[TTileMode
][R8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_SSCALED
>::Store
;
2517 table
[TTileMode
][R8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R32G32B32A32_FLOAT
, R8_USCALED
>::Store
;
2518 table
[TTileMode
][R8G8B8_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_UNORM
>::Store
;
2519 table
[TTileMode
][R8G8B8_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_SNORM
>::Store
;
2520 table
[TTileMode
][R8G8B8_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_SSCALED
>::Store
;
2521 table
[TTileMode
][R8G8B8_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_USCALED
>::Store
;
2522 table
[TTileMode
][R16G16B16_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_FLOAT
>::Store
;
2523 table
[TTileMode
][R16G16B16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_UNORM
>::Store
;
2524 table
[TTileMode
][R16G16B16_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_SNORM
>::Store
;
2525 table
[TTileMode
][R16G16B16_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_SSCALED
>::Store
;
2526 table
[TTileMode
][R16G16B16_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_USCALED
>::Store
;
2527 table
[TTileMode
][R8G8B8_UNORM_SRGB
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_UNORM_SRGB
>::Store
;
2528 table
[TTileMode
][R16G16B16_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_UINT
>::Store
;
2529 table
[TTileMode
][R16G16B16_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 48>, R32G32B32A32_FLOAT
, R16G16B16_SINT
>::Store
;
2530 table
[TTileMode
][R10G10B10A2_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_SNORM
>::StoreGeneric
;
2531 table
[TTileMode
][R10G10B10A2_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_USCALED
>::StoreGeneric
;
2532 table
[TTileMode
][R10G10B10A2_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_SSCALED
>::StoreGeneric
;
2533 table
[TTileMode
][R10G10B10A2_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, R10G10B10A2_SINT
>::StoreGeneric
;
2534 table
[TTileMode
][B10G10R10A2_SNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_SNORM
>::StoreGeneric
;
2535 table
[TTileMode
][B10G10R10A2_USCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_USCALED
>::StoreGeneric
;
2536 table
[TTileMode
][B10G10R10A2_SSCALED
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_SSCALED
>::StoreGeneric
;
2537 table
[TTileMode
][B10G10R10A2_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_UINT
>::StoreGeneric
;
2538 table
[TTileMode
][B10G10R10A2_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32G32B32A32_FLOAT
, B10G10R10A2_SINT
>::StoreGeneric
;
2539 table
[TTileMode
][R8G8B8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_UINT
>::Store
;
2540 table
[TTileMode
][R8G8B8_SINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 24>, R32G32B32A32_FLOAT
, R8G8B8_SINT
>::Store
;
2543 //////////////////////////////////////////////////////////////////////////
2544 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2545 template <SWR_TILE_MODE TTileMode
, size_t NumTileModes
, size_t ArraySizeT
>
2546 void InitStoreTilesTableDepth(
2547 PFN_STORE_TILES(&table
)[NumTileModes
][ArraySizeT
])
2549 table
[TTileMode
][R32_FLOAT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32_FLOAT
, R32_FLOAT
>::Store
;
2550 table
[TTileMode
][R32_FLOAT_X8X24_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 64>, R32_FLOAT
, R32_FLOAT_X8X24_TYPELESS
>::Store
;
2551 table
[TTileMode
][R24_UNORM_X8_TYPELESS
] = StoreMacroTile
<TilingTraits
<TTileMode
, 32>, R32_FLOAT
, R24_UNORM_X8_TYPELESS
>::Store
;
2552 table
[TTileMode
][R16_UNORM
] = StoreMacroTile
<TilingTraits
<TTileMode
, 16>, R32_FLOAT
, R16_UNORM
>::Store
;
2555 template <SWR_TILE_MODE TTileMode
, size_t NumTileModes
, size_t ArraySizeT
>
2556 void InitStoreTilesTableStencil(
2557 PFN_STORE_TILES(&table
)[NumTileModes
][ArraySizeT
])
2559 table
[TTileMode
][R8_UINT
] = StoreMacroTile
<TilingTraits
<TTileMode
, 8>, R8_UINT
, R8_UINT
>::Store
;