swr/rast: Refactor memory API between rasterizer core and swr
[mesa.git] / src / gallium / drivers / swr / rasterizer / memory / StoreTile.h
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.h
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
35
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "memory/SurfaceState.h"
39 #include "core/multisample.h"
40
41 #include <array>
42 #include <sstream>
43
// Element count of a statically sized array.  Only meaningful for true
// arrays: applied to a pointer (or a decayed array parameter) it silently
// yields the wrong value.
// The definition is guarded so that this header can be included alongside
// other headers that already provide an ARRAY_SIZE macro.
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
45
46 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
47 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
48
49 //////////////////////////////////////////////////////////////////////////
50 /// Store Raster Tile Function Tables.
51 //////////////////////////////////////////////////////////////////////////
52 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
53 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
54 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
55
// Store-tile table initialization routines (declarations only).
// NOTE(review): judging by the names, each per-tile-mode routine presumably
// fills part of the store-tile function tables and InitStoreTilesTable()
// presumably drives them all -- confirm against the definitions.
void InitStoreTilesTable();

void InitStoreTilesTable_Linear_1();
void InitStoreTilesTable_Linear_2();

void InitStoreTilesTable_TileX_1();
void InitStoreTilesTable_TileX_2();

void InitStoreTilesTable_TileY_1();
void InitStoreTilesTable_TileY_2();

void InitStoreTilesTable_TileW();
64
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for writing one SIMD raster tile out to a set
///        of destination row pointers.  Store is declared deleted here,
///        so a call only compiles for a (PixelSize, NumDests) pair that
///        has an explicit specialization.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of destination pointers.  Each pointer is
///        to a single row of at most 16B.
/// @tparam PixelSize - Pixel size selector; the converters in this file
///        pass FormatTraits<Format>::bpp.
/// @tparam NumDests - Number of destination pointers.  Each pair of
///        pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    // Intentionally unusable: only the explicit specializations provide Store.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
79
80 //////////////////////////////////////////////////////////////////////////
81 /// StorePixels (32-bit pixel specialization)
82 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
83 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
84 /// @param ppDsts - Array of destination pointers. Each pointer is
85 /// to a single row of at most 16B.
86 /// @tparam NumDests - Number of destination pointers. Each pair of
87 /// pointers is for a 16-byte column of two rows.
88 //////////////////////////////////////////////////////////////////////////
89 template <>
90 struct StorePixels<8, 2>
91 {
92 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
93 {
94 // Each 4-pixel row is 4 bytes.
95 const uint16_t* pPixSrc = (const uint16_t*)pSrc;
96
97 // Unswizzle from SWR-Z order
98 uint16_t* pRow = (uint16_t*)ppDsts[0];
99 pRow[0] = pPixSrc[0];
100 pRow[1] = pPixSrc[2];
101
102 pRow = (uint16_t*)ppDsts[1];
103 pRow[0] = pPixSrc[1];
104 pRow[1] = pPixSrc[3];
105 }
106 };
107
108 template <>
109 struct StorePixels<8, 4>
110 {
111 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
112 {
113 // 8 x 2 bytes = 16 bytes, 16 pixels
114 const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
115
116 uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
117
118 // Unswizzle from SWR-Z order
119 ppDsts16[0][0] = pSrc16[0]; // 0 1
120 ppDsts16[0][1] = pSrc16[2]; // 4 5
121
122 ppDsts16[1][0] = pSrc16[1]; // 2 3
123 ppDsts16[1][1] = pSrc16[3]; // 6 7
124
125 ppDsts16[2][0] = pSrc16[4]; // 8 9
126 ppDsts16[2][1] = pSrc16[6]; // C D
127
128 ppDsts16[3][0] = pSrc16[5]; // A B
129 ppDsts16[3][1] = pSrc16[7]; // E F
130 }
131 };
132
133 //////////////////////////////////////////////////////////////////////////
134 /// StorePixels (32-bit pixel specialization)
135 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
136 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
137 /// @param ppDsts - Array of destination pointers. Each pointer is
138 /// to a single row of at most 16B.
139 /// @tparam NumDests - Number of destination pointers. Each pair of
140 /// pointers is for a 16-byte column of two rows.
141 //////////////////////////////////////////////////////////////////////////
142 template <>
143 struct StorePixels<16, 2>
144 {
145 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
146 {
147 // Each 4-pixel row is 8 bytes.
148 const uint32_t* pPixSrc = (const uint32_t*)pSrc;
149
150 // Unswizzle from SWR-Z order
151 uint32_t* pRow = (uint32_t*)ppDsts[0];
152 pRow[0] = pPixSrc[0];
153 pRow[1] = pPixSrc[2];
154
155 pRow = (uint32_t*)ppDsts[1];
156 pRow[0] = pPixSrc[1];
157 pRow[1] = pPixSrc[3];
158 }
159 };
160
161 template <>
162 struct StorePixels<16, 4>
163 {
164 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
165 {
166 // 8 x 4 bytes = 32 bytes, 16 pixels
167 const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
168
169 uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
170
171 // Unswizzle from SWR-Z order
172 ppDsts32[0][0] = pSrc32[0]; // 0 1
173 ppDsts32[0][1] = pSrc32[2]; // 4 5
174
175 ppDsts32[1][0] = pSrc32[1]; // 2 3
176 ppDsts32[1][1] = pSrc32[3]; // 6 7
177
178 ppDsts32[2][0] = pSrc32[4]; // 8 9
179 ppDsts32[2][1] = pSrc32[6]; // C D
180
181 ppDsts32[3][0] = pSrc32[5]; // A B
182 ppDsts32[3][1] = pSrc32[7]; // E F
183 }
184 };
185
186 //////////////////////////////////////////////////////////////////////////
187 /// StorePixels (32-bit pixel specialization)
188 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
189 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
190 /// @param ppDsts - Array of destination pointers. Each pointer is
191 /// to a single row of at most 16B.
192 /// @tparam NumDests - Number of destination pointers. Each pair of
193 /// pointers is for a 16-byte column of two rows.
194 //////////////////////////////////////////////////////////////////////////
195 template <>
196 struct StorePixels<32, 2>
197 {
198 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
199 {
200 // Each 4-pixel row is 16-bytes
201 simd4scalari *pZRow01 = (simd4scalari*)pSrc;
202 simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
203 simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
204
205 simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
206 simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
207
208 SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
209 SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
210 }
211 };
212
213 template <>
214 struct StorePixels<32, 4>
215 {
216 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
217 {
218 // 4 x 16 bytes = 64 bytes, 16 pixels
219 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
220
221 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
222
223 // Unswizzle from SWR-Z order
224 simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]); // 0 1 2 3
225 simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]); // 4 5 6 7
226 simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]); // 8 9 A B
227 simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]); // C D E F
228
229 SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1)); // 0 1 4 5
230 SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1)); // 2 3 6 7
231 SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3)); // 8 9 C D
232 SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3)); // A B E F
233 }
234 };
235
236 //////////////////////////////////////////////////////////////////////////
237 /// StorePixels (32-bit pixel specialization)
238 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
239 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
240 /// @param ppDsts - Array of destination pointers. Each pointer is
241 /// to a single row of at most 16B.
242 /// @tparam NumDests - Number of destination pointers. Each pair of
243 /// pointers is for a 16-byte column of two rows.
244 //////////////////////////////////////////////////////////////////////////
245 template <>
246 struct StorePixels<64, 4>
247 {
248 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
249 {
250 // Each 4-pixel row is 32 bytes.
251 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
252
253 // order of pointers match SWR-Z layout
254 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
255 *pvDsts[0] = pPixSrc[0];
256 *pvDsts[1] = pPixSrc[1];
257 *pvDsts[2] = pPixSrc[2];
258 *pvDsts[3] = pPixSrc[3];
259 }
260 };
261
262 template <>
263 struct StorePixels<64, 8>
264 {
265 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
266 {
267 // 8 x 16 bytes = 128 bytes, 16 pixels
268 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
269
270 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
271
272 // order of pointers match SWR-Z layout
273 *ppDsts128[0] = pSrc128[0]; // 0 1
274 *ppDsts128[1] = pSrc128[1]; // 2 3
275 *ppDsts128[2] = pSrc128[2]; // 4 5
276 *ppDsts128[3] = pSrc128[3]; // 6 7
277 *ppDsts128[4] = pSrc128[4]; // 8 9
278 *ppDsts128[5] = pSrc128[5]; // A B
279 *ppDsts128[6] = pSrc128[6]; // C D
280 *ppDsts128[7] = pSrc128[7]; // E F
281 }
282 };
283
284 //////////////////////////////////////////////////////////////////////////
285 /// StorePixels (32-bit pixel specialization)
286 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
287 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
288 /// @param ppDsts - Array of destination pointers. Each pointer is
289 /// to a single row of at most 16B.
290 /// @tparam NumDests - Number of destination pointers. Each pair of
291 /// pointers is for a 16-byte column of two rows.
292 //////////////////////////////////////////////////////////////////////////
293 template <>
294 struct StorePixels<128, 8>
295 {
296 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
297 {
298 // Each 4-pixel row is 64 bytes.
299 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
300
301 // Unswizzle from SWR-Z order
302 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
303 *pvDsts[0] = pPixSrc[0];
304 *pvDsts[1] = pPixSrc[2];
305 *pvDsts[2] = pPixSrc[1];
306 *pvDsts[3] = pPixSrc[3];
307 *pvDsts[4] = pPixSrc[4];
308 *pvDsts[5] = pPixSrc[6];
309 *pvDsts[6] = pPixSrc[5];
310 *pvDsts[7] = pPixSrc[7];
311 }
312 };
313
314 template <>
315 struct StorePixels<128, 16>
316 {
317 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
318 {
319 // 16 x 16 bytes = 256 bytes, 16 pixels
320 const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
321
322 simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
323
324 for (uint32_t i = 0; i < 16; i += 4)
325 {
326 *ppDsts128[i + 0] = pSrc128[i + 0];
327 *ppDsts128[i + 1] = pSrc128[i + 2];
328 *ppDsts128[i + 2] = pSrc128[i + 1];
329 *ppDsts128[i + 3] = pSrc128[i + 3];
330 }
331 }
332 };
333
334 //////////////////////////////////////////////////////////////////////////
335 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
336 //////////////////////////////////////////////////////////////////////////
337 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
338 struct ConvertPixelsSOAtoAOS
339 {
340 //////////////////////////////////////////////////////////////////////////
341 /// @brief Converts a SIMD from the Hot Tile to the destination format
342 /// and converts from SOA to AOS.
343 /// @param pSrc - Pointer to raster tile.
344 /// @param pDst - Pointer to destination surface or deswizzling buffer.
345 template <size_t NumDests>
346 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
347 {
348 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
349
350 OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
351 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
352
353 // Convert from SrcFormat --> DstFormat
354 simd16vector src;
355 LoadSOA<SrcFormat>(pSrc, src);
356 StoreSOA<DstFormat>(src, soaTile);
357
358 // Convert from SOA --> AOS
359 FormatTraits<DstFormat>::TransposeT::Transpose_simd16(soaTile, aosTile);
360
361 // Store data into destination
362 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
363 }
364 };
365
366 //////////////////////////////////////////////////////////////////////////
367 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
368 /// Specialization for no format conversion
369 //////////////////////////////////////////////////////////////////////////
370 template<SWR_FORMAT Format>
371 struct ConvertPixelsSOAtoAOS<Format, Format>
372 {
373 //////////////////////////////////////////////////////////////////////////
374 /// @brief Converts a SIMD from the Hot Tile to the destination format
375 /// and converts from SOA to AOS.
376 /// @param pSrc - Pointer to raster tile.
377 /// @param pDst - Pointer to destination surface or deswizzling buffer.
378 template <size_t NumDests>
379 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
380 {
381 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
382
383 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
384
385 // Convert from SOA --> AOS
386 FormatTraits<Format>::TransposeT::Transpose_simd16(pSrc, aosTile);
387
388 // Store data into destination
389 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
390 }
391 };
392
393 //////////////////////////////////////////////////////////////////////////
394 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
395 //////////////////////////////////////////////////////////////////////////
396 template<>
397 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
398 {
399 //////////////////////////////////////////////////////////////////////////
400 /// @brief Converts a SIMD from the Hot Tile to the destination format
401 /// and converts from SOA to AOS.
402 /// @param pSrc - Pointer to raster tile.
403 /// @param pDst - Pointer to destination surface or deswizzling buffer.
404 template <size_t NumDests>
405 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
406 {
407 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
408 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
409
410 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
411
412 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
413
414 // Load hot-tile
415 simd16vector src, dst;
416 LoadSOA<SrcFormat>(pSrc, src);
417
418 // deswizzle
419 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
420 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
421 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
422
423 // clamp
424 dst.x = Clamp<DstFormat>(dst.x, 0);
425 dst.y = Clamp<DstFormat>(dst.y, 1);
426 dst.z = Clamp<DstFormat>(dst.z, 2);
427
428 // normalize
429 dst.x = Normalize<DstFormat>(dst.x, 0);
430 dst.y = Normalize<DstFormat>(dst.y, 1);
431 dst.z = Normalize<DstFormat>(dst.z, 2);
432
433 // pack
434 simd16scalari packed = _simd16_castps_si(dst.x);
435
436 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
437 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
438
439 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
440 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
441
442 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
443 uint32_t *pPacked = (uint32_t*)&packed;
444 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
445 for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
446 {
447 *pAosTile++ = *pPacked++;
448 }
449
450 // Store data into destination
451 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
452 }
453 };
454
455 //////////////////////////////////////////////////////////////////////////
456 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
457 //////////////////////////////////////////////////////////////////////////
458 template<>
459 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
460 {
461 static const SWR_FORMAT SrcFormat = R32_FLOAT;
462 static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
463
464 //////////////////////////////////////////////////////////////////////////
465 /// @brief Converts a SIMD from the Hot Tile to the destination format
466 /// and converts from SOA to AOS.
467 /// @param pSrc - Pointer to raster tile.
468 /// @param pDst - Pointer to destination surface or deswizzling buffer.
469 template <size_t NumDests>
470 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
471 {
472 simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
473
474 // clamp
475 const simd16scalar zero = _simd16_setzero_ps();
476 const simd16scalar ones = _simd16_set1_ps(1.0f);
477
478 comp = _simd16_max_ps(comp, zero);
479 comp = _simd16_min_ps(comp, ones);
480
481 // normalize
482 comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
483
484 simd16scalari temp = _simd16_cvtps_epi32(comp);
485
486 // swizzle
487 temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
488
489 // merge/store data into destination but don't overwrite the X8 bits
490 simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
491 simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
492
493 simd16scalari dest = _simd16_setzero_si();
494
495 dest = _simd16_insert_si(dest, destlo, 0);
496 dest = _simd16_insert_si(dest, desthi, 1);
497
498 simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
499
500 dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
501
502 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
503 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
504 }
505 };
506
507 template<SWR_FORMAT DstFormat>
508 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
509 {
510 // swizzle rgba -> bgra while we load
511 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
512 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
513 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
514 simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
515
516 // clamp
517 const simd16scalar zero = _simd16_setzero_ps();
518 const simd16scalar ones = _simd16_set1_ps(1.0f);
519
520 comp0 = _simd16_max_ps(comp0, zero);
521 comp0 = _simd16_min_ps(comp0, ones);
522
523 comp1 = _simd16_max_ps(comp1, zero);
524 comp1 = _simd16_min_ps(comp1, ones);
525
526 comp2 = _simd16_max_ps(comp2, zero);
527 comp2 = _simd16_min_ps(comp2, ones);
528
529 comp3 = _simd16_max_ps(comp3, zero);
530 comp3 = _simd16_min_ps(comp3, ones);
531
532 // gamma-correct only rgb
533 if (FormatTraits<DstFormat>::isSRGB)
534 {
535 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
536 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
537 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
538 }
539
540 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
541 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
542 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
543 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
544 comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
545
546 // moving to 16 wide integer vector types
547 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
548 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
549 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
550 simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
551
552 // SOA to AOS conversion
553 src1 = _simd16_slli_epi32(src1, 8);
554 src2 = _simd16_slli_epi32(src2, 16);
555 src3 = _simd16_slli_epi32(src3, 24);
556
557 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
558
559 // de-swizzle conversion
560 #if 1
561 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
562 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
563
564 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
565
566 #else
567 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
568
569 #endif
570 // store 8x2 memory order:
571 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
572 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
573 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
574 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
575 }
576
577 template<SWR_FORMAT DstFormat>
578 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
579 {
580 static const uint32_t offset = sizeof(simdscalar);
581
582 // swizzle rgba -> bgra while we load
583 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
584 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
585 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
586 simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
587
588 // clamp
589 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
590 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
591
592 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
593 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
594
595 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
596 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
597
598 vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
599 vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
600
601 if (FormatTraits<DstFormat>::isSRGB)
602 {
603 // Gamma-correct only rgb
604 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
605 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
606 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
607 }
608
609 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
610 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
611 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
612 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
613 vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
614
615 // moving to 8 wide integer vector types
616 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
617 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
618 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
619 simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
620
621 #if KNOB_ARCH <= KNOB_ARCH_AVX
622
623 // splitting into two sets of 4 wide integer vector types
624 // because AVX doesn't have instructions to support this operation at 8 wide
625 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
626 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
627 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
628 simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
629
630 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
631 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
632 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
633 simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
634
635 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
636 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
637 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
638 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
639 srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
640 srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
641
642 srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
643 srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
644
645 srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
646 srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
647
648 srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
649 srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
650
651 // unpack into rows that get the tiling order correct
652 simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
653 simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
654
655 simdscalari final = _mm256_castsi128_si256(vRow00);
656 final = _mm256_insertf128_si256(final, vRow10, 1);
657
658 #else
659
660 // logic is as above, only wider
661 src1 = _mm256_slli_si256(src1, 1);
662 src2 = _mm256_slli_si256(src2, 2);
663 src3 = _mm256_slli_si256(src3, 3);
664
665 src0 = _mm256_or_si256(src0, src1);
666 src2 = _mm256_or_si256(src2, src3);
667
668 simdscalari final = _mm256_or_si256(src0, src2);
669
670 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
671 final = _mm256_permute4x64_epi64(final, 0xD8);
672 #endif
673
674 _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
675 }
676
677 template<SWR_FORMAT DstFormat>
678 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
679 {
680 // swizzle rgba -> bgra while we load
681 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
682 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
683 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
684
685 // clamp
686 const simd16scalar zero = _simd16_setzero_ps();
687 const simd16scalar ones = _simd16_set1_ps(1.0f);
688
689 comp0 = _simd16_max_ps(comp0, zero);
690 comp0 = _simd16_min_ps(comp0, ones);
691
692 comp1 = _simd16_max_ps(comp1, zero);
693 comp1 = _simd16_min_ps(comp1, ones);
694
695 comp2 = _simd16_max_ps(comp2, zero);
696 comp2 = _simd16_min_ps(comp2, ones);
697
698 // gamma-correct only rgb
699 if (FormatTraits<DstFormat>::isSRGB)
700 {
701 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
702 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
703 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
704 }
705
706 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
707 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
708 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
709 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
710
711 // moving to 16 wide integer vector types
712 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
713 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
714 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
715
716 // SOA to AOS conversion
717 src1 = _simd16_slli_epi32(src1, 8);
718 src2 = _simd16_slli_epi32(src2, 16);
719
720 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
721
722 // de-swizzle conversion
723 #if 1
724 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
725 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
726
727 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
728
729 #else
730 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
731
732 #endif
733 // store 8x2 memory order:
734 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
735 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
736 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
737 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
738 }
739
740 template<SWR_FORMAT DstFormat>
741 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
742 {
743 static const uint32_t offset = sizeof(simdscalar);
744
745 // swizzle rgba -> bgra while we load
746 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
747 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
748 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
749 // clamp
750 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
751 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
752
753 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
754 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
755
756 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
757 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
758
759 if (FormatTraits<DstFormat>::isSRGB)
760 {
761 // Gamma-correct only rgb
762 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
763 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
764 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
765 }
766
767 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
768 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
769 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
770 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
771
772 // moving to 8 wide integer vector types
773 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
774 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
775 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
776
777 #if KNOB_ARCH <= KNOB_ARCH_AVX
778
779 // splitting into two sets of 4 wide integer vector types
780 // because AVX doesn't have instructions to support this operation at 8 wide
781 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
782 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
783 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
784
785 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
786 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
787 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
788
789 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
790 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
791 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
792 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
793
794 srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
795
796 srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
797
798 srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
799 srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
800
801 // unpack into rows that get the tiling order correct
802 simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
803 simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
804
805 simdscalari final = _mm256_castsi128_si256(vRow00);
806 final = _mm256_insertf128_si256(final, vRow10, 1);
807
808 #else
809
810 // logic is as above, only wider
811 src1 = _mm256_slli_si256(src1, 1);
812 src2 = _mm256_slli_si256(src2, 2);
813
814 src0 = _mm256_or_si256(src0, src1);
815
816 simdscalari final = _mm256_or_si256(src0, src2);
817
818 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
819 final = _mm256_permute4x64_epi64(final, 0xD8);
820
821 #endif
822
823 _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
824 }
825
826 template<>
827 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
828 {
829 template <size_t NumDests>
830 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
831 {
832 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
833 }
834 };
835
836 template<>
837 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
838 {
839 template <size_t NumDests>
840 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
841 {
842 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
843 }
844 };
845
846 template<>
847 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
848 {
849 template <size_t NumDests>
850 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
851 {
852 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
853 }
854 };
855
856 template<>
857 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
858 {
859 template <size_t NumDests>
860 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
861 {
862 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
863 }
864 };
865
866 template<>
867 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
868 {
869 template <size_t NumDests>
870 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
871 {
872 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
873 }
874 };
875
876 template<>
877 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
878 {
879 template <size_t NumDests>
880 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
881 {
882 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
883 }
884 };
885
886 template<>
887 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
888 {
889 template <size_t NumDests>
890 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
891 {
892 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
893 }
894 };
895
896 template<>
897 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
898 {
899 template <size_t NumDests>
900 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
901 {
902 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
903 }
904 };
905
906 //////////////////////////////////////////////////////////////////////////
907 /// StoreRasterTile
908 //////////////////////////////////////////////////////////////////////////
909 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
910 struct StoreRasterTile
911 {
912 //////////////////////////////////////////////////////////////////////////
913 /// @brief Retrieve color from hot tile source which is always float.
914 /// @param pSrc - Pointer to raster tile.
915 /// @param x, y - Coordinates to raster tile.
916 /// @param output - output color
917 INLINE static void GetSwizzledSrcColor(
918 uint8_t* pSrc,
919 uint32_t x, uint32_t y,
920 float outputColor[4])
921 {
922 typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
923
924 SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
925
926 // Compute which simd tile we're accessing within 8x8 tile.
927 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
928 uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
929
930 SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
931
932 uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
933
934 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
935 }
936
937 //////////////////////////////////////////////////////////////////////////
938 /// @brief Stores an 8x8 raster tile to the destination surface.
939 /// @param pSrc - Pointer to raster tile.
940 /// @param pDstSurface - Destination surface state
941 /// @param x, y - Coordinates to raster tile.
942 INLINE static void Store(
943 uint8_t *pSrc,
944 SWR_SURFACE_STATE* pDstSurface,
945 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
946 {
947 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
948 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
949
950 // For each raster tile pixel (rx, ry)
951 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
952 {
953 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
954 {
955 // Perform bounds checking.
956 if (((x + rx) < lodWidth) &&
957 ((y + ry) < lodHeight))
958 {
959 float srcColor[4];
960 GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
961
962 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
963 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
964 sampleNum, pDstSurface->lod, pDstSurface);
965 {
966 ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
967 }
968 }
969 }
970 }
971 }
972
973 //////////////////////////////////////////////////////////////////////////
974 /// @brief Resolves an 8x8 raster tile to the resolve destination surface.
975 /// @param pSrc - Pointer to raster tile.
976 /// @param pDstSurface - Destination surface state
977 /// @param x, y - Coordinates to raster tile.
978 /// @param sampleOffset - Offset between adjacent multisamples
979 INLINE static void Resolve(
980 uint8_t *pSrc,
981 SWR_SURFACE_STATE* pDstSurface,
982 uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
983 {
984 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
985 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
986
987 float oneOverNumSamples = 1.0f / pDstSurface->numSamples;
988
989 // For each raster tile pixel (rx, ry)
990 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
991 {
992 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
993 {
994 // Perform bounds checking.
995 if (((x + rx) < lodWidth) &&
996 ((y + ry) < lodHeight))
997 {
998 // Sum across samples
999 float resolveColor[4] = {0};
1000 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1001 {
1002 float sampleColor[4] = {0};
1003 uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum;
1004 GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor);
1005 resolveColor[0] += sampleColor[0];
1006 resolveColor[1] += sampleColor[1];
1007 resolveColor[2] += sampleColor[2];
1008 resolveColor[3] += sampleColor[3];
1009 }
1010
1011 // Divide by numSamples to average
1012 resolveColor[0] *= oneOverNumSamples;
1013 resolveColor[1] *= oneOverNumSamples;
1014 resolveColor[2] *= oneOverNumSamples;
1015 resolveColor[3] *= oneOverNumSamples;
1016
1017 // Use the resolve surface state
1018 SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress;
1019 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1020 pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex,
1021 0, pResolveSurface->lod, pResolveSurface);
1022 {
1023 ConvertPixelFromFloat<DstFormat>(pDst, resolveColor);
1024 }
1025 }
1026 }
1027 }
1028 }
1029
1030 };
1031
1032 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1033 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
1034 {};
1035
1036 //////////////////////////////////////////////////////////////////////////
1037 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1038 //////////////////////////////////////////////////////////////////////////
1039 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1040 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1041 {
1042 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1043 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1044 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1045
1046 //////////////////////////////////////////////////////////////////////////
1047 /// @brief Stores an 8x8 raster tile to the destination surface.
1048 /// @param pSrc - Pointer to raster tile.
1049 /// @param pDstSurface - Destination surface state
1050 /// @param x, y - Coordinates to raster tile.
1051 INLINE static void Store(
1052 uint8_t *pSrc,
1053 SWR_SURFACE_STATE* pDstSurface,
1054 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1055 {
1056 // Punt non-full tiles to generic store
1057 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1058 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1059
1060 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1061 {
1062 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1063 }
1064
1065 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1066 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1067
1068 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1069 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1070
1071 uint8_t* ppDsts[] =
1072 {
1073 pDst, // row 0, col 0
1074 pDst + pDstSurface->pitch, // row 1, col 0
1075 pDst + dx / 2, // row 0, col 1
1076 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1077 };
1078
1079 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1080 {
1081 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1082 {
1083 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1084
1085 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1086
1087 ppDsts[0] += dx;
1088 ppDsts[1] += dx;
1089 ppDsts[2] += dx;
1090 ppDsts[3] += dx;
1091 }
1092
1093 ppDsts[0] += dy;
1094 ppDsts[1] += dy;
1095 ppDsts[2] += dy;
1096 ppDsts[3] += dy;
1097 }
1098 }
1099 };
1100
1101 //////////////////////////////////////////////////////////////////////////
1102 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1103 //////////////////////////////////////////////////////////////////////////
1104 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1105 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1106 {
1107 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1108 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1109 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1110
1111 //////////////////////////////////////////////////////////////////////////
1112 /// @brief Stores an 8x8 raster tile to the destination surface.
1113 /// @param pSrc - Pointer to raster tile.
1114 /// @param pDstSurface - Destination surface state
1115 /// @param x, y - Coordinates to raster tile.
1116 INLINE static void Store(
1117 uint8_t *pSrc,
1118 SWR_SURFACE_STATE* pDstSurface,
1119 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1120 {
1121 // Punt non-full tiles to generic store
1122 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1123 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1124
1125 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1126 {
1127 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1128 }
1129
1130 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1131 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1132
1133 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1134 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1135
1136 uint8_t* ppDsts[] =
1137 {
1138 pDst, // row 0, col 0
1139 pDst + pDstSurface->pitch, // row 1, col 0
1140 pDst + dx / 2, // row 0, col 1
1141 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1142 };
1143
1144 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1145 {
1146 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1147 {
1148 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1149
1150 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1151
1152 ppDsts[0] += dx;
1153 ppDsts[1] += dx;
1154 ppDsts[2] += dx;
1155 ppDsts[3] += dx;
1156 }
1157
1158 ppDsts[0] += dy;
1159 ppDsts[1] += dy;
1160 ppDsts[2] += dy;
1161 ppDsts[3] += dy;
1162 }
1163 }
1164 };
1165
1166 //////////////////////////////////////////////////////////////////////////
1167 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1168 //////////////////////////////////////////////////////////////////////////
1169 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1170 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1171 {
1172 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1173 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1174 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1175
1176 //////////////////////////////////////////////////////////////////////////
1177 /// @brief Stores an 8x8 raster tile to the destination surface.
1178 /// @param pSrc - Pointer to raster tile.
1179 /// @param pDstSurface - Destination surface state
1180 /// @param x, y - Coordinates to raster tile.
1181 INLINE static void Store(
1182 uint8_t *pSrc,
1183 SWR_SURFACE_STATE* pDstSurface,
1184 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1185 {
1186 // Punt non-full tiles to generic store
1187 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1188 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1189
1190 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1191 {
1192 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1193 }
1194
1195 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1196 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1197
1198 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1199 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1200
1201 uint8_t* ppDsts[] =
1202 {
1203 pDst, // row 0, col 0
1204 pDst + pDstSurface->pitch, // row 1, col 0
1205 pDst + dx / 2, // row 0, col 1
1206 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1207 };
1208
1209 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1210 {
1211 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1212 {
1213 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1214
1215 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1216
1217 ppDsts[0] += dx;
1218 ppDsts[1] += dx;
1219 ppDsts[2] += dx;
1220 ppDsts[3] += dx;
1221 }
1222
1223 ppDsts[0] += dy;
1224 ppDsts[1] += dy;
1225 ppDsts[2] += dy;
1226 ppDsts[3] += dy;
1227 }
1228 }
1229 };
1230
1231 //////////////////////////////////////////////////////////////////////////
1232 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1233 //////////////////////////////////////////////////////////////////////////
1234 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1235 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1236 {
1237 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1238 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1239 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1240 static const size_t MAX_DST_COLUMN_BYTES = 16;
1241
1242 //////////////////////////////////////////////////////////////////////////
1243 /// @brief Stores an 8x8 raster tile to the destination surface.
1244 /// @param pSrc - Pointer to raster tile.
1245 /// @param pDstSurface - Destination surface state
1246 /// @param x, y - Coordinates to raster tile.
1247 INLINE static void Store(
1248 uint8_t *pSrc,
1249 SWR_SURFACE_STATE* pDstSurface,
1250 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1251 {
1252 // Punt non-full tiles to generic store
1253 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1254 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1255
1256 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1257 {
1258 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1259 }
1260
1261 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1262 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1263
1264 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1265 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1266
1267 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1268 static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1269
1270 uint8_t *ppDsts[] =
1271 {
1272 pDst, // row 0, col 0
1273 pDst + pDstSurface->pitch, // row 1, col 0
1274 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1275 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1276 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1277 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1278 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1279 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
1280 };
1281
1282 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1283 {
1284 // Raster tile width is same as simd16 tile width
1285 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1286
1287 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1288
1289 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1290
1291 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1292 {
1293 ppDsts[i] += dy;
1294 }
1295 }
1296 }
1297 };
1298
1299 //////////////////////////////////////////////////////////////////////////
1300 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1301 //////////////////////////////////////////////////////////////////////////
1302 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1303 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1304 {
1305 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1306 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1307 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1308 static const size_t MAX_DST_COLUMN_BYTES = 16;
1309
1310 //////////////////////////////////////////////////////////////////////////
1311 /// @brief Stores an 8x8 raster tile to the destination surface.
1312 /// @param pSrc - Pointer to raster tile.
1313 /// @param pDstSurface - Destination surface state
1314 /// @param x, y - Coordinates to raster tile.
1315 INLINE static void Store(
1316 uint8_t *pSrc,
1317 SWR_SURFACE_STATE* pDstSurface,
1318 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1319 {
1320 // Punt non-full tiles to generic store
1321 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1322 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1323
1324 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1325 {
1326 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1327 }
1328
1329 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1330 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1331
1332 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1333 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1334
1335 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1336 static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1337
1338 uint8_t* ppDsts[] =
1339 {
1340 pDst, // row 0, col 0
1341 pDst + pDstSurface->pitch, // row 1, col 0
1342 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1343 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1344 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1345 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1346 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1347 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
1348 pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4
1349 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4
1350 pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5
1351 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5
1352 pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6
1353 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6
1354 pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7
1355 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7
1356 };
1357
1358 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1359 {
1360 // Raster tile width is same as simd16 tile width
1361 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1362
1363 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1364
1365 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1366
1367 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1368 {
1369 ppDsts[i] += dy;
1370 }
1371 }
1372 }
1373 };
1374
1375 //////////////////////////////////////////////////////////////////////////
1376 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1377 //////////////////////////////////////////////////////////////////////////
1378 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1379 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1380 {
1381 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1382 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1383
1384 //////////////////////////////////////////////////////////////////////////
1385 /// @brief Stores an 8x8 raster tile to the destination surface.
1386 /// @param pSrc - Pointer to raster tile.
1387 /// @param pDstSurface - Destination surface state
1388 /// @param x, y - Coordinates to raster tile.
1389 INLINE static void Store(
1390 uint8_t *pSrc,
1391 SWR_SURFACE_STATE* pDstSurface,
1392 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1393 {
1394 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1395
1396 // Punt non-full tiles to generic store
1397 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1398 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1399
1400 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1401 {
1402 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1403 }
1404
1405 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1406 // We can compute the offsets to each column within the raster tile once and increment from these.
1407 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1408 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1409 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1410
1411 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1412
1413 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1414 uint8_t *ppDsts[] =
1415 {
1416 pDst,
1417 pDst + DestRowWidthBytes,
1418 pDst + DestRowWidthBytes / 4,
1419 pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1420 };
1421
1422 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1423 {
1424 // Raster tile width is same as simd16 tile width
1425 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1426
1427 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1428
1429 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1430
1431 ppDsts[0] += dy;
1432 ppDsts[1] += dy;
1433 ppDsts[2] += dy;
1434 ppDsts[3] += dy;
1435 }
1436 }
1437 };
1438
1439 //////////////////////////////////////////////////////////////////////////
1440 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1441 //////////////////////////////////////////////////////////////////////////
1442 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1443 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1444 {
1445 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1446 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1447
1448 //////////////////////////////////////////////////////////////////////////
1449 /// @brief Stores an 8x8 raster tile to the destination surface.
1450 /// @param pSrc - Pointer to raster tile.
1451 /// @param pDstSurface - Destination surface state
1452 /// @param x, y - Coordinates to raster tile.
1453 INLINE static void Store(
1454 uint8_t *pSrc,
1455 SWR_SURFACE_STATE* pDstSurface,
1456 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1457 {
1458 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1459
1460 // Punt non-full tiles to generic store
1461 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1462 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1463
1464 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1465 {
1466 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1467 }
1468
1469 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1470 // We can compute the offsets to each column within the raster tile once and increment from these.
1471 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1472 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1473 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1474
1475 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1476
1477 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1478 uint8_t *ppDsts[] =
1479 {
1480 pDst,
1481 pDst + DestRowWidthBytes,
1482 pDst + DestRowWidthBytes / 2,
1483 pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1484 };
1485
1486 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1487 {
1488 // Raster tile width is same as simd16 tile width
1489 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1490
1491 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1492
1493 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1494
1495 ppDsts[0] += dy;
1496 ppDsts[1] += dy;
1497 ppDsts[2] += dy;
1498 ppDsts[3] += dy;
1499 }
1500 }
1501 };
1502
1503 //////////////////////////////////////////////////////////////////////////
1504 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1505 //////////////////////////////////////////////////////////////////////////
1506 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1507 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1508 {
1509 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1510 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1511 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1512
1513 //////////////////////////////////////////////////////////////////////////
1514 /// @brief Stores an 8x8 raster tile to the destination surface.
1515 /// @param pSrc - Pointer to raster tile.
1516 /// @param pDstSurface - Destination surface state
1517 /// @param x, y - Coordinates to raster tile.
1518 INLINE static void Store(
1519 uint8_t *pSrc,
1520 SWR_SURFACE_STATE* pDstSurface,
1521 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1522 {
1523 static const uint32_t DestRowWidthBytes = 512; // 512B rows
1524
1525 // Punt non-full tiles to generic store
1526 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1527 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1528
1529 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1530 {
1531 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1532 }
1533
1534 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1535 // We can compute the offsets to each column within the raster tile once and increment from these.
1536 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1537 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1538
1539 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1540 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1541
1542 uint8_t* ppDsts[] =
1543 {
1544 pDst, // row 0, col 0
1545 pDst + DestRowWidthBytes, // row 1, col 0
1546 pDst + dx / 2, // row 0, col 1
1547 pDst + DestRowWidthBytes + dx / 2 // row 1, col 1
1548 };
1549
1550 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1551 {
1552 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1553 {
1554 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1555
1556 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1557
1558 ppDsts[0] += dx;
1559 ppDsts[1] += dx;
1560 ppDsts[2] += dx;
1561 ppDsts[3] += dx;
1562 }
1563
1564 ppDsts[0] += dy;
1565 ppDsts[1] += dy;
1566 ppDsts[2] += dy;
1567 ppDsts[3] += dy;
1568 }
1569 }
1570 };
1571
1572 //////////////////////////////////////////////////////////////////////////
1573 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1574 //////////////////////////////////////////////////////////////////////////
1575 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1576 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1577 {
1578 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1579 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1580
1581 //////////////////////////////////////////////////////////////////////////
1582 /// @brief Stores an 8x8 raster tile to the destination surface.
1583 /// @param pSrc - Pointer to raster tile.
1584 /// @param pDstSurface - Destination surface state
1585 /// @param x, y - Coordinates to raster tile.
1586 INLINE static void Store(
1587 uint8_t *pSrc,
1588 SWR_SURFACE_STATE* pDstSurface,
1589 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1590 {
1591 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1592 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1593
1594 // Punt non-full tiles to generic store
1595 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1596 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1597
1598 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1599 {
1600 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1601 }
1602
1603 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1604 // We can compute the offsets to each column within the raster tile once and increment from these.
1605 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1606 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1607 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1608
1609 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1610 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1611
1612 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1613 uint8_t *ppDsts[] =
1614 {
1615 pDst, // row 0, col 0
1616 pDst + DestRowWidthBytes, // row 1, col 0
1617 pDst + DestColumnBytes, // row 0, col 1
1618 pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1
1619 };
1620
1621 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1622 {
1623 // Raster tile width is same as simd16 tile width
1624 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1625
1626 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1627
1628 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1629
1630 ppDsts[0] += dy;
1631 ppDsts[1] += dy;
1632 ppDsts[2] += dy;
1633 ppDsts[3] += dy;
1634 }
1635 }
1636 };
1637
1638 //////////////////////////////////////////////////////////////////////////
1639 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
1640 //////////////////////////////////////////////////////////////////////////
1641 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1642 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
1643 {
1644 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
1645 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1646
1647 //////////////////////////////////////////////////////////////////////////
1648 /// @brief Stores an 8x8 raster tile to the destination surface.
1649 /// @param pSrc - Pointer to raster tile.
1650 /// @param pDstSurface - Destination surface state
1651 /// @param x, y - Coordinates to raster tile.
1652 INLINE static void Store(
1653 uint8_t *pSrc,
1654 SWR_SURFACE_STATE* pDstSurface,
1655 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1656 {
1657 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1658 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1659
1660 // Punt non-full tiles to generic store
1661 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1662 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1663
1664 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1665 {
1666 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1667 }
1668
1669 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1670 // We can compute the offsets to each column within the raster tile once and increment from these.
1671 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1672 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1673 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1674
1675 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1676 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1677
1678 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1679 uint8_t *ppDsts[] =
1680 {
1681 pDst, // row 0, col 0
1682 pDst + DestRowWidthBytes, // row 1, col 0
1683 pDst + DestColumnBytes, // row 0, col 1
1684 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
1685 pDst + DestColumnBytes * 2, // row 0, col 2
1686 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
1687 pDst + DestColumnBytes * 3, // row 0, col 3
1688 pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3
1689 };
1690
1691 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1692 {
1693 // Raster tile width is same as simd16 tile width
1694 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1695
1696 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1697
1698 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1699
1700 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1701 {
1702 ppDsts[i] += dy;
1703 }
1704 }
1705 }
1706 };
1707
1708 //////////////////////////////////////////////////////////////////////////
1709 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
1710 //////////////////////////////////////////////////////////////////////////
1711 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1712 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
1713 {
1714 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
1715 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1716
1717 //////////////////////////////////////////////////////////////////////////
1718 /// @brief Stores an 8x8 raster tile to the destination surface.
1719 /// @param pSrc - Pointer to raster tile.
1720 /// @param pDstSurface - Destination surface state
1721 /// @param x, y - Coordinates to raster tile.
1722 INLINE static void Store(
1723 uint8_t *pSrc,
1724 SWR_SURFACE_STATE* pDstSurface,
1725 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1726 {
1727 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1728 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1729
1730 // Punt non-full tiles to generic store
1731 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1732 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1733
1734 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1735 {
1736 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1737 }
1738
1739 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1740 // We can compute the offsets to each column within the raster tile once and increment from these.
1741 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1742 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1743 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1744
1745 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1746 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1747
1748 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1749 uint8_t *ppDsts[] =
1750 {
1751 pDst, // row 0, col 0
1752 pDst + DestRowWidthBytes, // row 1, col 0
1753 pDst + DestColumnBytes, // row 0, col 1
1754 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
1755 pDst + DestColumnBytes * 2, // row 0, col 2
1756 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
1757 pDst + DestColumnBytes * 3, // row 0, col 3
1758 pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
1759 pDst + DestColumnBytes * 4, // row 0, col 4
1760 pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
1761 pDst + DestColumnBytes * 5, // row 0, col 5
1762 pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
1763 pDst + DestColumnBytes * 6, // row 0, col 6
1764 pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
1765 pDst + DestColumnBytes * 7, // row 0, col 7
1766 pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7
1767 };
1768
1769 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1770 {
1771 // Raster tile width is same as simd16 tile width
1772 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1773
1774 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1775
1776 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1777
1778 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1779 {
1780 ppDsts[i] += dy;
1781 }
1782 }
1783 }
1784 };
1785
1786 //////////////////////////////////////////////////////////////////////////
1787 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
1788 //////////////////////////////////////////////////////////////////////////
1789 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1790 struct StoreMacroTile
1791 {
1792 //////////////////////////////////////////////////////////////////////////
1793 /// @brief Stores a macrotile to the destination surface using safe implementation.
1794 /// @param pSrc - Pointer to macro tile.
1795 /// @param pDstSurface - Destination surface state
1796 /// @param x, y - Coordinates to macro tile
1797 static void StoreGeneric(
1798 uint8_t *pSrcHotTile,
1799 SWR_SURFACE_STATE* pDstSurface,
1800 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1801 {
1802 PFN_STORE_TILES_INTERNAL pfnStore;
1803 pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
1804
1805 // Store each raster tile from the hot tile to the destination surface.
1806 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1807 {
1808 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1809 {
1810 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1811 {
1812 pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
1813 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1814 }
1815 }
1816 }
1817
1818 }
1819
1820 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
1821 //////////////////////////////////////////////////////////////////////////
1822 /// @brief Stores a macrotile to the destination surface.
1823 /// @param pSrc - Pointer to macro tile.
1824 /// @param pDstSurface - Destination surface state
1825 /// @param x, y - Coordinates to macro tile
1826 static void Store(
1827 uint8_t *pSrcHotTile,
1828 SWR_SURFACE_STATE* pDstSurface,
1829 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1830 {
1831 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
1832
1833 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1834 {
1835 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
1836 0,
1837 0,
1838 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
1839 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
1840 sampleNum,
1841 pDstSurface->lod,
1842 pDstSurface);
1843
1844 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
1845 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
1846 (pDstSurface->bInterleavedSamples);
1847
1848 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
1849 }
1850
1851 // Save original for pSrcHotTile resolve.
1852 uint8_t *pResolveSrcHotTile = pSrcHotTile;
1853
1854 // Store each raster tile from the hot tile to the destination surface.
1855 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1856 {
1857 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1858 {
1859 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1860 {
1861 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
1862 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1863 }
1864 }
1865 }
1866
1867 if (pDstSurface->xpAuxBaseAddress)
1868 {
1869 uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1870 // Store each raster tile from the hot tile to the destination surface.
1871 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1872 {
1873 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1874 {
1875 StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex);
1876 pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples;
1877 }
1878 }
1879 }
1880 }
1881 };
1882
1883 //////////////////////////////////////////////////////////////////////////
1884 /// InitStoreTilesTable - Helper for setting up the tables.
1885 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
1886 void InitStoreTilesTableColor_Half1(
1887 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
1888 {
1889 table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
1890 table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
1891 table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
1892 table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
1893 table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
1894 table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
1895 table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
1896 table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
1897 table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
1898 table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
1899 table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
1900 table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
1901 table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
1902 table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
1903 table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
1904 table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
1905 table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
1906 table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
1907 table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
1908 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
1909 table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
1910 table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
1911 table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
1912 table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
1913 table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
1914 table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
1915 table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
1916 table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
1917 table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
1918 table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
1919 table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
1920 table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
1921 table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
1922 table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
1923 table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
1924 table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
1925 table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
1926 table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
1927 table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
1928 table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
1929 table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
1930 table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
1931 table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
1932 table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
1933 table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
1934 table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
1935 table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
1936 table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
1937 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
1938 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
1939 table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
1940 table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
1941 table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
1942 table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
1943 table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
1944 table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
1945 }
1946
1947 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
1948 void InitStoreTilesTableColor_Half2(
1949 PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
1950 {
1951 table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
1952 table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
1953 table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
1954 table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
1955 table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
1956 table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
1957 table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
1958 table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
1959 table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
1960 table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
1961 table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
1962 table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
1963 table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
1964 table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
1965 table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
1966 table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
1967 table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
1968 table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
1969 table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
1970 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
1971 table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
1972 table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
1973 table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
1974 table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
1975 table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
1976 table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
1977 table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
1978 table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
1979 table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
1980 table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
1981 table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
1982 table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
1983 table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
1984 table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
1985 table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
1986 table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
1987 table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
1988 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
1989 table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
1990 table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
1991 table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
1992 table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
1993 table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
1994 table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
1995 table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
1996 table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
1997 table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
1998 table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
1999 table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
2000 table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2001 table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2002 table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2003 table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2004 table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2005 table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2006 table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2007 table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2008 table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2009 table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2010 table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2011 table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2012 table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2013 table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2014 table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2015 }
2016
2017 //////////////////////////////////////////////////////////////////////////
2018 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2019 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2020 void InitStoreTilesTableDepth(
2021 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2022 {
2023 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2024 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2025 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2026 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2027 }
2028
2029 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2030 void InitStoreTilesTableStencil(
2031 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2032 {
2033 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2034 }
2035
2036
2037 //////////////////////////////////////////////////////////////////////////
2038 /// @brief Deswizzles and stores a full hottile to a render surface
2039 /// @param hPrivateContext - Handle to private DC
2040 /// @param srcFormat - Format for hot tile.
2041 /// @param renderTargetIndex - Index to destination render target
2042 /// @param x, y - Coordinates to raster tile.
2043 /// @param pSrcHotTile - Pointer to Hot Tile
2044 void SwrStoreHotTileToSurface(
2045 HANDLE hWorkerPrivateData,
2046 SWR_SURFACE_STATE *pDstSurface,
2047 SWR_FORMAT srcFormat,
2048 SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
2049 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
2050 uint8_t *pSrcHotTile);