swr: use ARRAY_SIZE macro
[mesa.git] / src / gallium / drivers / swr / rasterizer / memory / StoreTile.h
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.h
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
35
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "core/multisample.h"
39
40 #include <array>
41 #include <sstream>
42
// Number of elements in a built-in array (do not use on pointers).
// Guarded so that a definition already supplied by another header is reused
// instead of being redefined here.
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
44
45 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
46 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
47
48 //////////////////////////////////////////////////////////////////////////
49 /// Store Raster Tile Function Tables.
50 //////////////////////////////////////////////////////////////////////////
51 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
52 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
53 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
54
55 void InitStoreTilesTable_Linear_1();
56 void InitStoreTilesTable_Linear_2();
57 void InitStoreTilesTable_TileX_1();
58 void InitStoreTilesTable_TileX_2();
59 void InitStoreTilesTable_TileY_1();
60 void InitStoreTilesTable_TileY_2();
61 void InitStoreTilesTable_TileW();
62 void InitStoreTilesTable();
63
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Stores one SIMD raster-tile to its destination rows.
///        The primary template only declares the interface: Store() is
///        deleted, so a (PixelSize, NumDests) pair compiles only when an
///        explicit specialization exists for it.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of destination pointers. Each pointer is
///                 to a single row of at most 16B.
/// @tparam PixelSize - Bits per pixel of the data being stored.
/// @tparam NumDests - Number of destination pointers. Each pair of
///                    pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};

//////////////////////////////////////////////////////////////////////////
/// StorePixels (8-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
///               (8 bytes are read).
/// @param ppDsts - Two destination row pointers; 4 bytes (four pixels)
///                 are written through each.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<8, 2>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
    {
        // The tile is moved two pixels (one 16-bit word) at a time.
        const uint16_t* const pSrcPairs = reinterpret_cast<const uint16_t*>(pSrc);

        // Un-swizzle: even source words land in row 0 ...
        uint16_t* const pRow0 = reinterpret_cast<uint16_t*>(ppDsts[0]);
        pRow0[0] = pSrcPairs[0];
        pRow0[1] = pSrcPairs[2];

        // ... and odd source words land in row 1.
        uint16_t* const pRow1 = reinterpret_cast<uint16_t*>(ppDsts[1]);
        pRow1[0] = pSrcPairs[1];
        pRow1[1] = pSrcPairs[3];
    }
};
106
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (8-bit pixel specialization, four destinations)
/// @brief Stores a 16-pixel raster-tile (16 source bytes) through four
///        destination pointers, 4 bytes each.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Four destination pointers.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<8, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 16 one-byte pixels, handled as eight 2-pixel (16-bit) chunks.
        const uint16_t* const pChunks = reinterpret_cast<const uint16_t*>(pSrc);
        uint16_t** const      ppOut   = reinterpret_cast<uint16_t**>(ppDsts);

        // Un-swizzle from SWR-Z order. Source pixel indices per destination:
        //   dst0 <- 0 1 4 5    dst1 <- 2 3 6 7
        //   dst2 <- 8 9 C D    dst3 <- A B E F
        for (uint32_t half = 0; half < 2; ++half)
        {
            const uint16_t* const pIn = pChunks + 4 * half;

            uint16_t* const pEven = ppOut[2 * half];
            pEven[0] = pIn[0];
            pEven[1] = pIn[2];

            uint16_t* const pOdd = ppOut[2 * half + 1];
            pOdd[0] = pIn[1];
            pOdd[1] = pIn[3];
        }
    }
};

#endif
134 //////////////////////////////////////////////////////////////////////////
135 /// StorePixels (32-bit pixel specialization)
136 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
137 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
138 /// @param ppDsts - Array of destination pointers. Each pointer is
139 /// to a single row of at most 16B.
140 /// @tparam NumDests - Number of destination pointers. Each pair of
141 /// pointers is for a 16-byte column of two rows.
142 //////////////////////////////////////////////////////////////////////////
143 template <>
144 struct StorePixels<16, 2>
145 {
146 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
147 {
148 // Each 4-pixel row is 8 bytes.
149 const uint32_t* pPixSrc = (const uint32_t*)pSrc;
150
151 // Unswizzle from SWR-Z order
152 uint32_t* pRow = (uint32_t*)ppDsts[0];
153 pRow[0] = pPixSrc[0];
154 pRow[1] = pPixSrc[2];
155
156 pRow = (uint32_t*)ppDsts[1];
157 pRow[0] = pPixSrc[1];
158 pRow[1] = pPixSrc[3];
159 }
160 };
161
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (16-bit pixel specialization, four destinations)
/// @brief Stores a 16-pixel raster-tile (32 source bytes) through four
///        destination pointers, 8 bytes each.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Four destination pointers.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<16, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 16 two-byte pixels, handled as eight 2-pixel (32-bit) chunks.
        const uint32_t* const pChunks = reinterpret_cast<const uint32_t*>(pSrc);
        uint32_t** const      ppOut   = reinterpret_cast<uint32_t**>(ppDsts);

        // Un-swizzle from SWR-Z order. Source pixel indices per destination:
        //   dst0 <- 0 1 4 5    dst1 <- 2 3 6 7
        //   dst2 <- 8 9 C D    dst3 <- A B E F
        for (uint32_t half = 0; half < 2; ++half)
        {
            const uint32_t* const pIn = pChunks + 4 * half;

            uint32_t* const pEven = ppOut[2 * half];
            pEven[0] = pIn[0];
            pEven[1] = pIn[2];

            uint32_t* const pOdd = ppOut[2 * half + 1];
            pOdd[0] = pIn[1];
            pOdd[1] = pIn[3];
        }
    }
};

#endif
189 //////////////////////////////////////////////////////////////////////////
190 /// StorePixels (32-bit pixel specialization)
191 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
192 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
193 /// @param ppDsts - Array of destination pointers. Each pointer is
194 /// to a single row of at most 16B.
195 /// @tparam NumDests - Number of destination pointers. Each pair of
196 /// pointers is for a 16-byte column of two rows.
197 //////////////////////////////////////////////////////////////////////////
198 template <>
199 struct StorePixels<32, 2>
200 {
201 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
202 {
203 // Each 4-pixel row is 16-bytes
204 simd4scalari *pZRow01 = (simd4scalari*)pSrc;
205 simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
206 simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
207
208 simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
209 simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
210
211 SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
212 SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
213 }
214 };
215
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization, four destinations)
/// @brief Stores a 16-pixel raster-tile (64 source bytes) through four
///        destination pointers, one 16-byte unaligned store each.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Four destination pointers.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<32, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        const simd4scalari* const pQuads = reinterpret_cast<const simd4scalari*>(pSrc);
        simd4scalari** const      ppOut  = reinterpret_cast<simd4scalari**>(ppDsts);

        // Read the whole tile first: four quads of four pixels.
        simd4scalari q0 = SIMD128::load_si(&pQuads[0]); // pixels 0 1 2 3
        simd4scalari q1 = SIMD128::load_si(&pQuads[1]); // pixels 4 5 6 7
        simd4scalari q2 = SIMD128::load_si(&pQuads[2]); // pixels 8 9 A B
        simd4scalari q3 = SIMD128::load_si(&pQuads[3]); // pixels C D E F

        // Un-swizzle from SWR-Z order while storing.
        SIMD128::storeu_si(ppOut[0], SIMD128::unpacklo_epi64(q0, q1)); // 0 1 4 5
        SIMD128::storeu_si(ppOut[1], SIMD128::unpackhi_epi64(q0, q1)); // 2 3 6 7
        SIMD128::storeu_si(ppOut[2], SIMD128::unpacklo_epi64(q2, q3)); // 8 9 C D
        SIMD128::storeu_si(ppOut[3], SIMD128::unpackhi_epi64(q2, q3)); // A B E F
    }
};

#endif
241 //////////////////////////////////////////////////////////////////////////
242 /// StorePixels (32-bit pixel specialization)
243 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
244 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
245 /// @param ppDsts - Array of destination pointers. Each pointer is
246 /// to a single row of at most 16B.
247 /// @tparam NumDests - Number of destination pointers. Each pair of
248 /// pointers is for a 16-byte column of two rows.
249 //////////////////////////////////////////////////////////////////////////
250 template <>
251 struct StorePixels<64, 4>
252 {
253 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
254 {
255 // Each 4-pixel row is 32 bytes.
256 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
257
258 // order of pointers match SWR-Z layout
259 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
260 *pvDsts[0] = pPixSrc[0];
261 *pvDsts[1] = pPixSrc[1];
262 *pvDsts[2] = pPixSrc[2];
263 *pvDsts[3] = pPixSrc[3];
264 }
265 };
266
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (64-bit pixel specialization, eight destinations)
/// @brief Stores a 16-pixel raster-tile (128 source bytes) through eight
///        destination pointers, 16 bytes (two pixels) each.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Eight destination pointers, supplied in the same order
///                 as the source data, so no shuffling happens here.
/// NOTE(review): data is written with a plain vector assignment rather than
/// storeu_si - presumably the destinations must be 16-byte aligned; confirm.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<64, 8>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    {
        const simd4scalari* const pIn   = reinterpret_cast<const simd4scalari*>(pSrc);
        simd4scalari** const      ppOut = reinterpret_cast<simd4scalari**>(ppDsts);

        // Destination i receives source vector i, i.e. pixels (2i, 2i+1).
        for (uint32_t i = 0; i < 8; ++i)
        {
            *ppOut[i] = pIn[i];
        }
    }
};

#endif
291 //////////////////////////////////////////////////////////////////////////
292 /// StorePixels (32-bit pixel specialization)
293 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
294 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
295 /// @param ppDsts - Array of destination pointers. Each pointer is
296 /// to a single row of at most 16B.
297 /// @tparam NumDests - Number of destination pointers. Each pair of
298 /// pointers is for a 16-byte column of two rows.
299 //////////////////////////////////////////////////////////////////////////
300 template <>
301 struct StorePixels<128, 8>
302 {
303 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
304 {
305 // Each 4-pixel row is 64 bytes.
306 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
307
308 // Unswizzle from SWR-Z order
309 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
310 *pvDsts[0] = pPixSrc[0];
311 *pvDsts[1] = pPixSrc[2];
312 *pvDsts[2] = pPixSrc[1];
313 *pvDsts[3] = pPixSrc[3];
314 *pvDsts[4] = pPixSrc[4];
315 *pvDsts[5] = pPixSrc[6];
316 *pvDsts[6] = pPixSrc[5];
317 *pvDsts[7] = pPixSrc[7];
318 }
319 };
320
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (128-bit pixel specialization, sixteen destinations)
/// @brief Stores a 16-pixel raster-tile (256 source bytes) through sixteen
///        destination pointers, one 16-byte pixel each.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Sixteen destination pointers, one per pixel.
/// NOTE(review): data is written with a plain vector assignment rather than
/// storeu_si - presumably the destinations must be 16-byte aligned; confirm.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<128, 16>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
    {
        const simd4scalari* const pIn   = reinterpret_cast<const simd4scalari*>(pSrc);
        simd4scalari** const      ppOut = reinterpret_cast<simd4scalari**>(ppDsts);

        // Walk the tile four pixels at a time; inside each group the two
        // middle source entries trade places (0 2 1 3).
        for (uint32_t base = 0; base < 16; base += 4)
        {
            const simd4scalari* const pGroupIn   = pIn + base;
            simd4scalari** const      ppGroupOut = ppOut + base;

            *ppGroupOut[0] = pGroupIn[0];
            *ppGroupOut[1] = pGroupIn[2];
            *ppGroupOut[2] = pGroupIn[1];
            *ppGroupOut[3] = pGroupIn[3];
        }
    }
};

#endif
343 //////////////////////////////////////////////////////////////////////////
344 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
345 //////////////////////////////////////////////////////////////////////////
346 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
347 struct ConvertPixelsSOAtoAOS
348 {
349 //////////////////////////////////////////////////////////////////////////
350 /// @brief Converts a SIMD from the Hot Tile to the destination format
351 /// and converts from SOA to AOS.
352 /// @param pSrc - Pointer to raster tile.
353 /// @param pDst - Pointer to destination surface or deswizzling buffer.
354 template <size_t NumDests>
355 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
356 {
357 #if USE_8x2_TILE_BACKEND
358 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
359
360 OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
361 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
362
363 // Convert from SrcFormat --> DstFormat
364 simd16vector src;
365 LoadSOA<SrcFormat>(pSrc, src);
366 StoreSOA<DstFormat>(src, soaTile);
367
368 // Convert from SOA --> AOS
369 FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
370
371 #else
372 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
373
374 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
375 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
376
377 // Convert from SrcFormat --> DstFormat
378 simdvector src;
379 LoadSOA<SrcFormat>(pSrc, src);
380 StoreSOA<DstFormat>(src, soaTile);
381
382 // Convert from SOA --> AOS
383 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
384
385 #endif
386 // Store data into destination
387 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
388 }
389 };
390
391 //////////////////////////////////////////////////////////////////////////
392 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
393 /// Specialization for no format conversion
394 //////////////////////////////////////////////////////////////////////////
395 template<SWR_FORMAT Format>
396 struct ConvertPixelsSOAtoAOS<Format, Format>
397 {
398 //////////////////////////////////////////////////////////////////////////
399 /// @brief Converts a SIMD from the Hot Tile to the destination format
400 /// and converts from SOA to AOS.
401 /// @param pSrc - Pointer to raster tile.
402 /// @param pDst - Pointer to destination surface or deswizzling buffer.
403 template <size_t NumDests>
404 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
405 {
406 #if USE_8x2_TILE_BACKEND
407 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
408
409 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
410
411 // Convert from SOA --> AOS
412 FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
413
414 #else
415 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
416
417 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
418
419 // Convert from SOA --> AOS
420 FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
421
422 #endif
423 // Store data into destination
424 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
425 }
426 };
427
428 //////////////////////////////////////////////////////////////////////////
429 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
430 //////////////////////////////////////////////////////////////////////////
431 template<>
432 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
433 {
434 //////////////////////////////////////////////////////////////////////////
435 /// @brief Converts a SIMD from the Hot Tile to the destination format
436 /// and converts from SOA to AOS.
437 /// @param pSrc - Pointer to raster tile.
438 /// @param pDst - Pointer to destination surface or deswizzling buffer.
439 template <size_t NumDests>
440 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
441 {
442 #if USE_8x2_TILE_BACKEND
443 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
444 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
445
446 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
447
448 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
449
450 // Load hot-tile
451 simd16vector src, dst;
452 LoadSOA<SrcFormat>(pSrc, src);
453
454 // deswizzle
455 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
456 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
457 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
458
459 // clamp
460 dst.x = Clamp<DstFormat>(dst.x, 0);
461 dst.y = Clamp<DstFormat>(dst.y, 1);
462 dst.z = Clamp<DstFormat>(dst.z, 2);
463
464 // normalize
465 dst.x = Normalize<DstFormat>(dst.x, 0);
466 dst.y = Normalize<DstFormat>(dst.y, 1);
467 dst.z = Normalize<DstFormat>(dst.z, 2);
468
469 // pack
470 simd16scalari packed = _simd16_castps_si(dst.x);
471
472 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
473 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
474
475 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
476 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
477
478 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
479 uint32_t *pPacked = (uint32_t*)&packed;
480 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
481 for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
482 {
483 *pAosTile++ = *pPacked++;
484 }
485
486 #else
487 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
488 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
489 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
490
491 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
492
493 // Load hot-tile
494 simdvector src, dst;
495 LoadSOA<SrcFormat>(pSrc, src);
496
497 // deswizzle
498 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
499 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
500 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
501
502 // clamp
503 dst.x = Clamp<DstFormat>(dst.x, 0);
504 dst.y = Clamp<DstFormat>(dst.y, 1);
505 dst.z = Clamp<DstFormat>(dst.z, 2);
506
507 // normalize
508 dst.x = Normalize<DstFormat>(dst.x, 0);
509 dst.y = Normalize<DstFormat>(dst.y, 1);
510 dst.z = Normalize<DstFormat>(dst.z, 2);
511
512 // pack
513 simdscalari packed = _simd_castps_si(dst.x);
514 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetConstBPC(0)));
515 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetConstBPC(0) +
516 FormatTraits<DstFormat>::GetConstBPC(1)));
517
518 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
519 uint32_t *pPacked = (uint32_t*)&packed;
520 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
521 for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
522 {
523 *pAosTile++ = *pPacked++;
524 }
525
526 #endif
527 // Store data into destination
528 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
529 }
530 };
531
532 //////////////////////////////////////////////////////////////////////////
533 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
534 //////////////////////////////////////////////////////////////////////////
535 template<>
536 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
537 {
538 static const SWR_FORMAT SrcFormat = R32_FLOAT;
539 static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
540
541 //////////////////////////////////////////////////////////////////////////
542 /// @brief Converts a SIMD from the Hot Tile to the destination format
543 /// and converts from SOA to AOS.
544 /// @param pSrc - Pointer to raster tile.
545 /// @param pDst - Pointer to destination surface or deswizzling buffer.
546 template <size_t NumDests>
547 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
548 {
549 #if USE_8x2_TILE_BACKEND
550 simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
551
552 // clamp
553 const simd16scalar zero = _simd16_setzero_ps();
554 const simd16scalar ones = _simd16_set1_ps(1.0f);
555
556 comp = _simd16_max_ps(comp, zero);
557 comp = _simd16_min_ps(comp, ones);
558
559 // normalize
560 comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
561
562 simd16scalari temp = _simd16_cvtps_epi32(comp);
563
564 // swizzle
565 temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
566
567 // merge/store data into destination but don't overwrite the X8 bits
568 simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
569 simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
570
571 simd16scalari dest = _simd16_setzero_si();
572
573 dest = _simd16_insert_si(dest, destlo, 0);
574 dest = _simd16_insert_si(dest, desthi, 1);
575
576 simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
577
578 dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
579
580 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
581 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
582 #else
583 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
584
585 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
586 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
587
588 // Convert from SrcFormat --> DstFormat
589 simdvector src;
590 LoadSOA<SrcFormat>(pSrc, src);
591 StoreSOA<DstFormat>(src, soaTile);
592
593 // Convert from SOA --> AOS
594 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
595
596 // Store data into destination but don't overwrite the X8 bits
597 // Each 4-pixel row is 16-bytes
598 simd4scalari *pZRow01 = (simd4scalari*)aosTile;
599 simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
600 simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
601
602 simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
603 simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
604
605 simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
606 simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);
607
608 simd4scalari vMask = _mm_set1_epi32(0xFFFFFF);
609
610 vDst0 = SIMD128::andnot_si(vMask, vDst0);
611 vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
612 vDst1 = SIMD128::andnot_si(vMask, vDst1);
613 vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));
614
615 SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
616 SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
617 #endif
618 }
619 };
620
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// FlatConvert (16-wide, four destinations)
/// @brief Turns four SOA float channel planes into 16 packed pixels with
///        8 bits per channel (channel 0 in bits 0-7 ... channel 3 in bits
///        24-31) and writes them as four 16-byte pieces.
/// @param pSrc - SOA source; channel plane c starts at
///               pSrc + c * sizeof(simd16scalar).
/// @param pDst0 - Receives pixels 0 1 4 5 (row 0, first half)
/// @param pDst1 - Receives pixels 2 3 6 7 (row 1, first half)
/// @param pDst2 - Receives pixels 8 9 C D (row 0, second half)
/// @param pDst3 - Receives pixels A B E F (row 1, second half)
//////////////////////////////////////////////////////////////////////////
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
{
    const simd16scalar zero = _simd16_setzero_ps();
    const simd16scalar ones = _simd16_set1_ps(1.0f);

    // Load every destination channel from its swizzled source plane
    // (e.g. rgba -> bgra) and clamp it to [0, 1].
    simd16scalar chan0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar)));
    chan0 = _simd16_max_ps(chan0, zero);
    chan0 = _simd16_min_ps(chan0, ones);

    simd16scalar chan1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar)));
    chan1 = _simd16_max_ps(chan1, zero);
    chan1 = _simd16_min_ps(chan1, ones);

    simd16scalar chan2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar)));
    chan2 = _simd16_max_ps(chan2, zero);
    chan2 = _simd16_min_ps(chan2, ones);

    simd16scalar chan3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar)));
    chan3 = _simd16_max_ps(chan3, zero);
    chan3 = _simd16_min_ps(chan3, ones);

    // sRGB destinations: gamma-correct the first three channels only.
    if (FormatTraits<DstFormat>::isSRGB)
    {
        chan0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, chan0);
        chan1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, chan1);
        chan2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, chan2);
    }

    // Scale 0.0f..1.0f up to the integer range of each destination channel.
    chan0 = _simd16_mul_ps(chan0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    chan1 = _simd16_mul_ps(chan1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    chan2 = _simd16_mul_ps(chan2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    chan3 = _simd16_mul_ps(chan3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));

    // Float -> 32-bit integer lanes, one channel value per lane.
    simd16scalari bits0 = _simd16_cvtps_epi32(chan0);
    simd16scalari bits1 = _simd16_cvtps_epi32(chan1);
    simd16scalari bits2 = _simd16_cvtps_epi32(chan2);
    simd16scalari bits3 = _simd16_cvtps_epi32(chan3);

    // SOA -> AOS: slide each channel to its byte and merge.
    bits1 = _simd16_slli_epi32(bits1, 8);
    bits2 = _simd16_slli_epi32(bits2, 16);
    bits3 = _simd16_slli_epi32(bits3, 24);

    simd16scalari pixels = _simd16_or_si(_simd16_or_si(bits0, bits1), _simd16_or_si(bits2, bits3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F

    // De-swizzle into store order.
#if 1
    simd16scalari evenQuads = _simd16_permute2f128_si(pixels, pixels, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
    simd16scalari oddQuads  = _simd16_permute2f128_si(pixels, pixels, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F

    pixels = _simd16_shuffle_epi64(evenQuads, oddQuads, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F

#else
    pixels = _simd16_permute_epi32(pixels, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));

#endif
    // 8x2 memory order:
    //   row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
    //   row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(pixels, 0));
    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(pixels, 1));
}

#endif
693 template<SWR_FORMAT DstFormat>
694 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
695 {
696 static const uint32_t offset = sizeof(simdscalar);
697
698 // swizzle rgba -> bgra while we load
699 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
700 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
701 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
702 simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
703
704 // clamp
705 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
706 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
707
708 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
709 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
710
711 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
712 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
713
714 vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
715 vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
716
717 if (FormatTraits<DstFormat>::isSRGB)
718 {
719 // Gamma-correct only rgb
720 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
721 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
722 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
723 }
724
725 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
726 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
727 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
728 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
729 vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
730
731 // moving to 8 wide integer vector types
732 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
733 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
734 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
735 simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
736
737 #if KNOB_ARCH <= KNOB_ARCH_AVX
738
739 // splitting into two sets of 4 wide integer vector types
740 // because AVX doesn't have instructions to support this operation at 8 wide
741 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
742 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
743 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
744 simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
745
746 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
747 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
748 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
749 simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
750
751 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
752 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
753 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
754 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
755 srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
756 srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
757
758 srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
759 srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
760
761 srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
762 srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
763
764 srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
765 srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
766
767 // unpack into rows that get the tiling order correct
768 simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
769 simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
770
771 simdscalari final = _mm256_castsi128_si256(vRow00);
772 final = _mm256_insertf128_si256(final, vRow10, 1);
773
774 #else
775
776 // logic is as above, only wider
777 src1 = _mm256_slli_si256(src1, 1);
778 src2 = _mm256_slli_si256(src2, 2);
779 src3 = _mm256_slli_si256(src3, 3);
780
781 src0 = _mm256_or_si256(src0, src1);
782 src2 = _mm256_or_si256(src2, src3);
783
784 simdscalari final = _mm256_or_si256(src0, src2);
785
786 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
787 final = _mm256_permute4x64_epi64(final, 0xD8);
788 #endif
789
790 _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
791 }
792
793 #if USE_8x2_TILE_BACKEND
794 template<SWR_FORMAT DstFormat>
795 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
796 {
797 // swizzle rgba -> bgra while we load
798 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
799 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
800 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
801
802 // clamp
803 const simd16scalar zero = _simd16_setzero_ps();
804 const simd16scalar ones = _simd16_set1_ps(1.0f);
805
806 comp0 = _simd16_max_ps(comp0, zero);
807 comp0 = _simd16_min_ps(comp0, ones);
808
809 comp1 = _simd16_max_ps(comp1, zero);
810 comp1 = _simd16_min_ps(comp1, ones);
811
812 comp2 = _simd16_max_ps(comp2, zero);
813 comp2 = _simd16_min_ps(comp2, ones);
814
815 // gamma-correct only rgb
816 if (FormatTraits<DstFormat>::isSRGB)
817 {
818 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
819 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
820 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
821 }
822
823 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
824 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
825 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
826 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
827
828 // moving to 16 wide integer vector types
829 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
830 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
831 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
832
833 // SOA to AOS conversion
834 src1 = _simd16_slli_epi32(src1, 8);
835 src2 = _simd16_slli_epi32(src2, 16);
836
837 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
838
839 // de-swizzle conversion
840 #if 1
841 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
842 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
843
844 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
845
846 #else
847 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
848
849 #endif
850 // store 8x2 memory order:
851 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
852 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
853 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
854 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
855 }
856
857 #endif
858 template<SWR_FORMAT DstFormat>
859 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
860 {
861 static const uint32_t offset = sizeof(simdscalar);
862
863 // swizzle rgba -> bgra while we load
864 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
865 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
866 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
867 // clamp
868 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
869 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
870
871 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
872 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
873
874 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
875 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
876
877 if (FormatTraits<DstFormat>::isSRGB)
878 {
879 // Gamma-correct only rgb
880 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
881 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
882 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
883 }
884
885 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
886 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
887 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
888 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
889
890 // moving to 8 wide integer vector types
891 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
892 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
893 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
894
895 #if KNOB_ARCH <= KNOB_ARCH_AVX
896
897 // splitting into two sets of 4 wide integer vector types
898 // because AVX doesn't have instructions to support this operation at 8 wide
899 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
900 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
901 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
902
903 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
904 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
905 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
906
907 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
908 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
909 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
910 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
911
912 srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
913
914 srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
915
916 srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
917 srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
918
919 // unpack into rows that get the tiling order correct
920 simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
921 simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
922
923 simdscalari final = _mm256_castsi128_si256(vRow00);
924 final = _mm256_insertf128_si256(final, vRow10, 1);
925
926 #else
927
928 // logic is as above, only wider
929 src1 = _mm256_slli_si256(src1, 1);
930 src2 = _mm256_slli_si256(src2, 2);
931
932 src0 = _mm256_or_si256(src0, src1);
933
934 simdscalari final = _mm256_or_si256(src0, src2);
935
936 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
937 final = _mm256_permute4x64_epi64(final, 0xD8);
938
939 #endif
940
941 _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
942 }
943
944 template<>
945 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
946 {
947 template <size_t NumDests>
948 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
949 {
950 #if USE_8x2_TILE_BACKEND
951 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
952 #else
953 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
954 #endif
955 }
956 };
957
958 template<>
959 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
960 {
961 template <size_t NumDests>
962 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
963 {
964 #if USE_8x2_TILE_BACKEND
965 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
966 #else
967 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
968 #endif
969 }
970 };
971
972 template<>
973 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
974 {
975 template <size_t NumDests>
976 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
977 {
978 #if USE_8x2_TILE_BACKEND
979 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
980 #else
981 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
982 #endif
983 }
984 };
985
986 template<>
987 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
988 {
989 template <size_t NumDests>
990 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
991 {
992 #if USE_8x2_TILE_BACKEND
993 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
994 #else
995 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
996 #endif
997 }
998 };
999
1000 template<>
1001 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
1002 {
1003 template <size_t NumDests>
1004 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1005 {
1006 #if USE_8x2_TILE_BACKEND
1007 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1008 #else
1009 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1010 #endif
1011 }
1012 };
1013
1014 template<>
1015 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
1016 {
1017 template <size_t NumDests>
1018 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1019 {
1020 #if USE_8x2_TILE_BACKEND
1021 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1022 #else
1023 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1024 #endif
1025 }
1026 };
1027
1028 template<>
1029 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
1030 {
1031 template <size_t NumDests>
1032 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1033 {
1034 #if USE_8x2_TILE_BACKEND
1035 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1036 #else
1037 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1038 #endif
1039 }
1040 };
1041
1042 template<>
1043 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
1044 {
1045 template <size_t NumDests>
1046 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1047 {
1048 #if USE_8x2_TILE_BACKEND
1049 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1050 #else
1051 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1052 #endif
1053 }
1054 };
1055
1056 //////////////////////////////////////////////////////////////////////////
1057 /// StoreRasterTile
1058 //////////////////////////////////////////////////////////////////////////
1059 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1060 struct StoreRasterTile
1061 {
1062 //////////////////////////////////////////////////////////////////////////
1063 /// @brief Retrieve color from hot tile source which is always float.
1064 /// @param pSrc - Pointer to raster tile.
1065 /// @param x, y - Coordinates to raster tile.
1066 /// @param output - output color
1067 INLINE static void GetSwizzledSrcColor(
1068 uint8_t* pSrc,
1069 uint32_t x, uint32_t y,
1070 float outputColor[4])
1071 {
1072 #if USE_8x2_TILE_BACKEND
1073 typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
1074
1075 SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
1076
1077 // Compute which simd tile we're accessing within 8x8 tile.
1078 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1079 uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
1080
1081 SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
1082
1083 uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
1084
1085 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1086 #else
1087 typedef SimdTile<SrcFormat, DstFormat> SimdT;
1088
1089 SimdT* pSrcSimdTiles = (SimdT*)pSrc;
1090
1091 // Compute which simd tile we're accessing within 8x8 tile.
1092 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1093 uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
1094
1095 SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
1096
1097 uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
1098
1099 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1100 #endif
1101 }
1102
1103 //////////////////////////////////////////////////////////////////////////
1104 /// @brief Stores an 8x8 raster tile to the destination surface.
1105 /// @param pSrc - Pointer to raster tile.
1106 /// @param pDstSurface - Destination surface state
1107 /// @param x, y - Coordinates to raster tile.
1108 INLINE static void Store(
1109 uint8_t *pSrc,
1110 SWR_SURFACE_STATE* pDstSurface,
1111 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
1112 {
1113 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1114 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1115
1116 // For each raster tile pixel (rx, ry)
1117 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
1118 {
1119 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
1120 {
1121 // Perform bounds checking.
1122 if (((x + rx) < lodWidth) &&
1123 ((y + ry) < lodHeight))
1124 {
1125 float srcColor[4];
1126 GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
1127
1128 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1129 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
1130 sampleNum, pDstSurface->lod, pDstSurface);
1131 {
1132 ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
1133 }
1134 }
1135 }
1136 }
1137 }
1138
1139 //////////////////////////////////////////////////////////////////////////
1140 /// @brief Resolves an 8x8 raster tile to the resolve destination surface.
1141 /// @param pSrc - Pointer to raster tile.
1142 /// @param pDstSurface - Destination surface state
1143 /// @param x, y - Coordinates to raster tile.
1144 /// @param sampleOffset - Offset between adjacent multisamples
1145 INLINE static void Resolve(
1146 uint8_t *pSrc,
1147 SWR_SURFACE_STATE* pDstSurface,
1148 uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
1149 {
1150 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1151 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1152
1153 float oneOverNumSamples = 1.0f / pDstSurface->numSamples;
1154
1155 // For each raster tile pixel (rx, ry)
1156 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
1157 {
1158 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
1159 {
1160 // Perform bounds checking.
1161 if (((x + rx) < lodWidth) &&
1162 ((y + ry) < lodHeight))
1163 {
1164 // Sum across samples
1165 float resolveColor[4] = {0};
1166 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1167 {
1168 float sampleColor[4] = {0};
1169 uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum;
1170 GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor);
1171 resolveColor[0] += sampleColor[0];
1172 resolveColor[1] += sampleColor[1];
1173 resolveColor[2] += sampleColor[2];
1174 resolveColor[3] += sampleColor[3];
1175 }
1176
1177 // Divide by numSamples to average
1178 resolveColor[0] *= oneOverNumSamples;
1179 resolveColor[1] *= oneOverNumSamples;
1180 resolveColor[2] *= oneOverNumSamples;
1181 resolveColor[3] *= oneOverNumSamples;
1182
1183 // Use the resolve surface state
1184 SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress;
1185 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1186 pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex,
1187 0, pResolveSurface->lod, pResolveSurface);
1188 {
1189 ConvertPixelFromFloat<DstFormat>(pDst, resolveColor);
1190 }
1191 }
1192 }
1193 }
1194 }
1195
1196 };
1197
1198 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1199 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
1200 {};
1201
1202 //////////////////////////////////////////////////////////////////////////
1203 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1204 //////////////////////////////////////////////////////////////////////////
1205 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1206 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1207 {
1208 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1209 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1210 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1211
1212 //////////////////////////////////////////////////////////////////////////
1213 /// @brief Stores an 8x8 raster tile to the destination surface.
1214 /// @param pSrc - Pointer to raster tile.
1215 /// @param pDstSurface - Destination surface state
1216 /// @param x, y - Coordinates to raster tile.
1217 INLINE static void Store(
1218 uint8_t *pSrc,
1219 SWR_SURFACE_STATE* pDstSurface,
1220 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1221 {
1222 // Punt non-full tiles to generic store
1223 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1224 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1225
1226 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1227 {
1228 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1229 }
1230
1231 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1232 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1233 #if USE_8x2_TILE_BACKEND
1234
1235 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1236 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1237
1238 uint8_t* ppDsts[] =
1239 {
1240 pDst, // row 0, col 0
1241 pDst + pDstSurface->pitch, // row 1, col 0
1242 pDst + dx / 2, // row 0, col 1
1243 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1244 };
1245
1246 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1247 {
1248 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1249 {
1250 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1251
1252 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1253
1254 ppDsts[0] += dx;
1255 ppDsts[1] += dx;
1256 ppDsts[2] += dx;
1257 ppDsts[3] += dx;
1258 }
1259
1260 ppDsts[0] += dy;
1261 ppDsts[1] += dy;
1262 ppDsts[2] += dy;
1263 ppDsts[3] += dy;
1264 }
1265 #else
1266 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1267
1268 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1269 {
1270 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1271
1272 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1273 {
1274 // Format conversion and convert from SOA to AOS, and store the rows.
1275 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1276
1277 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1278 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1279 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1280 }
1281
1282 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1283 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1284 }
1285 #endif
1286 }
1287 };
1288
1289 //////////////////////////////////////////////////////////////////////////
1290 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1291 //////////////////////////////////////////////////////////////////////////
1292 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1293 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1294 {
1295 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1296 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1297 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1298
1299 //////////////////////////////////////////////////////////////////////////
1300 /// @brief Stores an 8x8 raster tile to the destination surface.
1301 /// @param pSrc - Pointer to raster tile.
1302 /// @param pDstSurface - Destination surface state
1303 /// @param x, y - Coordinates to raster tile.
1304 INLINE static void Store(
1305 uint8_t *pSrc,
1306 SWR_SURFACE_STATE* pDstSurface,
1307 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1308 {
1309 // Punt non-full tiles to generic store
1310 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1311 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1312
1313 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1314 {
1315 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1316 }
1317
1318 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1319 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1320 #if USE_8x2_TILE_BACKEND
1321
1322 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1323 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1324
1325 uint8_t* ppDsts[] =
1326 {
1327 pDst, // row 0, col 0
1328 pDst + pDstSurface->pitch, // row 1, col 0
1329 pDst + dx / 2, // row 0, col 1
1330 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1331 };
1332
1333 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1334 {
1335 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1336 {
1337 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1338
1339 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1340
1341 ppDsts[0] += dx;
1342 ppDsts[1] += dx;
1343 ppDsts[2] += dx;
1344 ppDsts[3] += dx;
1345 }
1346
1347 ppDsts[0] += dy;
1348 ppDsts[1] += dy;
1349 ppDsts[2] += dy;
1350 ppDsts[3] += dy;
1351 }
1352 #else
1353 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1354
1355 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1356 {
1357 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1358
1359 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1360 {
1361 // Format conversion and convert from SOA to AOS, and store the rows.
1362 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1363
1364 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1365 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1366 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1367 }
1368
1369 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1370 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1371 }
1372 #endif
1373 }
1374 };
1375
1376 //////////////////////////////////////////////////////////////////////////
1377 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1378 //////////////////////////////////////////////////////////////////////////
1379 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1380 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1381 {
1382 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1383 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1384 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1385
1386 //////////////////////////////////////////////////////////////////////////
1387 /// @brief Stores an 8x8 raster tile to the destination surface.
1388 /// @param pSrc - Pointer to raster tile.
1389 /// @param pDstSurface - Destination surface state
1390 /// @param x, y - Coordinates to raster tile.
1391 INLINE static void Store(
1392 uint8_t *pSrc,
1393 SWR_SURFACE_STATE* pDstSurface,
1394 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1395 {
1396 // Punt non-full tiles to generic store
1397 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1398 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1399
1400 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1401 {
1402 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1403 }
1404
1405 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1406 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1407 #if USE_8x2_TILE_BACKEND
1408
1409 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1410 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1411
1412 uint8_t* ppDsts[] =
1413 {
1414 pDst, // row 0, col 0
1415 pDst + pDstSurface->pitch, // row 1, col 0
1416 pDst + dx / 2, // row 0, col 1
1417 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1418 };
1419
1420 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1421 {
1422 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1423 {
1424 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1425
1426 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1427
1428 ppDsts[0] += dx;
1429 ppDsts[1] += dx;
1430 ppDsts[2] += dx;
1431 ppDsts[3] += dx;
1432 }
1433
1434 ppDsts[0] += dy;
1435 ppDsts[1] += dy;
1436 ppDsts[2] += dy;
1437 ppDsts[3] += dy;
1438 }
1439 #else
1440 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1441
1442 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1443 {
1444 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1445
1446 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1447 {
1448 // Format conversion and convert from SOA to AOS, and store the rows.
1449 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1450
1451 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1452 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1453 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1454 }
1455
1456 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1457 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1458 }
1459 #endif
1460 }
1461 };
1462
1463 //////////////////////////////////////////////////////////////////////////
1464 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1465 //////////////////////////////////////////////////////////////////////////
1466 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1467 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1468 {
1469 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1470 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1471 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1472 static const size_t MAX_DST_COLUMN_BYTES = 16;
1473 #if !USE_8x2_TILE_BACKEND
1474 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1475 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1476 #endif
1477
1478 //////////////////////////////////////////////////////////////////////////
1479 /// @brief Stores an 8x8 raster tile to the destination surface.
1480 /// @param pSrc - Pointer to raster tile.
1481 /// @param pDstSurface - Destination surface state
1482 /// @param x, y - Coordinates to raster tile.
1483 INLINE static void Store(
1484 uint8_t *pSrc,
1485 SWR_SURFACE_STATE* pDstSurface,
1486 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1487 {
1488 // Punt non-full tiles to generic store
1489 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1490 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1491
1492 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1493 {
1494 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1495 }
1496
1497 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1498 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1499 #if USE_8x2_TILE_BACKEND
1500
1501 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1502 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1503
1504 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1505 static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1506
1507 uint8_t *ppDsts[] =
1508 {
1509 pDst, // row 0, col 0
1510 pDst + pDstSurface->pitch, // row 1, col 0
1511 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1512 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1513 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1514 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1515 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1516 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
1517 };
1518
1519 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1520 {
1521 // Raster tile width is same as simd16 tile width
1522 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1523
1524 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1525
1526 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1527
1528 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1529 {
1530 ppDsts[i] += dy;
1531 }
1532 }
1533 #else
1534 uint8_t* ppDsts[] =
1535 {
1536 pDst, // row 0, col 0
1537 pDst + pDstSurface->pitch, // row 1, col 0
1538 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1539 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1540 };
1541
1542 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1543 {
1544 uint8_t* ppStartRows[] =
1545 {
1546 ppDsts[0],
1547 ppDsts[1],
1548 ppDsts[2],
1549 ppDsts[3],
1550 };
1551
1552 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1553 {
1554 // Format conversion and convert from SOA to AOS, and store the rows.
1555 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1556
1557 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1558 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1559 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1560 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1561 pSrc += SRC_COLUMN_BYTES;
1562 }
1563
1564 ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1565 ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1566 ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
1567 ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
1568 }
1569 #endif
1570 }
1571 };
1572
1573 //////////////////////////////////////////////////////////////////////////
1574 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1575 //////////////////////////////////////////////////////////////////////////
1576 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1577 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1578 {
1579 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1580 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1581 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1582 static const size_t MAX_DST_COLUMN_BYTES = 16;
1583 #if !USE_8x2_TILE_BACKEND
1584 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1585 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1586 #endif
1587
1588 //////////////////////////////////////////////////////////////////////////
1589 /// @brief Stores an 8x8 raster tile to the destination surface.
1590 /// @param pSrc - Pointer to raster tile.
1591 /// @param pDstSurface - Destination surface state
1592 /// @param x, y - Coordinates to raster tile.
1593 INLINE static void Store(
1594 uint8_t *pSrc,
1595 SWR_SURFACE_STATE* pDstSurface,
1596 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1597 {
1598 // Punt non-full tiles to generic store
1599 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1600 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1601
1602 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1603 {
1604 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1605 }
1606
1607 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1608 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1609 #if USE_8x2_TILE_BACKEND
1610
1611 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1612 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1613
1614 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1615 static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1616
1617 uint8_t* ppDsts[] =
1618 {
1619 pDst, // row 0, col 0
1620 pDst + pDstSurface->pitch, // row 1, col 0
1621 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1622 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1623 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1624 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1625 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1626 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
1627 pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4
1628 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4
1629 pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5
1630 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5
1631 pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6
1632 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6
1633 pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7
1634 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7
1635 };
1636
1637 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1638 {
1639 // Raster tile width is same as simd16 tile width
1640 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1641
1642 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1643
1644 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1645
1646 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1647 {
1648 ppDsts[i] += dy;
1649 }
1650 }
1651 #else
1652 struct DstPtrs
1653 {
1654 uint8_t* ppDsts[8];
1655 } ptrs;
1656
1657 // Need 8 pointers, 4 columns of 2 rows each
1658 for (uint32_t y = 0; y < 2; ++y)
1659 {
1660 for (uint32_t x = 0; x < 4; ++x)
1661 {
1662 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1663 }
1664 }
1665
1666 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1667 {
1668 DstPtrs startPtrs = ptrs;
1669
1670 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1671 {
1672 // Format conversion and convert from SOA to AOS, and store the rows.
1673 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1674
1675 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1676 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1677 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1678 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1679 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1680 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1681 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1682 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1683 pSrc += SRC_COLUMN_BYTES;
1684 }
1685
1686 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
1687 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
1688 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
1689 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
1690 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
1691 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
1692 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
1693 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
1694 }
1695 #endif
1696 }
1697 };
1698
1699 //////////////////////////////////////////////////////////////////////////
1700 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1701 //////////////////////////////////////////////////////////////////////////
1702 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1703 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1704 {
1705 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1706 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1707
1708 //////////////////////////////////////////////////////////////////////////
1709 /// @brief Stores an 8x8 raster tile to the destination surface.
1710 /// @param pSrc - Pointer to raster tile.
1711 /// @param pDstSurface - Destination surface state
1712 /// @param x, y - Coordinates to raster tile.
1713 INLINE static void Store(
1714 uint8_t *pSrc,
1715 SWR_SURFACE_STATE* pDstSurface,
1716 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1717 {
1718 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1719
1720 // Punt non-full tiles to generic store
1721 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1722 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1723
1724 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1725 {
1726 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1727 }
1728
1729 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1730 // We can compute the offsets to each column within the raster tile once and increment from these.
1731 #if USE_8x2_TILE_BACKEND
1732 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1733 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1734 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1735
1736 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1737
1738 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1739 uint8_t *ppDsts[] =
1740 {
1741 pDst,
1742 pDst + DestRowWidthBytes,
1743 pDst + DestRowWidthBytes / 4,
1744 pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1745 };
1746
1747 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1748 {
1749 // Raster tile width is same as simd16 tile width
1750 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1751
1752 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1753
1754 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1755
1756 ppDsts[0] += dy;
1757 ppDsts[1] += dy;
1758 ppDsts[2] += dy;
1759 ppDsts[3] += dy;
1760 }
1761 #else
1762 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1763 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1764 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1765
1766 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1767 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1768
1769 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1770 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1771 {
1772 uint32_t rowOffset = row * DestRowWidthBytes;
1773
1774 uint8_t* pRow = pCol0 + rowOffset;
1775 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1776
1777 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1778 pSrc += pSrcInc;
1779
1780 ppDsts[0] += DestRowWidthBytes / 4;
1781 ppDsts[1] += DestRowWidthBytes / 4;
1782
1783 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1784 pSrc += pSrcInc;
1785 }
1786 #endif
1787 }
1788 };
1789
1790 //////////////////////////////////////////////////////////////////////////
1791 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1792 //////////////////////////////////////////////////////////////////////////
1793 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1794 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1795 {
1796 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1797 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1798
1799 //////////////////////////////////////////////////////////////////////////
1800 /// @brief Stores an 8x8 raster tile to the destination surface.
1801 /// @param pSrc - Pointer to raster tile.
1802 /// @param pDstSurface - Destination surface state
1803 /// @param x, y - Coordinates to raster tile.
1804 INLINE static void Store(
1805 uint8_t *pSrc,
1806 SWR_SURFACE_STATE* pDstSurface,
1807 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1808 {
1809 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1810
1811 // Punt non-full tiles to generic store
1812 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1813 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1814
1815 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1816 {
1817 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1818 }
1819
1820 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1821 // We can compute the offsets to each column within the raster tile once and increment from these.
1822 #if USE_8x2_TILE_BACKEND
1823 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1824 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1825 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1826
1827 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1828
1829 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1830 uint8_t *ppDsts[] =
1831 {
1832 pDst,
1833 pDst + DestRowWidthBytes,
1834 pDst + DestRowWidthBytes / 2,
1835 pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1836 };
1837
1838 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1839 {
1840 // Raster tile width is same as simd16 tile width
1841 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1842
1843 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1844
1845 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1846
1847 ppDsts[0] += dy;
1848 ppDsts[1] += dy;
1849 ppDsts[2] += dy;
1850 ppDsts[3] += dy;
1851 }
1852 #else
1853 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1854 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1855 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1856
1857 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1858 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1859
1860 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1861 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1862 {
1863 uint32_t rowOffset = row * DestRowWidthBytes;
1864
1865 uint8_t* pRow = pCol0 + rowOffset;
1866 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1867
1868 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1869 pSrc += pSrcInc;
1870
1871 ppDsts[0] += DestRowWidthBytes / 2;
1872 ppDsts[1] += DestRowWidthBytes / 2;
1873
1874 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1875 pSrc += pSrcInc;
1876 }
1877 #endif
1878 }
1879 };
1880
1881 //////////////////////////////////////////////////////////////////////////
1882 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1883 //////////////////////////////////////////////////////////////////////////
1884 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1885 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1886 {
1887 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1888 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1889 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1890
1891 //////////////////////////////////////////////////////////////////////////
1892 /// @brief Stores an 8x8 raster tile to the destination surface.
1893 /// @param pSrc - Pointer to raster tile.
1894 /// @param pDstSurface - Destination surface state
1895 /// @param x, y - Coordinates to raster tile.
1896 INLINE static void Store(
1897 uint8_t *pSrc,
1898 SWR_SURFACE_STATE* pDstSurface,
1899 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1900 {
1901 static const uint32_t DestRowWidthBytes = 512; // 512B rows
1902
1903 // Punt non-full tiles to generic store
1904 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1905 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1906
1907 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1908 {
1909 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1910 }
1911
1912 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1913 // We can compute the offsets to each column within the raster tile once and increment from these.
1914 #if USE_8x2_TILE_BACKEND
1915 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1916 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1917
1918 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1919 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1920
1921 uint8_t* ppDsts[] =
1922 {
1923 pDst, // row 0, col 0
1924 pDst + DestRowWidthBytes, // row 1, col 0
1925 pDst + dx / 2, // row 0, col 1
1926 pDst + DestRowWidthBytes + dx / 2 // row 1, col 1
1927 };
1928
1929 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1930 {
1931 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1932 {
1933 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1934
1935 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1936
1937 ppDsts[0] += dx;
1938 ppDsts[1] += dx;
1939 ppDsts[2] += dx;
1940 ppDsts[3] += dx;
1941 }
1942
1943 ppDsts[0] += dy;
1944 ppDsts[1] += dy;
1945 ppDsts[2] += dy;
1946 ppDsts[3] += dy;
1947 }
1948 #else
1949 uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1950 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1951 uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
1952
1953 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1954 {
1955 for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
1956 {
1957 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
1958
1959 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
1960 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1961
1962 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1963 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1964 }
1965
1966 pRow0 += (DestRowWidthBytes * 2);
1967 pRow1 += (DestRowWidthBytes * 2);
1968 }
1969 #endif
1970 }
1971 };
1972
1973 //////////////////////////////////////////////////////////////////////////
1974 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1975 //////////////////////////////////////////////////////////////////////////
1976 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1977 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1978 {
1979 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1980 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1981
1982 //////////////////////////////////////////////////////////////////////////
1983 /// @brief Stores an 8x8 raster tile to the destination surface.
1984 /// @param pSrc - Pointer to raster tile.
1985 /// @param pDstSurface - Destination surface state
1986 /// @param x, y - Coordinates to raster tile.
1987 INLINE static void Store(
1988 uint8_t *pSrc,
1989 SWR_SURFACE_STATE* pDstSurface,
1990 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1991 {
1992 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1993 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1994
1995 // Punt non-full tiles to generic store
1996 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1997 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1998
1999 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2000 {
2001 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2002 }
2003
2004 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2005 // We can compute the offsets to each column within the raster tile once and increment from these.
2006 #if USE_8x2_TILE_BACKEND
2007 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2008 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2009 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2010
2011 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2012 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2013
2014 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2015 uint8_t *ppDsts[] =
2016 {
2017 pDst, // row 0, col 0
2018 pDst + DestRowWidthBytes, // row 1, col 0
2019 pDst + DestColumnBytes, // row 0, col 1
2020 pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1
2021 };
2022
2023 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2024 {
2025 // Raster tile width is same as simd16 tile width
2026 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2027
2028 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2029
2030 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2031
2032 ppDsts[0] += dy;
2033 ppDsts[1] += dy;
2034 ppDsts[2] += dy;
2035 ppDsts[3] += dy;
2036 }
2037 #else
2038 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2039 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2040 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2041
2042 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2043 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2044
2045 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2046 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2047 {
2048 uint32_t rowOffset = row * DestRowWidthBytes;
2049
2050 uint8_t* pRow = pCol0 + rowOffset;
2051 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
2052
2053 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2054 pSrc += pSrcInc;
2055
2056 ppDsts[0] += DestColumnBytes;
2057 ppDsts[1] += DestColumnBytes;
2058
2059 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2060 pSrc += pSrcInc;
2061 }
2062 #endif
2063 }
2064 };
2065
2066 //////////////////////////////////////////////////////////////////////////
2067 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
2068 //////////////////////////////////////////////////////////////////////////
2069 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2070 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
2071 {
2072 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
2073 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2074
2075 //////////////////////////////////////////////////////////////////////////
2076 /// @brief Stores an 8x8 raster tile to the destination surface.
2077 /// @param pSrc - Pointer to raster tile.
2078 /// @param pDstSurface - Destination surface state
2079 /// @param x, y - Coordinates to raster tile.
2080 INLINE static void Store(
2081 uint8_t *pSrc,
2082 SWR_SURFACE_STATE* pDstSurface,
2083 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2084 {
2085 static const uint32_t DestRowWidthBytes = 16; // 16B rows
2086 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
2087
2088 // Punt non-full tiles to generic store
2089 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2090 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2091
2092 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2093 {
2094 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2095 }
2096
2097 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2098 // We can compute the offsets to each column within the raster tile once and increment from these.
2099 #if USE_8x2_TILE_BACKEND
2100 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2101 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2102 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2103
2104 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2105 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2106
2107 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2108 uint8_t *ppDsts[] =
2109 {
2110 pDst, // row 0, col 0
2111 pDst + DestRowWidthBytes, // row 1, col 0
2112 pDst + DestColumnBytes, // row 0, col 1
2113 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
2114 pDst + DestColumnBytes * 2, // row 0, col 2
2115 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2116 pDst + DestColumnBytes * 3, // row 0, col 3
2117 pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3
2118 };
2119
2120 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2121 {
2122 // Raster tile width is same as simd16 tile width
2123 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2124
2125 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2126
2127 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2128
2129 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
2130 {
2131 ppDsts[i] += dy;
2132 }
2133 }
2134 #else
2135 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2136 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2137 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2138 uint8_t* pCol1 = pCol0 + DestColumnBytes;
2139
2140 // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
2141 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2142 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2143
2144 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2145 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2146 {
2147 uint32_t rowOffset = row * DestRowWidthBytes;
2148 uint8_t* ppDsts[] =
2149 {
2150 pCol0 + rowOffset,
2151 pCol0 + rowOffset + DestRowWidthBytes,
2152 pCol1 + rowOffset,
2153 pCol1 + rowOffset + DestRowWidthBytes,
2154 };
2155
2156 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2157 pSrc += pSrcInc;
2158
2159 ppDsts[0] += DestColumnBytes * 2;
2160 ppDsts[1] += DestColumnBytes * 2;
2161 ppDsts[2] += DestColumnBytes * 2;
2162 ppDsts[3] += DestColumnBytes * 2;
2163
2164 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2165 pSrc += pSrcInc;
2166 }
2167 #endif
2168 }
2169 };
2170
2171 //////////////////////////////////////////////////////////////////////////
2172 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
2173 //////////////////////////////////////////////////////////////////////////
2174 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2175 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
2176 {
2177 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
2178 #if USE_8x2_TILE_BACKEND
2179 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2180
2181 #else
2182 static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
2183 static const size_t TILE_Y_ROWS = 32;
2184 static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
2185
2186 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
2187 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2188 static const size_t MAX_DST_COLUMN_BYTES = 16;
2189
2190 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
2191 static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
2192
2193 #endif
2194 //////////////////////////////////////////////////////////////////////////
2195 /// @brief Stores an 8x8 raster tile to the destination surface.
2196 /// @param pSrc - Pointer to raster tile.
2197 /// @param pDstSurface - Destination surface state
2198 /// @param x, y - Coordinates to raster tile.
2199 INLINE static void Store(
2200 uint8_t *pSrc,
2201 SWR_SURFACE_STATE* pDstSurface,
2202 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2203 {
2204 #if USE_8x2_TILE_BACKEND
2205 static const uint32_t DestRowWidthBytes = 16; // 16B rows
2206 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
2207 #endif
2208
2209 // Punt non-full tiles to generic store
2210 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2211 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2212
2213 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2214 {
2215 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2216 }
2217
2218 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2219 // We can compute the offsets to each column within the raster tile once and increment from these.
2220 #if USE_8x2_TILE_BACKEND
2221 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2222 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2223 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2224
2225 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2226 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2227
2228 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2229 uint8_t *ppDsts[] =
2230 {
2231 pDst, // row 0, col 0
2232 pDst + DestRowWidthBytes, // row 1, col 0
2233 pDst + DestColumnBytes, // row 0, col 1
2234 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
2235 pDst + DestColumnBytes * 2, // row 0, col 2
2236 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2237 pDst + DestColumnBytes * 3, // row 0, col 3
2238 pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
2239 pDst + DestColumnBytes * 4, // row 0, col 4
2240 pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
2241 pDst + DestColumnBytes * 5, // row 0, col 5
2242 pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
2243 pDst + DestColumnBytes * 6, // row 0, col 6
2244 pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
2245 pDst + DestColumnBytes * 7, // row 0, col 7
2246 pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7
2247 };
2248
2249 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2250 {
2251 // Raster tile width is same as simd16 tile width
2252 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2253
2254 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2255
2256 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2257
2258 for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
2259 {
2260 ppDsts[i] += dy;
2261 }
2262 }
2263 #else
2264 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2265 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2266 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2267 struct DstPtrs
2268 {
2269 uint8_t* ppDsts[8];
2270 } ptrs;
2271
2272 // Need 8 pointers, 4 columns of 2 rows each
2273 for (uint32_t y = 0; y < 2; ++y)
2274 {
2275 for (uint32_t x = 0; x < 4; ++x)
2276 {
2277 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
2278 }
2279 }
2280
2281 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
2282 {
2283 DstPtrs startPtrs = ptrs;
2284
2285 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
2286 {
2287 // Format conversion and convert from SOA to AOS, and store the rows.
2288 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
2289
2290 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
2291 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
2292 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
2293 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
2294 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
2295 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
2296 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
2297 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
2298 pSrc += SRC_COLUMN_BYTES;
2299 }
2300
2301 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
2302 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
2303 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
2304 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
2305 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
2306 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
2307 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
2308 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
2309 }
2310 #endif
2311 }
2312 };
2313
2314 //////////////////////////////////////////////////////////////////////////
2315 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
2316 //////////////////////////////////////////////////////////////////////////
2317 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2318 struct StoreMacroTile
2319 {
2320 //////////////////////////////////////////////////////////////////////////
2321 /// @brief Stores a macrotile to the destination surface using safe implementation.
2322 /// @param pSrc - Pointer to macro tile.
2323 /// @param pDstSurface - Destination surface state
2324 /// @param x, y - Coordinates to macro tile
2325 static void StoreGeneric(
2326 uint8_t *pSrcHotTile,
2327 SWR_SURFACE_STATE* pDstSurface,
2328 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2329 {
2330 PFN_STORE_TILES_INTERNAL pfnStore;
2331 pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2332
2333 // Store each raster tile from the hot tile to the destination surface.
2334 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2335 {
2336 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2337 {
2338 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2339 {
2340 pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2341 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2342 }
2343 }
2344 }
2345
2346 }
2347
2348 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
2349 //////////////////////////////////////////////////////////////////////////
2350 /// @brief Stores a macrotile to the destination surface.
2351 /// @param pSrc - Pointer to macro tile.
2352 /// @param pDstSurface - Destination surface state
2353 /// @param x, y - Coordinates to macro tile
2354 static void Store(
2355 uint8_t *pSrcHotTile,
2356 SWR_SURFACE_STATE* pDstSurface,
2357 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2358 {
2359 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
2360
2361 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2362 {
2363 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
2364 0,
2365 0,
2366 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
2367 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
2368 sampleNum,
2369 pDstSurface->lod,
2370 pDstSurface);
2371
2372 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
2373 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
2374 (pDstSurface->bInterleavedSamples);
2375
2376 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2377 }
2378
2379 // Save original for pSrcHotTile resolve.
2380 uint8_t *pResolveSrcHotTile = pSrcHotTile;
2381
2382 // Store each raster tile from the hot tile to the destination surface.
2383 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2384 {
2385 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2386 {
2387 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2388 {
2389 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2390 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2391 }
2392 }
2393 }
2394
2395 if (pDstSurface->xpAuxBaseAddress)
2396 {
2397 uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2398 // Store each raster tile from the hot tile to the destination surface.
2399 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2400 {
2401 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2402 {
2403 StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex);
2404 pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples;
2405 }
2406 }
2407 }
2408 }
2409 };
2410
2411 //////////////////////////////////////////////////////////////////////////
2412 /// InitStoreTilesTable - Helper for setting up the tables.
2413 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2414 void InitStoreTilesTableColor_Half1(
2415 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
2416 {
2417 table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
2418 table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
2419 table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
2420 table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
2421 table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
2422 table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
2423 table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
2424 table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
2425 table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
2426 table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
2427 table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
2428 table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
2429 table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
2430 table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
2431 table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
2432 table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
2433 table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
2434 table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
2435 table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
2436 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2437 table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
2438 table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
2439 table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
2440 table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
2441 table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
2442 table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
2443 table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
2444 table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
2445 table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
2446 table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
2447 table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
2448 table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
2449 table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
2450 table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
2451 table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
2452 table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
2453 table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
2454 table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
2455 table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
2456 table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
2457 table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
2458 table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
2459 table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
2460 table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
2461 table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
2462 table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
2463 table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
2464 table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
2465 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
2466 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
2467 table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
2468 table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
2469 table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
2470 table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
2471 table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
2472 table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
2473 }
2474
2475 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2476 void InitStoreTilesTableColor_Half2(
2477 PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
2478 {
2479 table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
2480 table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
2481 table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
2482 table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
2483 table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
2484 table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
2485 table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
2486 table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
2487 table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
2488 table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
2489 table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
2490 table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
2491 table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
2492 table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
2493 table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
2494 table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
2495 table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
2496 table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
2497 table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
2498 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
2499 table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
2500 table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
2501 table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
2502 table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
2503 table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
2504 table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
2505 table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
2506 table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
2507 table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
2508 table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
2509 table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
2510 table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
2511 table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
2512 table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
2513 table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
2514 table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
2515 table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
2516 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
2517 table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
2518 table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
2519 table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
2520 table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
2521 table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
2522 table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
2523 table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
2524 table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
2525 table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
2526 table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
2527 table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
2528 table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2529 table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2530 table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2531 table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2532 table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2533 table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2534 table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2535 table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2536 table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2537 table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2538 table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2539 table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2540 table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2541 table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2542 table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2543 }
2544
2545 //////////////////////////////////////////////////////////////////////////
2546 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2547 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2548 void InitStoreTilesTableDepth(
2549 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2550 {
2551 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2552 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2553 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2554 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2555 }
2556
2557 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2558 void InitStoreTilesTableStencil(
2559 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2560 {
2561 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2562 }