swr: [rasterizer memory] fix store tile for 128-bit ymajor tiling
[mesa.git] / src / gallium / drivers / swr / rasterizer / memory / StoreTile.h
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.h
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
35
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "core/multisample.h"
39
40 #include <array>
41 #include <sstream>
42
43 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
44 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
45
46 //////////////////////////////////////////////////////////////////////////
47 /// Store Raster Tile Function Tables.
48 //////////////////////////////////////////////////////////////////////////
49 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
50 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
51 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
52
53 void InitStoreTilesTable_Linear_1();
54 void InitStoreTilesTable_Linear_2();
55 void InitStoreTilesTable_TileX_1();
56 void InitStoreTilesTable_TileX_2();
57 void InitStoreTilesTable_TileY_1();
58 void InitStoreTilesTable_TileY_2();
59 void InitStoreTilesTable_TileW();
60 void InitStoreTilesTable();
61
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for writing one transposed (AOS) raster tile
///        to its destination rows. Store() is deleted here, so only the
///        explicit specializations can be called; any other
///        (PixelSize, NumDests) combination fails at compile time.
/// @tparam PixelSize - Size of one destination pixel in bits.
/// @tparam NumDests - Number of destination pointers handed to Store().
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of destination pointers. Each pointer is
///        to a single row of at most 16B.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
76
77 //////////////////////////////////////////////////////////////////////////
78 /// StorePixels (32-bit pixel specialization)
79 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
80 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
81 /// @param ppDsts - Array of destination pointers. Each pointer is
82 /// to a single row of at most 16B.
83 /// @tparam NumDests - Number of destination pointers. Each pair of
84 /// pointers is for a 16-byte column of two rows.
85 //////////////////////////////////////////////////////////////////////////
86 template <>
87 struct StorePixels<8, 2>
88 {
89 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
90 {
91 // Each 4-pixel row is 4 bytes.
92 const uint16_t* pPixSrc = (const uint16_t*)pSrc;
93
94 // Unswizzle from SWR-Z order
95 uint16_t* pRow = (uint16_t*)ppDsts[0];
96 pRow[0] = pPixSrc[0];
97 pRow[1] = pPixSrc[2];
98
99 pRow = (uint16_t*)ppDsts[1];
100 pRow[0] = pPixSrc[1];
101 pRow[1] = pPixSrc[3];
102 }
103 };
104
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (8-bit pixel specialization, 8x2 tile backend)
/// @brief Stores 16 one-byte pixels from SWR-Z order through four
///        destination pointers.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of four destination pointers. Each pointer
///        receives 4 bytes (4 pixels).
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<8, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 8 x 2 bytes = 16 bytes, 16 pixels
        const uint16_t* pPairs = reinterpret_cast<const uint16_t*>(pSrc);

        // Unswizzle from SWR-Z order, eight pixels per pass.
        // Pixel indices written (hex):
        //   ppDsts[0] = 0 1 4 5    ppDsts[1] = 2 3 6 7
        //   ppDsts[2] = 8 9 C D    ppDsts[3] = A B E F
        for (uint32_t half = 0; half < 2; ++half)
        {
            const uint16_t* pHalf = pPairs + half * 4;

            uint16_t* pRow0 = reinterpret_cast<uint16_t*>(ppDsts[half * 2 + 0]);
            pRow0[0] = pHalf[0];
            pRow0[1] = pHalf[2];

            uint16_t* pRow1 = reinterpret_cast<uint16_t*>(ppDsts[half * 2 + 1]);
            pRow1[0] = pHalf[1];
            pRow1[1] = pHalf[3];
        }
    }
};

#endif
132 //////////////////////////////////////////////////////////////////////////
133 /// StorePixels (32-bit pixel specialization)
134 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
135 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
136 /// @param ppDsts - Array of destination pointers. Each pointer is
137 /// to a single row of at most 16B.
138 /// @tparam NumDests - Number of destination pointers. Each pair of
139 /// pointers is for a 16-byte column of two rows.
140 //////////////////////////////////////////////////////////////////////////
141 template <>
142 struct StorePixels<16, 2>
143 {
144 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
145 {
146 // Each 4-pixel row is 8 bytes.
147 const uint32_t* pPixSrc = (const uint32_t*)pSrc;
148
149 // Unswizzle from SWR-Z order
150 uint32_t* pRow = (uint32_t*)ppDsts[0];
151 pRow[0] = pPixSrc[0];
152 pRow[1] = pPixSrc[2];
153
154 pRow = (uint32_t*)ppDsts[1];
155 pRow[0] = pPixSrc[1];
156 pRow[1] = pPixSrc[3];
157 }
158 };
159
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (16-bit pixel specialization, 8x2 tile backend)
/// @brief Stores 16 two-byte pixels from SWR-Z order through four
///        destination pointers.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of four destination pointers. Each pointer
///        receives 8 bytes (4 pixels).
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<16, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 8 x 4 bytes = 32 bytes, 16 pixels
        const uint32_t* pPairs = reinterpret_cast<const uint32_t*>(pSrc);

        // Unswizzle from SWR-Z order, eight pixels per pass.
        // Pixel indices written (hex):
        //   ppDsts[0] = 0 1 4 5    ppDsts[1] = 2 3 6 7
        //   ppDsts[2] = 8 9 C D    ppDsts[3] = A B E F
        for (uint32_t half = 0; half < 2; ++half)
        {
            const uint32_t* pHalf = pPairs + half * 4;

            uint32_t* pRow0 = reinterpret_cast<uint32_t*>(ppDsts[half * 2 + 0]);
            pRow0[0] = pHalf[0];
            pRow0[1] = pHalf[2];

            uint32_t* pRow1 = reinterpret_cast<uint32_t*>(ppDsts[half * 2 + 1]);
            pRow1[0] = pHalf[1];
            pRow1[1] = pHalf[3];
        }
    }
};

#endif
187 //////////////////////////////////////////////////////////////////////////
188 /// StorePixels (32-bit pixel specialization)
189 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
190 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
191 /// @param ppDsts - Array of destination pointers. Each pointer is
192 /// to a single row of at most 16B.
193 /// @tparam NumDests - Number of destination pointers. Each pair of
194 /// pointers is for a 16-byte column of two rows.
195 //////////////////////////////////////////////////////////////////////////
196 template <>
197 struct StorePixels<32, 2>
198 {
199 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
200 {
201 // Each 4-pixel row is 16-bytes
202 __m128i *pZRow01 = (__m128i*)pSrc;
203 __m128i vQuad00 = _mm_load_si128(pZRow01);
204 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
205
206 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
207 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
208
209 _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
210 _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
211 }
212 };
213
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization, 8x2 tile backend)
/// @brief Stores 16 four-byte pixels from SWR-Z order through four
///        destination pointers.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order.
///        Read with aligned 16-byte loads.
/// @param ppDsts - Array of four destination pointers. Each pointer
///        receives 16 bytes (4 pixels), written unaligned.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<32, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 4 x 16 bytes = 64 bytes, 16 pixels
        const __m128i* pQuads = reinterpret_cast<const __m128i*>(pSrc);

        // Read the whole tile before anything is written.
        const __m128i pixels0123 = _mm_load_si128(pQuads + 0);
        const __m128i pixels4567 = _mm_load_si128(pQuads + 1);
        const __m128i pixels89AB = _mm_load_si128(pQuads + 2);
        const __m128i pixelsCDEF = _mm_load_si128(pQuads + 3);

        // Unswizzle from SWR-Z order
        _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[0]), _mm_unpacklo_epi64(pixels0123, pixels4567)); // 0 1 4 5
        _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[1]), _mm_unpackhi_epi64(pixels0123, pixels4567)); // 2 3 6 7
        _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[2]), _mm_unpacklo_epi64(pixels89AB, pixelsCDEF)); // 8 9 C D
        _mm_storeu_si128(reinterpret_cast<__m128i*>(ppDsts[3]), _mm_unpackhi_epi64(pixels89AB, pixelsCDEF)); // A B E F
    }
};

#endif
239 //////////////////////////////////////////////////////////////////////////
240 /// StorePixels (32-bit pixel specialization)
241 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
242 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
243 /// @param ppDsts - Array of destination pointers. Each pointer is
244 /// to a single row of at most 16B.
245 /// @tparam NumDests - Number of destination pointers. Each pair of
246 /// pointers is for a 16-byte column of two rows.
247 //////////////////////////////////////////////////////////////////////////
248 template <>
249 struct StorePixels<64, 4>
250 {
251 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
252 {
253 // Each 4-pixel row is 32 bytes.
254 const __m128i* pPixSrc = (const __m128i*)pSrc;
255
256 // order of pointers match SWR-Z layout
257 __m128i** pvDsts = (__m128i**)&ppDsts[0];
258 *pvDsts[0] = pPixSrc[0];
259 *pvDsts[1] = pPixSrc[1];
260 *pvDsts[2] = pPixSrc[2];
261 *pvDsts[3] = pPixSrc[3];
262 }
263 };
264
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (64-bit pixel specialization, 8x2 tile backend)
/// @brief Stores 16 eight-byte pixels through eight destination
///        pointers, 16 bytes (two pixels) per pointer.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of eight destination pointers, already given
///        in SWR-Z order. Data is copied by plain __m128i assignment,
///        so source and destinations are expected to be 16-byte aligned.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<64, 8>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    {
        // 8 x 16 bytes = 128 bytes, 16 pixels
        const __m128i* pUnits = reinterpret_cast<const __m128i*>(pSrc);

        // The pointer order already matches the SWR-Z layout: destination i
        // receives pixels 2i and 2i+1.
        for (uint32_t i = 0; i < 8; ++i)
        {
            __m128i* pDst = reinterpret_cast<__m128i*>(ppDsts[i]);
            *pDst = pUnits[i];
        }
    }
};

#endif
289 //////////////////////////////////////////////////////////////////////////
290 /// StorePixels (32-bit pixel specialization)
291 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
292 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
293 /// @param ppDsts - Array of destination pointers. Each pointer is
294 /// to a single row of at most 16B.
295 /// @tparam NumDests - Number of destination pointers. Each pair of
296 /// pointers is for a 16-byte column of two rows.
297 //////////////////////////////////////////////////////////////////////////
298 template <>
299 struct StorePixels<128, 8>
300 {
301 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
302 {
303 // Each 4-pixel row is 64 bytes.
304 const __m128i* pPixSrc = (const __m128i*)pSrc;
305
306 // Unswizzle from SWR-Z order
307 __m128i** pvDsts = (__m128i**)&ppDsts[0];
308 *pvDsts[0] = pPixSrc[0];
309 *pvDsts[1] = pPixSrc[2];
310 *pvDsts[2] = pPixSrc[1];
311 *pvDsts[3] = pPixSrc[3];
312 *pvDsts[4] = pPixSrc[4];
313 *pvDsts[5] = pPixSrc[6];
314 *pvDsts[6] = pPixSrc[5];
315 *pvDsts[7] = pPixSrc[7];
316 }
317 };
318
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (128-bit pixel specialization, 8x2 tile backend)
/// @brief Stores 16 sixteen-byte pixels through sixteen destination
///        pointers, one pixel per pointer.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of sixteen destination pointers. Data is
///        copied by plain __m128i assignment, so source and destinations
///        are expected to be 16-byte aligned.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<128, 16>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
    {
        // 16 x 16 bytes = 256 bytes, 16 pixels
        const __m128i* pPixels = reinterpret_cast<const __m128i*>(pSrc);

        // Source pixel (relative to its group of four) that feeds each
        // destination slot: the two middle pixels swap places.
        static const uint32_t srcSlot[4] = { 0, 2, 1, 3 };

        for (uint32_t group = 0; group < 16; group += 4)
        {
            for (uint32_t slot = 0; slot < 4; ++slot)
            {
                __m128i* pDst = reinterpret_cast<__m128i*>(ppDsts[group + slot]);
                *pDst = pPixels[group + srcSlot[slot]];
            }
        }
    }
};

#endif
341 //////////////////////////////////////////////////////////////////////////
342 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
343 //////////////////////////////////////////////////////////////////////////
344 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
345 struct ConvertPixelsSOAtoAOS
346 {
347 //////////////////////////////////////////////////////////////////////////
348 /// @brief Converts a SIMD from the Hot Tile to the destination format
349 /// and converts from SOA to AOS.
350 /// @param pSrc - Pointer to raster tile.
351 /// @param pDst - Pointer to destination surface or deswizzling buffer.
352 template <size_t NumDests>
353 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
354 {
355 #if USE_8x2_TILE_BACKEND
356 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
357
358 OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
359 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
360
361 // Convert from SrcFormat --> DstFormat
362 simd16vector src;
363 LoadSOA<SrcFormat>(pSrc, src);
364 StoreSOA<DstFormat>(src, soaTile);
365
366 // Convert from SOA --> AOS
367 FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
368
369 #else
370 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
371
372 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
373 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
374
375 // Convert from SrcFormat --> DstFormat
376 simdvector src;
377 LoadSOA<SrcFormat>(pSrc, src);
378 StoreSOA<DstFormat>(src, soaTile);
379
380 // Convert from SOA --> AOS
381 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
382
383 #endif
384 // Store data into destination
385 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
386 }
387 };
388
389 //////////////////////////////////////////////////////////////////////////
390 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
391 /// Specialization for no format conversion
392 //////////////////////////////////////////////////////////////////////////
393 template<SWR_FORMAT Format>
394 struct ConvertPixelsSOAtoAOS<Format, Format>
395 {
396 //////////////////////////////////////////////////////////////////////////
397 /// @brief Converts a SIMD from the Hot Tile to the destination format
398 /// and converts from SOA to AOS.
399 /// @param pSrc - Pointer to raster tile.
400 /// @param pDst - Pointer to destination surface or deswizzling buffer.
401 template <size_t NumDests>
402 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
403 {
404 #if USE_8x2_TILE_BACKEND
405 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
406
407 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
408
409 // Convert from SOA --> AOS
410 FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
411
412 #else
413 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
414
415 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
416
417 // Convert from SOA --> AOS
418 FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
419
420 #endif
421 // Store data into destination
422 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
423 }
424 };
425
426 //////////////////////////////////////////////////////////////////////////
427 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
428 //////////////////////////////////////////////////////////////////////////
429 template<>
430 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
431 {
432 //////////////////////////////////////////////////////////////////////////
433 /// @brief Converts a SIMD from the Hot Tile to the destination format
434 /// and converts from SOA to AOS.
435 /// @param pSrc - Pointer to raster tile.
436 /// @param pDst - Pointer to destination surface or deswizzling buffer.
437 template <size_t NumDests>
438 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
439 {
440 #if USE_8x2_TILE_BACKEND
441 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
442 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
443
444 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
445
446 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
447
448 // Load hot-tile
449 simd16vector src, dst;
450 LoadSOA<SrcFormat>(pSrc, src);
451
452 // deswizzle
453 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
454 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
455 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
456
457 // clamp
458 dst.x = Clamp<DstFormat>(dst.x, 0);
459 dst.y = Clamp<DstFormat>(dst.y, 1);
460 dst.z = Clamp<DstFormat>(dst.z, 2);
461
462 // normalize
463 dst.x = Normalize<DstFormat>(dst.x, 0);
464 dst.y = Normalize<DstFormat>(dst.y, 1);
465 dst.z = Normalize<DstFormat>(dst.z, 2);
466
467 // pack
468 simd16scalari packed = _simd16_castps_si(dst.x);
469
470 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
471 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
472
473 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
474 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
475
476 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
477 uint32_t *pPacked = (uint32_t*)&packed;
478 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
479 for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
480 {
481 *pAosTile++ = *pPacked++;
482 }
483
484 #else
485 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
486 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
487 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
488
489 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
490
491 // Load hot-tile
492 simdvector src, dst;
493 LoadSOA<SrcFormat>(pSrc, src);
494
495 // deswizzle
496 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
497 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
498 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
499
500 // clamp
501 dst.x = Clamp<DstFormat>(dst.x, 0);
502 dst.y = Clamp<DstFormat>(dst.y, 1);
503 dst.z = Clamp<DstFormat>(dst.z, 2);
504
505 // normalize
506 dst.x = Normalize<DstFormat>(dst.x, 0);
507 dst.y = Normalize<DstFormat>(dst.y, 1);
508 dst.z = Normalize<DstFormat>(dst.z, 2);
509
510 // pack
511 simdscalari packed = _simd_castps_si(dst.x);
512 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
513 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
514 FormatTraits<DstFormat>::GetBPC(1)));
515
516 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
517 uint32_t *pPacked = (uint32_t*)&packed;
518 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
519 for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
520 {
521 *pAosTile++ = *pPacked++;
522 }
523
524 #endif
525 // Store data into destination
526 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
527 }
528 };
529
530 //////////////////////////////////////////////////////////////////////////
531 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
532 //////////////////////////////////////////////////////////////////////////
533 template<>
534 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
535 {
536 static const SWR_FORMAT SrcFormat = R32_FLOAT;
537 static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
538
539 //////////////////////////////////////////////////////////////////////////
540 /// @brief Converts a SIMD from the Hot Tile to the destination format
541 /// and converts from SOA to AOS.
542 /// @param pSrc - Pointer to raster tile.
543 /// @param pDst - Pointer to destination surface or deswizzling buffer.
544 template <size_t NumDests>
545 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
546 {
547 #if USE_8x2_TILE_BACKEND
548 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 4; // 16 pixels * 4 bytes per pixel
549
550 OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
551 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
552
553 // Convert from SrcFormat --> DstFormat
554 simd16vector src;
555 LoadSOA<SrcFormat>(pSrc, src);
556 StoreSOA<DstFormat>(src, soaTile);
557
558 // Convert from SOA --> AOS
559 FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
560
561 // Store data into destination but don't overwrite the X8 bits
562 // Each 4-pixel row is 16-bytes
563
564 simdscalari loadlo = _simd_load_si(reinterpret_cast<simdscalari *>(aosTile));
565 simdscalari loadhi = _simd_load_si(reinterpret_cast<simdscalari *>(aosTile + sizeof(simdscalari)));
566
567 simdscalari templo = _simd_unpacklo_epi64(loadlo, loadhi);
568 simdscalari temphi = _simd_unpackhi_epi64(loadlo, loadhi);
569
570 simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
571 simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
572
573 simdscalari mask = _simd_set1_epi32(0x00FFFFFF);
574
575 destlo = _simd_or_si(_simd_andnot_si(mask, destlo), _simd_and_si(mask, templo));
576 desthi = _simd_or_si(_simd_andnot_si(mask, desthi), _simd_and_si(mask, templo));
577
578 _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), destlo);
579 _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), desthi);
580 #else
581 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
582
583 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
584 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
585
586 // Convert from SrcFormat --> DstFormat
587 simdvector src;
588 LoadSOA<SrcFormat>(pSrc, src);
589 StoreSOA<DstFormat>(src, soaTile);
590
591 // Convert from SOA --> AOS
592 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
593
594 // Store data into destination but don't overwrite the X8 bits
595 // Each 4-pixel row is 16-bytes
596 __m128i *pZRow01 = (__m128i*)aosTile;
597 __m128i vQuad00 = _mm_load_si128(pZRow01);
598 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
599
600 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
601 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
602
603 __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
604 __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
605
606 __m128i vMask = _mm_set1_epi32(0xFFFFFF);
607
608 vDst0 = _mm_andnot_si128(vMask, vDst0);
609 vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
610 vDst1 = _mm_andnot_si128(vMask, vDst1);
611 vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
612
613 _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
614 _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
615 #endif
616 }
617 };
618
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// FlatConvert (8x2 tile backend)
/// @brief Converts 16 float SOA pixels of the hot tile to packed 32-bit
///        pixels (one byte lane per component: shifts of 0/8/16/24 bits)
///        and stores them in 8x2 memory order.
/// @tparam DstFormat - Destination format; supplies the component
///         swizzle, the sRGB flag and the float-to-integer scale.
/// @param pSrc - Pointer to the hot tile; one simd16scalar-sized plane
///        per component, read with aligned loads.
/// @param pDst0 - Receives pixels 0 1 4 5 (row 0, first 16 bytes).
/// @param pDst1 - Receives pixels 2 3 6 7 (row 1, first 16 bytes).
/// @param pDst2 - Receives pixels 8 9 C D (row 0, second 16 bytes).
/// @param pDst3 - Receives pixels A B E F (row 1, second 16 bytes).
//////////////////////////////////////////////////////////////////////////
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
{
    // Size in bytes of one component plane in the hot tile.
    const size_t planeBytes = sizeof(simd16scalar);

    // The destination swizzle picks the source plane for every output
    // component, so the reordering happens while loading.
    simd16scalar chan0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * planeBytes));
    simd16scalar chan1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * planeBytes));
    simd16scalar chan2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * planeBytes));
    simd16scalar chan3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * planeBytes));

    // clamp every component to [0, 1]
    const simd16scalar lower = _simd16_setzero_ps();
    const simd16scalar upper = _simd16_set1_ps(1.0f);

    chan0 = _simd16_min_ps(_simd16_max_ps(chan0, lower), upper);
    chan1 = _simd16_min_ps(_simd16_max_ps(chan1, lower), upper);
    chan2 = _simd16_min_ps(_simd16_max_ps(chan2, lower), upper);
    chan3 = _simd16_min_ps(_simd16_max_ps(chan3, lower), upper);

    // gamma-correct the first three components only; the fourth is left linear
    if (FormatTraits<DstFormat>::isSRGB)
    {
        chan0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, chan0);
        chan1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, chan1);
        chan2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, chan2);
    }

    // scale 0.0f..1.0f up to the integer range of the destination component
    chan0 = _simd16_mul_ps(chan0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    chan1 = _simd16_mul_ps(chan1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    chan2 = _simd16_mul_ps(chan2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    chan3 = _simd16_mul_ps(chan3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));

    // float -> 32-bit integer, one value per lane
    simd16scalari ichan0 = _simd16_cvtps_epi32(chan0);
    simd16scalari ichan1 = _simd16_cvtps_epi32(chan1);
    simd16scalari ichan2 = _simd16_cvtps_epi32(chan2);
    simd16scalari ichan3 = _simd16_cvtps_epi32(chan3);

    // SOA to AOS conversion: move each component to its own byte of the
    // lane and merge
    ichan1 = _simd16_slli_epi32(ichan1, 8);
    ichan2 = _simd16_slli_epi32(ichan2, 16);
    ichan3 = _simd16_slli_epi32(ichan3, 24);

    simd16scalari packed = _simd16_or_si(_simd16_or_si(ichan0, ichan1), _simd16_or_si(ichan2, ichan3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F

    // de-swizzle conversion
#if 1
    simd16scalari evenQuads = _simd16_permute2f128_si(packed, packed, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
    simd16scalari oddQuads = _simd16_permute2f128_si(packed, packed, 0xF5);  // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F

    packed = _simd16_shuffle_epi64(evenQuads, oddQuads, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F

#else
    packed = _simd16_permute_epi32(packed, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));

#endif
    // store 8x2 memory order:
    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
    _simd_storeu2_si(reinterpret_cast<__m128i*>(pDst1), reinterpret_cast<__m128i*>(pDst0), _simd16_extract_si(packed, 0));
    _simd_storeu2_si(reinterpret_cast<__m128i*>(pDst3), reinterpret_cast<__m128i*>(pDst2), _simd16_extract_si(packed, 1));
}

#endif
691 template<SWR_FORMAT DstFormat>
692 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
693 {
694 static const uint32_t offset = sizeof(simdscalar);
695
696 // swizzle rgba -> bgra while we load
697 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
698 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
699 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
700 simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
701
702 // clamp
703 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
704 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
705
706 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
707 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
708
709 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
710 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
711
712 vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
713 vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
714
715 if (FormatTraits<DstFormat>::isSRGB)
716 {
717 // Gamma-correct only rgb
718 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
719 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
720 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
721 }
722
723 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
724 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
725 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
726 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
727 vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
728
729 // moving to 8 wide integer vector types
730 __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
731 __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
732 __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
733 __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
734
735 #if KNOB_ARCH == KNOB_ARCH_AVX
736
737 // splitting into two sets of 4 wide integer vector types
738 // because AVX doesn't have instructions to support this operation at 8 wide
739 __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
740 __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
741 __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
742 __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
743
744 __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
745 __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
746 __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
747 __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
748
749 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
750 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
751 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
752 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
753 srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
754 srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
755
756 srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
757 srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
758
759 srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
760 srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
761
762 srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
763 srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
764
765 // unpack into rows that get the tiling order correct
766 __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
767 __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
768
769 __m256i final = _mm256_castsi128_si256(vRow00);
770 final = _mm256_insertf128_si256(final, vRow10, 1);
771
772 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
773
774 // logic is as above, only wider
775 src1 = _mm256_slli_si256(src1, 1);
776 src2 = _mm256_slli_si256(src2, 2);
777 src3 = _mm256_slli_si256(src3, 3);
778
779 src0 = _mm256_or_si256(src0, src1);
780 src2 = _mm256_or_si256(src2, src3);
781
782 __m256i final = _mm256_or_si256(src0, src2);
783 #if 0
784
785 __m256i perm = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
786
787 final = _mm256_permutevar8x32_epi32(final, perm);
788 #else
789
790 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
791 final = _mm256_permute4x64_epi64(final, 0xD8);
792 #endif
793 #endif
794
795 _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
796 }
797
798 #if USE_8x2_TILE_BACKEND
799 template<SWR_FORMAT DstFormat>
800 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
801 {
802 // swizzle rgba -> bgra while we load
803 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
804 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
805 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
806
807 // clamp
808 const simd16scalar zero = _simd16_setzero_ps();
809 const simd16scalar ones = _simd16_set1_ps(1.0f);
810
811 comp0 = _simd16_max_ps(comp0, zero);
812 comp0 = _simd16_min_ps(comp0, ones);
813
814 comp1 = _simd16_max_ps(comp1, zero);
815 comp1 = _simd16_min_ps(comp1, ones);
816
817 comp2 = _simd16_max_ps(comp2, zero);
818 comp2 = _simd16_min_ps(comp2, ones);
819
820 // gamma-correct only rgb
821 if (FormatTraits<DstFormat>::isSRGB)
822 {
823 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
824 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
825 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
826 }
827
828 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
829 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
830 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
831 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
832
833 // moving to 16 wide integer vector types
834 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
835 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
836 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
837
838 // SOA to AOS conversion
839 src1 = _simd16_slli_epi32(src1, 8);
840 src2 = _simd16_slli_epi32(src2, 16);
841
842 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
843
844 // de-swizzle conversion
845 #if 1
846 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
847 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
848
849 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
850
851 #else
852 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
853
854 #endif
855 // store 8x2 memory order:
856 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
857 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
858 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
859 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
860 }
861
862 #endif
863 template<SWR_FORMAT DstFormat>
864 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
865 {
866 static const uint32_t offset = sizeof(simdscalar);
867
868 // swizzle rgba -> bgra while we load
869 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
870 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
871 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
872 // clamp
873 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
874 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
875
876 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
877 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
878
879 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
880 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
881
882 if (FormatTraits<DstFormat>::isSRGB)
883 {
884 // Gamma-correct only rgb
885 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
886 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
887 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
888 }
889
890 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
891 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
892 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
893 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
894
895 // moving to 8 wide integer vector types
896 __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
897 __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
898 __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
899
900 #if KNOB_ARCH == KNOB_ARCH_AVX
901
902 // splitting into two sets of 4 wide integer vector types
903 // because AVX doesn't have instructions to support this operation at 8 wide
904 __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
905 __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
906 __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
907
908 __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
909 __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
910 __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
911
912 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
913 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
914 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
915 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
916
917 srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
918
919 srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
920
921 srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
922 srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
923
924 // unpack into rows that get the tiling order correct
925 __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
926 __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
927
928 __m256i final = _mm256_castsi128_si256(vRow00);
929 final = _mm256_insertf128_si256(final, vRow10, 1);
930
931 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
932
933 // logic is as above, only wider
934 src1 = _mm256_slli_si256(src1, 1);
935 src2 = _mm256_slli_si256(src2, 2);
936
937 src0 = _mm256_or_si256(src0, src1);
938
939 __m256i final = _mm256_or_si256(src0, src2);
940
941 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
942 final = _mm256_permute4x64_epi64(final, 0xD8);
943
944 #endif
945
946 _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
947 }
948
949 template<>
950 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
951 {
952 template <size_t NumDests>
953 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
954 {
955 #if USE_8x2_TILE_BACKEND
956 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
957 #else
958 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
959 #endif
960 }
961 };
962
963 template<>
964 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
965 {
966 template <size_t NumDests>
967 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
968 {
969 #if USE_8x2_TILE_BACKEND
970 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
971 #else
972 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
973 #endif
974 }
975 };
976
977 template<>
978 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
979 {
980 template <size_t NumDests>
981 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
982 {
983 #if USE_8x2_TILE_BACKEND
984 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
985 #else
986 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
987 #endif
988 }
989 };
990
991 template<>
992 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
993 {
994 template <size_t NumDests>
995 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
996 {
997 #if USE_8x2_TILE_BACKEND
998 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
999 #else
1000 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1001 #endif
1002 }
1003 };
1004
1005 template<>
1006 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
1007 {
1008 template <size_t NumDests>
1009 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1010 {
1011 #if USE_8x2_TILE_BACKEND
1012 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1013 #else
1014 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1015 #endif
1016 }
1017 };
1018
1019 template<>
1020 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
1021 {
1022 template <size_t NumDests>
1023 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1024 {
1025 #if USE_8x2_TILE_BACKEND
1026 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1027 #else
1028 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1029 #endif
1030 }
1031 };
1032
1033 template<>
1034 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
1035 {
1036 template <size_t NumDests>
1037 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1038 {
1039 #if USE_8x2_TILE_BACKEND
1040 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1041 #else
1042 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1043 #endif
1044 }
1045 };
1046
1047 template<>
1048 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
1049 {
1050 template <size_t NumDests>
1051 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1052 {
1053 #if USE_8x2_TILE_BACKEND
1054 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1055 #else
1056 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1057 #endif
1058 }
1059 };
1060
1061 //////////////////////////////////////////////////////////////////////////
1062 /// StoreRasterTile
1063 //////////////////////////////////////////////////////////////////////////
1064 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1065 struct StoreRasterTile
1066 {
1067 //////////////////////////////////////////////////////////////////////////
1068 /// @brief Retrieve color from hot tile source which is always float.
1069 /// @param pSrc - Pointer to raster tile.
1070 /// @param x, y - Coordinates to raster tile.
1071 /// @param output - output color
1072 INLINE static void GetSwizzledSrcColor(
1073 uint8_t* pSrc,
1074 uint32_t x, uint32_t y,
1075 float outputColor[4])
1076 {
1077 #if USE_8x2_TILE_BACKEND
1078 typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
1079
1080 SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
1081
1082 // Compute which simd tile we're accessing within 8x8 tile.
1083 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1084 uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
1085
1086 SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
1087
1088 uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
1089
1090 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1091 #else
1092 typedef SimdTile<SrcFormat, DstFormat> SimdT;
1093
1094 SimdT* pSrcSimdTiles = (SimdT*)pSrc;
1095
1096 // Compute which simd tile we're accessing within 8x8 tile.
1097 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1098 uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
1099
1100 SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
1101
1102 uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
1103
1104 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1105 #endif
1106 }
1107
1108 //////////////////////////////////////////////////////////////////////////
1109 /// @brief Stores an 8x8 raster tile to the destination surface.
1110 /// @param pSrc - Pointer to raster tile.
1111 /// @param pDstSurface - Destination surface state
1112 /// @param x, y - Coordinates to raster tile.
1113 INLINE static void Store(
1114 uint8_t *pSrc,
1115 SWR_SURFACE_STATE* pDstSurface,
1116 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
1117 {
1118 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1119 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1120
1121 // For each raster tile pixel (rx, ry)
1122 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
1123 {
1124 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
1125 {
1126 // Perform bounds checking.
1127 if (((x + rx) < lodWidth) &&
1128 ((y + ry) < lodHeight))
1129 {
1130 float srcColor[4];
1131 GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
1132
1133 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1134 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
1135 sampleNum, pDstSurface->lod, pDstSurface);
1136 {
1137 ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
1138 }
1139 }
1140 }
1141 }
1142 }
1143 };
1144
1145 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1146 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
1147 {};
1148
1149 //////////////////////////////////////////////////////////////////////////
1150 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1151 //////////////////////////////////////////////////////////////////////////
1152 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1153 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1154 {
1155 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1156 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1157 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1158
1159 //////////////////////////////////////////////////////////////////////////
1160 /// @brief Stores an 8x8 raster tile to the destination surface.
1161 /// @param pSrc - Pointer to raster tile.
1162 /// @param pDstSurface - Destination surface state
1163 /// @param x, y - Coordinates to raster tile.
1164 INLINE static void Store(
1165 uint8_t *pSrc,
1166 SWR_SURFACE_STATE* pDstSurface,
1167 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1168 {
1169 // Punt non-full tiles to generic store
1170 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1171 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1172 if (x + KNOB_TILE_X_DIM > lodWidth ||
1173 y + KNOB_TILE_Y_DIM > lodHeight)
1174 {
1175 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1176 }
1177
1178 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1179 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1180 #if USE_8x2_TILE_BACKEND
1181
1182 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1183 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1184
1185 uint8_t* ppDsts[] =
1186 {
1187 pDst, // row 0, col 0
1188 pDst + pDstSurface->pitch, // row 1, col 0
1189 pDst + dx / 2, // row 0, col 1
1190 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1191 };
1192
1193 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1194 {
1195 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1196 {
1197 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1198
1199 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1200
1201 ppDsts[0] += dx;
1202 ppDsts[1] += dx;
1203 ppDsts[2] += dx;
1204 ppDsts[3] += dx;
1205 }
1206
1207 ppDsts[0] += dy;
1208 ppDsts[1] += dy;
1209 ppDsts[2] += dy;
1210 ppDsts[3] += dy;
1211 }
1212 #else
1213 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1214
1215 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1216 {
1217 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1218
1219 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1220 {
1221 // Format conversion and convert from SOA to AOS, and store the rows.
1222 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1223
1224 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1225 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1226 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1227 }
1228
1229 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1230 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1231 }
1232 #endif
1233 }
1234 };
1235
1236 //////////////////////////////////////////////////////////////////////////
1237 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1238 //////////////////////////////////////////////////////////////////////////
1239 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1240 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1241 {
1242 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1243 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1244 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1245
1246 //////////////////////////////////////////////////////////////////////////
1247 /// @brief Stores an 8x8 raster tile to the destination surface.
1248 /// @param pSrc - Pointer to raster tile.
1249 /// @param pDstSurface - Destination surface state
1250 /// @param x, y - Coordinates to raster tile.
1251 INLINE static void Store(
1252 uint8_t *pSrc,
1253 SWR_SURFACE_STATE* pDstSurface,
1254 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1255 {
1256 // Punt non-full tiles to generic store
1257 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1258 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1259 if (x + KNOB_TILE_X_DIM > lodWidth ||
1260 y + KNOB_TILE_Y_DIM > lodHeight)
1261 {
1262 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1263 }
1264
1265 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1266 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1267 #if USE_8x2_TILE_BACKEND
1268
1269 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1270 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1271
1272 uint8_t* ppDsts[] =
1273 {
1274 pDst, // row 0, col 0
1275 pDst + pDstSurface->pitch, // row 1, col 0
1276 pDst + dx / 2, // row 0, col 1
1277 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1278 };
1279
1280 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1281 {
1282 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1283 {
1284 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1285
1286 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1287
1288 ppDsts[0] += dx;
1289 ppDsts[1] += dx;
1290 ppDsts[2] += dx;
1291 ppDsts[3] += dx;
1292 }
1293
1294 ppDsts[0] += dy;
1295 ppDsts[1] += dy;
1296 ppDsts[2] += dy;
1297 ppDsts[3] += dy;
1298 }
1299 #else
1300 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1301
1302 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1303 {
1304 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1305
1306 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1307 {
1308 // Format conversion and convert from SOA to AOS, and store the rows.
1309 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1310
1311 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1312 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1313 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1314 }
1315
1316 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1317 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1318 }
1319 #endif
1320 }
1321 };
1322
1323 //////////////////////////////////////////////////////////////////////////
1324 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1325 //////////////////////////////////////////////////////////////////////////
1326 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1327 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1328 {
1329 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1330 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1331 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1332
1333 //////////////////////////////////////////////////////////////////////////
1334 /// @brief Stores an 8x8 raster tile to the destination surface.
1335 /// @param pSrc - Pointer to raster tile.
1336 /// @param pDstSurface - Destination surface state
1337 /// @param x, y - Coordinates to raster tile.
1338 INLINE static void Store(
1339 uint8_t *pSrc,
1340 SWR_SURFACE_STATE* pDstSurface,
1341 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1342 {
1343 // Punt non-full tiles to generic store
1344 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1345 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1346 if (x + KNOB_TILE_X_DIM > lodWidth ||
1347 y + KNOB_TILE_Y_DIM > lodHeight)
1348 {
1349 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1350 }
1351
1352 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1353 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1354 #if USE_8x2_TILE_BACKEND
1355
1356 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1357 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1358
1359 uint8_t* ppDsts[] =
1360 {
1361 pDst, // row 0, col 0
1362 pDst + pDstSurface->pitch, // row 1, col 0
1363 pDst + dx / 2, // row 0, col 1
1364 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1365 };
1366
1367 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1368 {
1369 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1370 {
1371 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1372
1373 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1374
1375 ppDsts[0] += dx;
1376 ppDsts[1] += dx;
1377 ppDsts[2] += dx;
1378 ppDsts[3] += dx;
1379 }
1380
1381 ppDsts[0] += dy;
1382 ppDsts[1] += dy;
1383 ppDsts[2] += dy;
1384 ppDsts[3] += dy;
1385 }
1386 #else
1387 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1388
1389 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1390 {
1391 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1392
1393 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1394 {
1395 // Format conversion and convert from SOA to AOS, and store the rows.
1396 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1397
1398 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1399 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1400 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1401 }
1402
1403 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1404 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1405 }
1406 #endif
1407 }
1408 };
1409
1410 //////////////////////////////////////////////////////////////////////////
1411 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1412 //////////////////////////////////////////////////////////////////////////
1413 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat >
1414 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1415 {
1416 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1417 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1418 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1419 static const size_t MAX_DST_COLUMN_BYTES = 16;
1420 #if !USE_8x2_TILE_BACKEND
1421 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1422 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1423 #endif
1424
1425 //////////////////////////////////////////////////////////////////////////
1426 /// @brief Stores an 8x8 raster tile to the destination surface.
1427 /// @param pSrc - Pointer to raster tile.
1428 /// @param pDstSurface - Destination surface state
1429 /// @param x, y - Coordinates to raster tile.
1430 INLINE static void Store(
1431 uint8_t *pSrc,
1432 SWR_SURFACE_STATE* pDstSurface,
1433 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1434 {
1435 // Punt non-full tiles to generic store
1436 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1437 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1438 if (x + KNOB_TILE_X_DIM > lodWidth ||
1439 y + KNOB_TILE_Y_DIM > lodHeight)
1440 {
1441 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1442 }
1443
1444 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1445 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1446 #if USE_8x2_TILE_BACKEND
1447
1448 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1449 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1450
1451 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1452 static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1453
1454 #if 1
1455 uint8_t *ppDsts[8];
1456
1457 {
1458 for (uint32_t y = 0; y < 2; y += 1)
1459 {
1460 for (uint32_t x = 0; x < 4; x += 1)
1461 {
1462 ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1463 }
1464 }
1465 }
1466
1467 #else
1468 uint8_t *ppDsts[] =
1469 {
1470 pDst, // row 0, col 0
1471 pDst + pDstSurface->pitch, // row 1, col 0
1472 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1473 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1474 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1475 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1476 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1477 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
1478 };
1479
1480 #endif
1481 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1482 {
1483 // Raster tile width is same as simd16 tile width
1484 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1485
1486 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1487
1488 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1489
1490 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1491 {
1492 ppDsts[i] += dy;
1493 }
1494 }
1495 #else
1496 uint8_t* ppDsts[] =
1497 {
1498 pDst, // row 0, col 0
1499 pDst + pDstSurface->pitch, // row 1, col 0
1500 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1501 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1502 };
1503
1504 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1505 {
1506 uint8_t* ppStartRows[] =
1507 {
1508 ppDsts[0],
1509 ppDsts[1],
1510 ppDsts[2],
1511 ppDsts[3],
1512 };
1513
1514 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1515 {
1516 // Format conversion and convert from SOA to AOS, and store the rows.
1517 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1518
1519 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1520 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1521 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1522 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1523 pSrc += SRC_COLUMN_BYTES;
1524 }
1525
1526 ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1527 ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1528 ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
1529 ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
1530 }
1531 #endif
1532 }
1533 };
1534
1535 //////////////////////////////////////////////////////////////////////////
1536 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1537 //////////////////////////////////////////////////////////////////////////
1538 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1539 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1540 {
1541 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1542 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1543 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1544 static const size_t MAX_DST_COLUMN_BYTES = 16;
1545 #if !USE_8x2_TILE_BACKEND
1546 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1547 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1548 #endif
1549
1550 //////////////////////////////////////////////////////////////////////////
1551 /// @brief Stores an 8x8 raster tile to the destination surface.
1552 /// @param pSrc - Pointer to raster tile.
1553 /// @param pDstSurface - Destination surface state
1554 /// @param x, y - Coordinates to raster tile.
1555 INLINE static void Store(
1556 uint8_t *pSrc,
1557 SWR_SURFACE_STATE* pDstSurface,
1558 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1559 {
1560 // Punt non-full tiles to generic store
1561 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1562 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1563 if (x + KNOB_TILE_X_DIM > lodWidth ||
1564 y + KNOB_TILE_Y_DIM > lodHeight)
1565 {
1566 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1567 }
1568
1569 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1570 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1571 #if USE_8x2_TILE_BACKEND
1572
1573 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1574 const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * pDstSurface->pitch; // double up on tile y dim, one simd16 tile will do twice the rows
1575
1576 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1577 static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1578
1579 #if 1
1580 uint8_t *ppDsts[16];
1581
1582 {
1583 for (uint32_t y = 0; y < 2; y += 1)
1584 {
1585 for (uint32_t x = 0; x < 4; x += 1)
1586 {
1587 ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1588 ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1589 }
1590 }
1591 }
1592
1593 #else
1594 uint8_t* ppDsts[] =
1595 {
1596 pDst, // row 0, col 0
1597 pDst + pDstSurface->pitch, // row 1, col 0
1598 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1599 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1600 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1601 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1602 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1603 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
1604
1605 pDst + pDstSurface->pitch * 2, // row 2, col 0
1606 pDst + pDstSurface->pitch * 3, // row 3, col 0
1607 pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES, // row 2, col 1
1608 pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES, // row 3, col 1
1609 pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 2, // row 2, col 2
1610 pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 2, // row 3, col 2
1611 pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 3, // row 2, col 3
1612 pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 3 // row 3, col 3
1613 };
1614
1615 #endif
1616 #if 1
1617 // Raster tile height is quadruple simd16 tile height
1618 static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim");
1619
1620 // Raster tile width is same as simd16 tile width
1621 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1622
1623 // tile rows 0 thru 3
1624 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1625
1626 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1627
1628 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1629 {
1630 ppDsts[i] += dy;
1631 }
1632
1633 // tile rows 4 thru 7
1634 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1635
1636 #else
1637 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2)
1638 {
1639 // Raster tile width is same as simd16 tile width
1640 static_assert(KNOB_TILE_X_DIM * 2 == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1641
1642 // Format conversion, convert from SOA to AOS, and store
1643 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1644
1645 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1646
1647 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1648 {
1649 ppDsts[i] += dy;
1650 }
1651 }
1652
1653 #endif
1654 #else
1655 struct DstPtrs
1656 {
1657 uint8_t* ppDsts[8];
1658 } ptrs;
1659
1660 // Need 8 pointers, 4 columns of 2 rows each
1661 for (uint32_t y = 0; y < 2; ++y)
1662 {
1663 for (uint32_t x = 0; x < 4; ++x)
1664 {
1665 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1666 }
1667 }
1668
1669 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1670 {
1671 DstPtrs startPtrs = ptrs;
1672
1673 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1674 {
1675 // Format conversion and convert from SOA to AOS, and store the rows.
1676 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1677
1678 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1679 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1680 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1681 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1682 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1683 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1684 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1685 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1686 pSrc += SRC_COLUMN_BYTES;
1687 }
1688
1689 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
1690 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
1691 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
1692 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
1693 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
1694 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
1695 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
1696 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
1697 }
1698 #endif
1699 }
1700 };
1701
1702 //////////////////////////////////////////////////////////////////////////
1703 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1704 //////////////////////////////////////////////////////////////////////////
1705 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1706 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1707 {
1708 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1709 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1710
1711 //////////////////////////////////////////////////////////////////////////
1712 /// @brief Stores an 8x8 raster tile to the destination surface.
1713 /// @param pSrc - Pointer to raster tile.
1714 /// @param pDstSurface - Destination surface state
1715 /// @param x, y - Coordinates to raster tile.
1716 INLINE static void Store(
1717 uint8_t *pSrc,
1718 SWR_SURFACE_STATE* pDstSurface,
1719 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1720 {
1721 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1722
1723 // Punt non-full tiles to generic store
1724 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1725 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1726 if (x + KNOB_TILE_X_DIM > lodWidth ||
1727 y + KNOB_TILE_Y_DIM > lodHeight)
1728 {
1729 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1730 }
1731
1732 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1733 // We can compute the offsets to each column within the raster tile once and increment from these.
1734 // There will be 2 x 4-wide columns in an 8x8 raster tile.
1735 #if USE_8x2_TILE_BACKEND
1736 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1737 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1738
1739 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1740
1741 uint8_t *ppDsts[] =
1742 {
1743 pDst,
1744 pDst + DestRowWidthBytes,
1745 pDst + DestRowWidthBytes / 4,
1746 pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1747 };
1748
1749 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1750 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1751 {
1752 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1753
1754 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1755
1756 ppDsts[0] += dy;
1757 ppDsts[1] += dy;
1758 ppDsts[2] += dy;
1759 ppDsts[3] += dy;
1760 }
1761 #else
1762 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1763 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1764
1765 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1766 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1767
1768 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1769 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1770 {
1771 uint32_t rowOffset = row * DestRowWidthBytes;
1772
1773 uint8_t* pRow = pCol0 + rowOffset;
1774 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1775
1776 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1777 pSrc += pSrcInc;
1778
1779 ppDsts[0] += DestRowWidthBytes / 4;
1780 ppDsts[1] += DestRowWidthBytes / 4;
1781
1782 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1783 pSrc += pSrcInc;
1784 }
1785 #endif
1786 }
1787 };
1788
1789 //////////////////////////////////////////////////////////////////////////
1790 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1791 //////////////////////////////////////////////////////////////////////////
1792 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1793 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1794 {
1795 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1796 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1797
1798 //////////////////////////////////////////////////////////////////////////
1799 /// @brief Stores an 8x8 raster tile to the destination surface.
1800 /// @param pSrc - Pointer to raster tile.
1801 /// @param pDstSurface - Destination surface state
1802 /// @param x, y - Coordinates to raster tile.
1803 INLINE static void Store(
1804 uint8_t *pSrc,
1805 SWR_SURFACE_STATE* pDstSurface,
1806 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1807 {
1808 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1809
1810 // Punt non-full tiles to generic store
1811 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1812 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1813 if (x + KNOB_TILE_X_DIM > lodWidth ||
1814 y + KNOB_TILE_Y_DIM > lodHeight)
1815 {
1816 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1817 }
1818
1819 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1820 // We can compute the offsets to each column within the raster tile once and increment from these.
1821 // There will be 2 x 4-wide columns in an 8x8 raster tile.
1822 #if USE_8x2_TILE_BACKEND
1823 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1824 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1825
1826 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1827
1828 uint8_t *ppDsts[] =
1829 {
1830 pDst,
1831 pDst + DestRowWidthBytes,
1832 pDst + DestRowWidthBytes / 2,
1833 pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1834 };
1835
1836 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1837 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1838 {
1839 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1840
1841 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1842
1843 ppDsts[0] += dy;
1844 ppDsts[1] += dy;
1845 ppDsts[2] += dy;
1846 ppDsts[3] += dy;
1847 }
1848 #else
1849 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1850 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1851
1852 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1853 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1854
1855 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1856 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1857 {
1858 uint32_t rowOffset = row * DestRowWidthBytes;
1859
1860 uint8_t* pRow = pCol0 + rowOffset;
1861 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1862
1863 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1864 pSrc += pSrcInc;
1865
1866 ppDsts[0] += DestRowWidthBytes / 2;
1867 ppDsts[1] += DestRowWidthBytes / 2;
1868
1869 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1870 pSrc += pSrcInc;
1871 }
1872 #endif
1873 }
1874 };
1875
1876 //////////////////////////////////////////////////////////////////////////
1877 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1878 //////////////////////////////////////////////////////////////////////////
1879 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1880 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1881 {
1882 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1883 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1884 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1885
1886 //////////////////////////////////////////////////////////////////////////
1887 /// @brief Stores an 8x8 raster tile to the destination surface.
1888 /// @param pSrc - Pointer to raster tile.
1889 /// @param pDstSurface - Destination surface state
1890 /// @param x, y - Coordinates to raster tile.
1891 INLINE static void Store(
1892 uint8_t *pSrc,
1893 SWR_SURFACE_STATE* pDstSurface,
1894 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1895 {
1896 static const uint32_t DestRowWidthBytes = 512; // 512B rows
1897
1898 // Punt non-full tiles to generic store
1899 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1900 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1901 if (x + KNOB_TILE_X_DIM > lodWidth ||
1902 y + KNOB_TILE_Y_DIM > lodHeight)
1903 {
1904 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1905 }
1906
1907 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1908 // We can compute the offsets to each column within the raster tile once and increment from these.
1909 #if USE_8x2_TILE_BACKEND
1910 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1911 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1912
1913 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1914 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1915
1916 uint8_t* ppDsts[] =
1917 {
1918 pDst, // row 0, col 0
1919 pDst + DestRowWidthBytes, // row 1, col 0
1920 pDst + dx / 2, // row 0, col 1
1921 pDst + DestRowWidthBytes + dx / 2 // row 1, col 1
1922 };
1923
1924 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1925 {
1926 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1927 {
1928 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1929
1930 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1931
1932 ppDsts[0] += dx;
1933 ppDsts[1] += dx;
1934 ppDsts[2] += dx;
1935 ppDsts[3] += dx;
1936 }
1937
1938 ppDsts[0] += dy;
1939 ppDsts[1] += dy;
1940 ppDsts[2] += dy;
1941 ppDsts[3] += dy;
1942 }
1943 #else
1944 uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1945 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1946 uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
1947
1948 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1949 {
1950 for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
1951 {
1952 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
1953
1954 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
1955 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1956
1957 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1958 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1959 }
1960
1961 pRow0 += (DestRowWidthBytes * 2);
1962 pRow1 += (DestRowWidthBytes * 2);
1963 }
1964 #endif
1965 }
1966 };
1967
1968 //////////////////////////////////////////////////////////////////////////
1969 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1970 //////////////////////////////////////////////////////////////////////////
1971 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1972 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1973 {
1974 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1975 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1976
1977 //////////////////////////////////////////////////////////////////////////
1978 /// @brief Stores an 8x8 raster tile to the destination surface.
1979 /// @param pSrc - Pointer to raster tile.
1980 /// @param pDstSurface - Destination surface state
1981 /// @param x, y - Coordinates to raster tile.
1982 INLINE static void Store(
1983 uint8_t *pSrc,
1984 SWR_SURFACE_STATE* pDstSurface,
1985 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1986 {
1987 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1988 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1989
1990 // Punt non-full tiles to generic store
1991 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1992 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1993 if (x + KNOB_TILE_X_DIM > lodWidth ||
1994 y + KNOB_TILE_Y_DIM > lodHeight)
1995 {
1996 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1997 }
1998
1999 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2000 // We can compute the offsets to each column within the raster tile once and increment from these.
2001 // There will be 2 x 4-wide columns in an 8x8 raster tile.
2002 #if USE_8x2_TILE_BACKEND
2003 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2004 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2005
2006 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2007
2008 uint8_t *ppDsts[] =
2009 {
2010 pDst,
2011 pDst + DestRowWidthBytes,
2012 pDst + DestColumnBytes,
2013 pDst + DestRowWidthBytes + DestColumnBytes
2014 };
2015
2016 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2017 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2018 {
2019 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2020
2021 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2022
2023 ppDsts[0] += dy;
2024 ppDsts[1] += dy;
2025 ppDsts[2] += dy;
2026 ppDsts[3] += dy;
2027 }
2028 #else
2029 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2030 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2031
2032 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2033 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2034
2035 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2036 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2037 {
2038 uint32_t rowOffset = row * DestRowWidthBytes;
2039
2040 uint8_t* pRow = pCol0 + rowOffset;
2041 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
2042
2043 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2044 pSrc += pSrcInc;
2045
2046 ppDsts[0] += DestColumnBytes;
2047 ppDsts[1] += DestColumnBytes;
2048
2049 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2050 pSrc += pSrcInc;
2051 }
2052 #endif
2053 }
2054 };
2055
2056 //////////////////////////////////////////////////////////////////////////
2057 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
2058 //////////////////////////////////////////////////////////////////////////
2059 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2060 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
2061 {
2062 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
2063 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2064
2065 //////////////////////////////////////////////////////////////////////////
2066 /// @brief Stores an 8x8 raster tile to the destination surface.
2067 /// @param pSrc - Pointer to raster tile.
2068 /// @param pDstSurface - Destination surface state
2069 /// @param x, y - Coordinates to raster tile.
2070 INLINE static void Store(
2071 uint8_t *pSrc,
2072 SWR_SURFACE_STATE* pDstSurface,
2073 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2074 {
2075 static const uint32_t DestRowWidthBytes = 16; // 16B rows
2076 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
2077
2078 // Punt non-full tiles to generic store
2079 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2080 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2081 if (x + KNOB_TILE_X_DIM > lodWidth ||
2082 y + KNOB_TILE_Y_DIM > lodHeight)
2083 {
2084 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2085 }
2086
2087 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2088 // We can compute the offsets to each column within the raster tile once and increment from these.
2089 // There will be 2 x 4-wide columns in an 8x8 raster tile.
2090 #if USE_8x2_TILE_BACKEND
2091 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2092 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2093
2094 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2095
2096 #if 1
2097 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2098 uint8_t *ppDsts[8];
2099
2100 {
2101 for (uint32_t y = 0; y < 2; y += 1)
2102 {
2103 for (uint32_t x = 0; x < 4; x += 1)
2104 {
2105 ppDsts[x * 2 + y] = pDst + y * DestRowWidthBytes + x * DestColumnBytes;
2106 }
2107 }
2108 }
2109
2110 #else
2111 uint8_t *ppDsts[] =
2112 {
2113 pDst,
2114 pDst + DestRowWidthBytes,
2115 pDst + DestColumnBytes,
2116 pDst + DestRowWidthBytes + DestColumnBytes,
2117 pDst + DestColumnBytes * 2,
2118 pDst + DestRowWidthBytes + DestColumnBytes * 2,
2119 pDst + DestColumnBytes * 3,
2120 pDst + DestRowWidthBytes + DestColumnBytes * 3
2121 };
2122
2123 #endif
2124 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2125 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2126 {
2127 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2128
2129 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2130
2131 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2132 {
2133 ppDsts[i] += dy;
2134 }
2135 }
2136 #else
2137 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2138 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2139 uint8_t* pCol1 = pCol0 + DestColumnBytes;
2140
2141 // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
2142 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2143 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2144
2145 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2146 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2147 {
2148 uint32_t rowOffset = row * DestRowWidthBytes;
2149 uint8_t* ppDsts[] =
2150 {
2151 pCol0 + rowOffset,
2152 pCol0 + rowOffset + DestRowWidthBytes,
2153 pCol1 + rowOffset,
2154 pCol1 + rowOffset + DestRowWidthBytes,
2155 };
2156
2157 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2158 pSrc += pSrcInc;
2159
2160 ppDsts[0] += DestColumnBytes * 2;
2161 ppDsts[1] += DestColumnBytes * 2;
2162 ppDsts[2] += DestColumnBytes * 2;
2163 ppDsts[3] += DestColumnBytes * 2;
2164
2165 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2166 pSrc += pSrcInc;
2167 }
2168 #endif
2169 }
2170 };
2171
2172 //////////////////////////////////////////////////////////////////////////
2173 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
2174 //////////////////////////////////////////////////////////////////////////
2175 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2176 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
2177 {
2178 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
2179 #if USE_8x2_TILE_BACKEND
2180 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2181
2182 #else
2183 static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
2184 static const size_t TILE_Y_ROWS = 32;
2185 static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
2186
2187 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
2188 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2189 static const size_t MAX_DST_COLUMN_BYTES = 16;
2190
2191 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
2192 static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
2193
2194 #endif
2195 //////////////////////////////////////////////////////////////////////////
2196 /// @brief Stores an 8x8 raster tile to the destination surface.
2197 /// @param pSrc - Pointer to raster tile.
2198 /// @param pDstSurface - Destination surface state
2199 /// @param x, y - Coordinates to raster tile.
2200 INLINE static void Store(
2201 uint8_t *pSrc,
2202 SWR_SURFACE_STATE* pDstSurface,
2203 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2204 {
2205 #if USE_8x2_TILE_BACKEND
2206 static const uint32_t DestRowWidthBytes = 16; // 16B rows
2207 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
2208 #endif
2209
2210 // Punt non-full tiles to generic store
2211 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2212 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2213 if (x + KNOB_TILE_X_DIM > lodWidth ||
2214 y + KNOB_TILE_Y_DIM > lodHeight)
2215 {
2216 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2217 }
2218
2219 #if USE_8x2_TILE_BACKEND
2220 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2221 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2222
2223 const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * DestRowWidthBytes; // double up on tile y dim, one simd16 tile will do twice the rows
2224
2225 #if 1
2226 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2227 uint8_t *ppDsts[16];
2228
2229 {
2230 for (uint32_t y = 0; y < 2; y += 1)
2231 {
2232 for (uint32_t x = 0; x < 4; x += 1)
2233 {
2234 ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * DestRowWidthBytes + x * DestColumnBytes;
2235 ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * DestRowWidthBytes + x * DestColumnBytes;
2236 }
2237 }
2238 }
2239
2240 #else
2241 uint8_t *ppDsts[] =
2242 {
2243 pDst,
2244 pDst + DestRowWidthBytes,
2245 pDst + DestColumnBytes,
2246 pDst + DestRowWidthBytes + DestColumnBytes,
2247 pDst + DestColumnBytes * 2,
2248 pDst + DestRowWidthBytes + DestColumnBytes * 2,
2249 pDst + DestColumnBytes * 3,
2250 pDst + DestRowWidthBytes + DestColumnBytes * 3,
2251
2252 pDst + DestRowWidthBytes * 2,
2253 pDst + DestRowWidthBytes * 3,
2254 pDst + DestRowWidthBytes * 2 + DestColumnBytes,
2255 pDst + DestRowWidthBytes * 3 + DestColumnBytes,
2256 pDst + DestRowWidthBytes * 2 + DestColumnBytes * 2,
2257 pDst + DestRowWidthBytes * 3 + DestColumnBytes * 2,
2258 pDst + DestRowWidthBytes * 2 + DestColumnBytes * 3,
2259 pDst + DestRowWidthBytes * 3 + DestColumnBytes * 3
2260 };
2261
2262 #endif
2263 #if 1
2264 // Raster tile height is quadruple simd16 tile height
2265 static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim");
2266
2267 // Raster tile width is same as simd16 tile width
2268 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2269
2270 // tile rows 0 thru 3
2271 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2272
2273 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2274
2275 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2276 {
2277 ppDsts[i] += dy;
2278 }
2279
2280 // tile rows 4 thru 7
2281 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2282 #else
2283 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2284 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2)
2285 {
2286 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2287
2288 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2289
2290 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2291 {
2292 ppDsts[i] += dy;
2293 }
2294 }
2295 #endif
2296 #else
2297 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2298 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2299 struct DstPtrs
2300 {
2301 uint8_t* ppDsts[8];
2302 } ptrs;
2303
2304 // Need 8 pointers, 4 columns of 2 rows each
2305 for (uint32_t y = 0; y < 2; ++y)
2306 {
2307 for (uint32_t x = 0; x < 4; ++x)
2308 {
2309 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
2310 }
2311 }
2312
2313 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
2314 {
2315 DstPtrs startPtrs = ptrs;
2316
2317 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
2318 {
2319 // Format conversion and convert from SOA to AOS, and store the rows.
2320 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
2321
2322 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
2323 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
2324 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
2325 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
2326 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
2327 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
2328 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
2329 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
2330 pSrc += SRC_COLUMN_BYTES;
2331 }
2332
2333 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
2334 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
2335 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
2336 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
2337 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
2338 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
2339 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
2340 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
2341 }
2342 #endif
2343 }
2344 };
2345
2346 //////////////////////////////////////////////////////////////////////////
2347 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
2348 //////////////////////////////////////////////////////////////////////////
2349 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2350 struct StoreMacroTile
2351 {
2352 //////////////////////////////////////////////////////////////////////////
2353 /// @brief Stores a macrotile to the destination surface using safe implementation.
2354 /// @param pSrc - Pointer to macro tile.
2355 /// @param pDstSurface - Destination surface state
2356 /// @param x, y - Coordinates to macro tile
2357 static void StoreGeneric(
2358 uint8_t *pSrcHotTile,
2359 SWR_SURFACE_STATE* pDstSurface,
2360 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2361 {
2362 PFN_STORE_TILES_INTERNAL pfnStore;
2363 pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2364
2365 // Store each raster tile from the hot tile to the destination surface.
2366 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2367 {
2368 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2369 {
2370 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2371 {
2372 pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2373 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2374 }
2375 }
2376 }
2377
2378 }
2379
2380 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
2381 //////////////////////////////////////////////////////////////////////////
2382 /// @brief Stores a macrotile to the destination surface.
2383 /// @param pSrc - Pointer to macro tile.
2384 /// @param pDstSurface - Destination surface state
2385 /// @param x, y - Coordinates to macro tile
2386 static void Store(
2387 uint8_t *pSrcHotTile,
2388 SWR_SURFACE_STATE* pDstSurface,
2389 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2390 {
2391 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
2392 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2393 {
2394 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
2395 0,
2396 0,
2397 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
2398 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
2399 sampleNum,
2400 pDstSurface->lod,
2401 pDstSurface);
2402
2403 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
2404 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
2405 (pDstSurface->bInterleavedSamples);
2406
2407 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2408 }
2409
2410 // Store each raster tile from the hot tile to the destination surface.
2411 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2412 {
2413 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2414 {
2415 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2416 {
2417 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2418 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2419 }
2420 }
2421 }
2422 }
2423 };
2424
2425 //////////////////////////////////////////////////////////////////////////
2426 /// InitStoreTilesTable - Helper for setting up the tables.
2427 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2428 void InitStoreTilesTableColor_Half1(
2429 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
2430 {
2431 table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
2432 table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
2433 table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
2434 table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
2435 table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
2436 table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
2437 table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
2438 table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
2439 table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
2440 table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
2441 table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
2442 table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
2443 table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
2444 table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
2445 table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
2446 table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
2447 table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
2448 table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
2449 table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
2450 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2451 table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
2452 table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
2453 table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
2454 table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
2455 table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
2456 table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
2457 table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
2458 table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
2459 table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
2460 table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
2461 table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
2462 table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
2463 table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
2464 table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
2465 table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
2466 table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
2467 table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
2468 table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
2469 table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
2470 table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
2471 table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
2472 table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
2473 table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
2474 table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
2475 table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
2476 table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
2477 table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
2478 table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
2479 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
2480 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
2481 table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
2482 table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
2483 table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
2484 table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
2485 table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
2486 table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
2487 }
2488
2489 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2490 void InitStoreTilesTableColor_Half2(
2491 PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
2492 {
2493 table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
2494 table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
2495 table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
2496 table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
2497 table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
2498 table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
2499 table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
2500 table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
2501 table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
2502 table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
2503 table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
2504 table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
2505 table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
2506 table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
2507 table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
2508 table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
2509 table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
2510 table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
2511 table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
2512 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
2513 table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
2514 table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
2515 table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
2516 table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
2517 table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
2518 table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
2519 table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
2520 table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
2521 table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
2522 table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
2523 table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
2524 table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
2525 table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
2526 table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
2527 table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
2528 table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
2529 table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
2530 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
2531 table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
2532 table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
2533 table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
2534 table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
2535 table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
2536 table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
2537 table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
2538 table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
2539 table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
2540 table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
2541 table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
2542 table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2543 table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2544 table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2545 table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2546 table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2547 table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2548 table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2549 table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2550 table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2551 table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2552 table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2553 table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2554 table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2555 table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2556 table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2557 }
2558
2559 //////////////////////////////////////////////////////////////////////////
2560 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2561 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2562 void InitStoreTilesTableDepth(
2563 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2564 {
2565 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2566 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2567 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2568 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2569 }
2570
2571 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2572 void InitStoreTilesTableStencil(
2573 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2574 {
2575 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2576 }