473ebaef751fa6f7c4e4c5bf2e048b391275e041
[mesa.git] / src / gallium / drivers / swr / rasterizer / memory / StoreTile.h
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.h
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
35
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "core/multisample.h"
39
40 #include <array>
41 #include <sstream>
42
// Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
// The signature takes a byte pointer, a surface-state pointer and three 32-bit unsigned
// values.  The parameters are unnamed in this declaration; consult the store-tile
// implementations for their meaning.
typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);

//////////////////////////////////////////////////////////////////////////
/// Store Raster Tile Function Tables.
/// One table each for color, depth and stencil.  Every table is
/// dimensioned [SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] and is defined
/// outside this header (declared extern here).
//////////////////////////////////////////////////////////////////////////
extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];

// Table initialization entry points; only declared here, defined elsewhere.
// NOTE(review): presumably each per-tile-mode function fills its slice of the
// tables above and InitStoreTilesTable() invokes them all -- confirm against
// the definitions.
void InitStoreTilesTable_Linear_1();
void InitStoreTilesTable_Linear_2();
void InitStoreTilesTable_TileX_1();
void InitStoreTilesTable_TileX_2();
void InitStoreTilesTable_TileY_1();
void InitStoreTilesTable_TileY_2();
void InitStoreTilesTable_TileW();
void InitStoreTilesTable();
61
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
///        This is the primary template: its Store() is deleted, so only
///        the explicit <PixelSize, NumDests> specializations that follow
///        can actually be called; any other combination fails to compile.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts   - Array of destination pointers.  Each pointer is
///                   to a single row of at most 16B.
/// @tparam PixelSize - Size of one pixel in bits (callers in this file
///                     pass FormatTraits<>::bpp).
/// @tparam NumDests - Number of destination pointers.  Each pair of
///                    pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
76
77 //////////////////////////////////////////////////////////////////////////
78 /// StorePixels (32-bit pixel specialization)
79 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
80 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
81 /// @param ppDsts - Array of destination pointers. Each pointer is
82 /// to a single row of at most 16B.
83 /// @tparam NumDests - Number of destination pointers. Each pair of
84 /// pointers is for a 16-byte column of two rows.
85 //////////////////////////////////////////////////////////////////////////
86 template <>
87 struct StorePixels<8, 2>
88 {
89 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
90 {
91 // Each 4-pixel row is 4 bytes.
92 const uint16_t* pPixSrc = (const uint16_t*)pSrc;
93
94 // Unswizzle from SWR-Z order
95 uint16_t* pRow = (uint16_t*)ppDsts[0];
96 pRow[0] = pPixSrc[0];
97 pRow[1] = pPixSrc[2];
98
99 pRow = (uint16_t*)ppDsts[1];
100 pRow[0] = pPixSrc[1];
101 pRow[1] = pPixSrc[3];
102 }
103 };
104
105 #if USE_8x2_TILE_BACKEND
106 template <>
107 struct StorePixels<8, 4>
108 {
109 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
110 {
111 // 8 x 2 bytes = 16 bytes, 16 pixels
112 const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
113
114 uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
115
116 // Unswizzle from SWR-Z order
117 ppDsts16[0][0] = pSrc16[0]; // 0 1
118 ppDsts16[0][1] = pSrc16[2]; // 4 5
119
120 ppDsts16[1][0] = pSrc16[1]; // 2 3
121 ppDsts16[1][1] = pSrc16[3]; // 6 7
122
123 ppDsts16[2][0] = pSrc16[4]; // 8 9
124 ppDsts16[2][1] = pSrc16[6]; // C D
125
126 ppDsts16[3][0] = pSrc16[5]; // A B
127 ppDsts16[3][1] = pSrc16[7]; // E F
128 }
129 };
130
131 #endif
132 //////////////////////////////////////////////////////////////////////////
133 /// StorePixels (32-bit pixel specialization)
134 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
135 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
136 /// @param ppDsts - Array of destination pointers. Each pointer is
137 /// to a single row of at most 16B.
138 /// @tparam NumDests - Number of destination pointers. Each pair of
139 /// pointers is for a 16-byte column of two rows.
140 //////////////////////////////////////////////////////////////////////////
141 template <>
142 struct StorePixels<16, 2>
143 {
144 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
145 {
146 // Each 4-pixel row is 8 bytes.
147 const uint32_t* pPixSrc = (const uint32_t*)pSrc;
148
149 // Unswizzle from SWR-Z order
150 uint32_t* pRow = (uint32_t*)ppDsts[0];
151 pRow[0] = pPixSrc[0];
152 pRow[1] = pPixSrc[2];
153
154 pRow = (uint32_t*)ppDsts[1];
155 pRow[0] = pPixSrc[1];
156 pRow[1] = pPixSrc[3];
157 }
158 };
159
160 #if USE_8x2_TILE_BACKEND
161 template <>
162 struct StorePixels<16, 4>
163 {
164 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
165 {
166 // 8 x 4 bytes = 32 bytes, 16 pixels
167 const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
168
169 uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
170
171 // Unswizzle from SWR-Z order
172 ppDsts32[0][0] = pSrc32[0]; // 0 1
173 ppDsts32[0][1] = pSrc32[2]; // 4 5
174
175 ppDsts32[1][0] = pSrc32[1]; // 2 3
176 ppDsts32[1][1] = pSrc32[3]; // 6 7
177
178 ppDsts32[2][0] = pSrc32[4]; // 8 9
179 ppDsts32[2][1] = pSrc32[6]; // C D
180
181 ppDsts32[3][0] = pSrc32[5]; // A B
182 ppDsts32[3][1] = pSrc32[7]; // E F
183 }
184 };
185
186 #endif
187 //////////////////////////////////////////////////////////////////////////
188 /// StorePixels (32-bit pixel specialization)
189 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
190 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
191 /// @param ppDsts - Array of destination pointers. Each pointer is
192 /// to a single row of at most 16B.
193 /// @tparam NumDests - Number of destination pointers. Each pair of
194 /// pointers is for a 16-byte column of two rows.
195 //////////////////////////////////////////////////////////////////////////
196 template <>
197 struct StorePixels<32, 2>
198 {
199 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
200 {
201 // Each 4-pixel row is 16-bytes
202 __m128i *pZRow01 = (__m128i*)pSrc;
203 __m128i vQuad00 = _mm_load_si128(pZRow01);
204 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
205
206 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
207 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
208
209 _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
210 _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
211 }
212 };
213
214 #if USE_8x2_TILE_BACKEND
215 template <>
216 struct StorePixels<32, 4>
217 {
218 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
219 {
220 // 4 x 16 bytes = 64 bytes, 16 pixels
221 const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
222
223 __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
224
225 // Unswizzle from SWR-Z order
226 __m128i quad0 = _mm_load_si128(&pSrc128[0]); // 0 1 2 3
227 __m128i quad1 = _mm_load_si128(&pSrc128[1]); // 4 5 6 7
228 __m128i quad2 = _mm_load_si128(&pSrc128[2]); // 8 9 A B
229 __m128i quad3 = _mm_load_si128(&pSrc128[3]); // C D E F
230
231 _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1)); // 0 1 4 5
232 _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1)); // 2 3 6 7
233 _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3)); // 8 9 C D
234 _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3)); // A B E F
235 }
236 };
237
238 #endif
239 //////////////////////////////////////////////////////////////////////////
240 /// StorePixels (32-bit pixel specialization)
241 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
242 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
243 /// @param ppDsts - Array of destination pointers. Each pointer is
244 /// to a single row of at most 16B.
245 /// @tparam NumDests - Number of destination pointers. Each pair of
246 /// pointers is for a 16-byte column of two rows.
247 //////////////////////////////////////////////////////////////////////////
248 template <>
249 struct StorePixels<64, 4>
250 {
251 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
252 {
253 // Each 4-pixel row is 32 bytes.
254 const __m128i* pPixSrc = (const __m128i*)pSrc;
255
256 // order of pointers match SWR-Z layout
257 __m128i** pvDsts = (__m128i**)&ppDsts[0];
258 *pvDsts[0] = pPixSrc[0];
259 *pvDsts[1] = pPixSrc[1];
260 *pvDsts[2] = pPixSrc[2];
261 *pvDsts[3] = pPixSrc[3];
262 }
263 };
264
265 #if USE_8x2_TILE_BACKEND
266 template <>
267 struct StorePixels<64, 8>
268 {
269 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
270 {
271 // 8 x 16 bytes = 128 bytes, 16 pixels
272 const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
273
274 __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
275
276 // order of pointers match SWR-Z layout
277 *ppDsts128[0] = pSrc128[0]; // 0 1
278 *ppDsts128[1] = pSrc128[1]; // 2 3
279 *ppDsts128[2] = pSrc128[2]; // 4 5
280 *ppDsts128[3] = pSrc128[3]; // 6 7
281 *ppDsts128[4] = pSrc128[4]; // 8 9
282 *ppDsts128[5] = pSrc128[5]; // A B
283 *ppDsts128[6] = pSrc128[6]; // C D
284 *ppDsts128[7] = pSrc128[7]; // E F
285 }
286 };
287
288 #endif
289 //////////////////////////////////////////////////////////////////////////
290 /// StorePixels (32-bit pixel specialization)
291 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
292 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
293 /// @param ppDsts - Array of destination pointers. Each pointer is
294 /// to a single row of at most 16B.
295 /// @tparam NumDests - Number of destination pointers. Each pair of
296 /// pointers is for a 16-byte column of two rows.
297 //////////////////////////////////////////////////////////////////////////
298 template <>
299 struct StorePixels<128, 8>
300 {
301 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
302 {
303 // Each 4-pixel row is 64 bytes.
304 const __m128i* pPixSrc = (const __m128i*)pSrc;
305
306 // Unswizzle from SWR-Z order
307 __m128i** pvDsts = (__m128i**)&ppDsts[0];
308 *pvDsts[0] = pPixSrc[0];
309 *pvDsts[1] = pPixSrc[2];
310 *pvDsts[2] = pPixSrc[1];
311 *pvDsts[3] = pPixSrc[3];
312 *pvDsts[4] = pPixSrc[4];
313 *pvDsts[5] = pPixSrc[6];
314 *pvDsts[6] = pPixSrc[5];
315 *pvDsts[7] = pPixSrc[7];
316 }
317 };
318
319 #if USE_8x2_TILE_BACKEND
320 template <>
321 struct StorePixels<128, 16>
322 {
323 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
324 {
325 // 16 x 16 bytes = 256 bytes, 16 pixels
326 const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
327
328 __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
329
330 for (uint32_t i = 0; i < 16; i += 4)
331 {
332 *ppDsts128[i + 0] = pSrc128[i + 0];
333 *ppDsts128[i + 1] = pSrc128[i + 2];
334 *ppDsts128[i + 2] = pSrc128[i + 1];
335 *ppDsts128[i + 3] = pSrc128[i + 3];
336 }
337 }
338 };
339
340 #endif
341 //////////////////////////////////////////////////////////////////////////
342 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
343 //////////////////////////////////////////////////////////////////////////
344 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
345 struct ConvertPixelsSOAtoAOS
346 {
347 //////////////////////////////////////////////////////////////////////////
348 /// @brief Converts a SIMD from the Hot Tile to the destination format
349 /// and converts from SOA to AOS.
350 /// @param pSrc - Pointer to raster tile.
351 /// @param pDst - Pointer to destination surface or deswizzling buffer.
352 template <size_t NumDests>
353 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
354 {
355 #if USE_8x2_TILE_BACKEND
356 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
357
358 OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
359 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
360
361 // Convert from SrcFormat --> DstFormat
362 simd16vector src;
363 LoadSOA<SrcFormat>(pSrc, src);
364 StoreSOA<DstFormat>(src, soaTile);
365
366 // Convert from SOA --> AOS
367 FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
368
369 #else
370 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
371
372 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
373 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
374
375 // Convert from SrcFormat --> DstFormat
376 simdvector src;
377 LoadSOA<SrcFormat>(pSrc, src);
378 StoreSOA<DstFormat>(src, soaTile);
379
380 // Convert from SOA --> AOS
381 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
382
383 #endif
384 // Store data into destination
385 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
386 }
387 };
388
389 //////////////////////////////////////////////////////////////////////////
390 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
391 /// Specialization for no format conversion
392 //////////////////////////////////////////////////////////////////////////
393 template<SWR_FORMAT Format>
394 struct ConvertPixelsSOAtoAOS<Format, Format>
395 {
396 //////////////////////////////////////////////////////////////////////////
397 /// @brief Converts a SIMD from the Hot Tile to the destination format
398 /// and converts from SOA to AOS.
399 /// @param pSrc - Pointer to raster tile.
400 /// @param pDst - Pointer to destination surface or deswizzling buffer.
401 template <size_t NumDests>
402 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
403 {
404 #if USE_8x2_TILE_BACKEND
405 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
406
407 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
408
409 // Convert from SOA --> AOS
410 FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
411
412 #else
413 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
414
415 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
416
417 // Convert from SOA --> AOS
418 FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
419
420 #endif
421 // Store data into destination
422 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
423 }
424 };
425
426 //////////////////////////////////////////////////////////////////////////
427 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
428 //////////////////////////////////////////////////////////////////////////
429 template<>
430 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
431 {
432 //////////////////////////////////////////////////////////////////////////
433 /// @brief Converts a SIMD from the Hot Tile to the destination format
434 /// and converts from SOA to AOS.
435 /// @param pSrc - Pointer to raster tile.
436 /// @param pDst - Pointer to destination surface or deswizzling buffer.
437 template <size_t NumDests>
438 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
439 {
440 #if USE_8x2_TILE_BACKEND
441 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
442 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
443
444 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
445
446 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
447
448 // Load hot-tile
449 simd16vector src, dst;
450 LoadSOA<SrcFormat>(pSrc, src);
451
452 // deswizzle
453 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
454 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
455 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
456
457 // clamp
458 dst.x = Clamp<DstFormat>(dst.x, 0);
459 dst.y = Clamp<DstFormat>(dst.y, 1);
460 dst.z = Clamp<DstFormat>(dst.z, 2);
461
462 // normalize
463 dst.x = Normalize<DstFormat>(dst.x, 0);
464 dst.y = Normalize<DstFormat>(dst.y, 1);
465 dst.z = Normalize<DstFormat>(dst.z, 2);
466
467 // pack
468 simd16scalari packed = _simd16_castps_si(dst.x);
469
470 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
471 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
472
473 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
474 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
475
476 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
477 uint32_t *pPacked = (uint32_t*)&packed;
478 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
479 for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
480 {
481 *pAosTile++ = *pPacked++;
482 }
483
484 #else
485 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
486 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
487 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
488
489 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
490
491 // Load hot-tile
492 simdvector src, dst;
493 LoadSOA<SrcFormat>(pSrc, src);
494
495 // deswizzle
496 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
497 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
498 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
499
500 // clamp
501 dst.x = Clamp<DstFormat>(dst.x, 0);
502 dst.y = Clamp<DstFormat>(dst.y, 1);
503 dst.z = Clamp<DstFormat>(dst.z, 2);
504
505 // normalize
506 dst.x = Normalize<DstFormat>(dst.x, 0);
507 dst.y = Normalize<DstFormat>(dst.y, 1);
508 dst.z = Normalize<DstFormat>(dst.z, 2);
509
510 // pack
511 simdscalari packed = _simd_castps_si(dst.x);
512 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
513 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
514 FormatTraits<DstFormat>::GetBPC(1)));
515
516 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
517 uint32_t *pPacked = (uint32_t*)&packed;
518 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
519 for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
520 {
521 *pAosTile++ = *pPacked++;
522 }
523
524 #endif
525 // Store data into destination
526 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
527 }
528 };
529
530 //////////////////////////////////////////////////////////////////////////
531 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
532 //////////////////////////////////////////////////////////////////////////
533 template<>
534 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
535 {
536 static const SWR_FORMAT SrcFormat = R32_FLOAT;
537 static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
538
539 //////////////////////////////////////////////////////////////////////////
540 /// @brief Converts a SIMD from the Hot Tile to the destination format
541 /// and converts from SOA to AOS.
542 /// @param pSrc - Pointer to raster tile.
543 /// @param pDst - Pointer to destination surface or deswizzling buffer.
544 template <size_t NumDests>
545 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
546 {
547 #if USE_8x2_TILE_BACKEND
548 simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
549
550 // clamp
551 const simd16scalar zero = _simd16_setzero_ps();
552 const simd16scalar ones = _simd16_set1_ps(1.0f);
553
554 comp = _simd16_max_ps(comp, zero);
555 comp = _simd16_min_ps(comp, ones);
556
557 // normalize
558 comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
559
560 simd16scalari temp = _simd16_cvtps_epi32(comp);
561
562 // swizzle
563 temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
564
565 // merge/store data into destination but don't overwrite the X8 bits
566 simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
567 simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
568
569 simd16scalari dest = _simd16_setzero_si();
570
571 dest = _simd16_insert_si(dest, destlo, 0);
572 dest = _simd16_insert_si(dest, desthi, 1);
573
574 simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
575
576 dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
577
578 _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
579 _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
580 #else
581 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
582
583 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
584 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
585
586 // Convert from SrcFormat --> DstFormat
587 simdvector src;
588 LoadSOA<SrcFormat>(pSrc, src);
589 StoreSOA<DstFormat>(src, soaTile);
590
591 // Convert from SOA --> AOS
592 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
593
594 // Store data into destination but don't overwrite the X8 bits
595 // Each 4-pixel row is 16-bytes
596 __m128i *pZRow01 = (__m128i*)aosTile;
597 __m128i vQuad00 = _mm_load_si128(pZRow01);
598 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
599
600 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
601 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
602
603 __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
604 __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
605
606 __m128i vMask = _mm_set1_epi32(0xFFFFFF);
607
608 vDst0 = _mm_andnot_si128(vMask, vDst0);
609 vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
610 vDst1 = _mm_andnot_si128(vMask, vDst1);
611 vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
612
613 _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
614 _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
615 #endif
616 }
617 };
618
619 #if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// FlatConvert (SIMD16 variant)
/// @brief Converts 16 pixels of SOA float color data into packed 32-bit
///        pixels (one byte per component) and writes them out as four
///        16-byte pieces.  Each component is clamped to [0, 1],
///        optionally sRGB-converted (first three components only),
///        scaled by FormatTraits<DstFormat>::fromFloat and converted to
///        integer before packing.
/// @param pSrc  - Pointer to four consecutive simd16scalar-sized planes
///                of float components; read with _simd16_load_ps.
/// @param pDst0 - Destination for pixels 0 1 4 5
/// @param pDst1 - Destination for pixels 2 3 6 7
/// @param pDst2 - Destination for pixels 8 9 C D
/// @param pDst3 - Destination for pixels A B E F
/// @tparam DstFormat - Destination format; supplies swizzle, isSRGB and
///                     fromFloat.
//////////////////////////////////////////////////////////////////////////
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
{
    // swizzle rgba -> bgra while we load
    // (each component plane is picked by the destination format's swizzle index)
    simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
    simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
    simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
    simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa

    // clamp every component to [0.0, 1.0]
    const simd16scalar zero = _simd16_setzero_ps();
    const simd16scalar ones = _simd16_set1_ps(1.0f);

    comp0 = _simd16_max_ps(comp0, zero);
    comp0 = _simd16_min_ps(comp0, ones);

    comp1 = _simd16_max_ps(comp1, zero);
    comp1 = _simd16_min_ps(comp1, ones);

    comp2 = _simd16_max_ps(comp2, zero);
    comp2 = _simd16_min_ps(comp2, ones);

    comp3 = _simd16_max_ps(comp3, zero);
    comp3 = _simd16_min_ps(comp3, ones);

    // gamma-correct only rgb
    if (FormatTraits<DstFormat>::isSRGB)
    {
        comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
        comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
        comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
    }

    // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
    comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));

    // moving to 16 wide integer vector types
    simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
    simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
    simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
    simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa

    // SOA to AOS conversion
    // (move components 1..3 into bytes 1..3 of each 32-bit lane, then OR all four together)
    src1 = _simd16_slli_epi32(src1, 8);
    src2 = _simd16_slli_epi32(src2, 16);
    src3 = _simd16_slli_epi32(src3, 24);

    simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F

    // de-swizzle conversion
    // (two alternative implementations producing the same pixel order; the first is the one compiled in)
#if 1
    simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
    simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F

    final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F

#else
    final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));

#endif
    // store 8x2 memory order:
    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
}
689
690 #endif
//////////////////////////////////////////////////////////////////////////
/// FlatConvert (SIMD8 variant)
/// @brief Converts 8 pixels of SOA float color data into packed 32-bit
///        pixels (one byte per component) and writes them out as two
///        16-byte rows.  Each component is clamped to [0, 1], optionally
///        sRGB-converted (first three components only), scaled by
///        FormatTraits<DstFormat>::fromFloat and converted to integer
///        before packing.
/// @param pSrc  - Pointer to four consecutive simdscalar-sized planes of
///                float components; read with _simd_load_ps.
/// @param pDst  - Destination for the low 128 bits of the packed result
///                (first row).
/// @param pDst1 - Destination for the high 128 bits of the packed result
///                (second row).
/// @tparam DstFormat - Destination format; supplies swizzle, isSRGB and
///                     fromFloat.
/// @note Only KNOB_ARCH == KNOB_ARCH_AVX and KNOB_ARCH >= KNOB_ARCH_AVX2
///       are handled; no other case declares 'final'.
//////////////////////////////////////////////////////////////////////////
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
{
    // byte distance between consecutive component planes in pSrc
    static const uint32_t offset = sizeof(simdscalar);

    // swizzle rgba -> bgra while we load
    // (each component plane is picked by the destination format's swizzle index)
    simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
    simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
    simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
    simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa

    // clamp every component to [0.0, 1.0]
    vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
    vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));

    vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
    vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));

    vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
    vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));

    vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
    vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));

    if (FormatTraits<DstFormat>::isSRGB)
    {
        // Gamma-correct only rgb
        vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
        vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
        vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
    }

    // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
    vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));

    // moving to 8 wide integer vector types
    __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
    __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
    __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
    __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa

#if KNOB_ARCH == KNOB_ARCH_AVX

    // splitting into two sets of 4 wide integer vector types
    // because AVX doesn't have instructions to support this operation at 8 wide
    __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
    __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
    __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
    __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a

    __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
    __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
    __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
    __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a

    // byte-shift components 1..3 into byte positions 1..3 of each 32-bit lane
    srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
    srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
    srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
    srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
    srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
    srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000

    srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
    srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00

    srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
    srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00

    srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
    srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr

    // unpack into rows that get the tiling order correct
    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);

    // recombine the two rows into one 256-bit value: first row low, second row high
    __m256i final = _mm256_castsi128_si256(vRow00);
    final = _mm256_insertf128_si256(final, vRow10, 1);

#elif KNOB_ARCH >= KNOB_ARCH_AVX2

    // logic is as above, only wider
    src1 = _mm256_slli_si256(src1, 1);
    src2 = _mm256_slli_si256(src2, 2);
    src3 = _mm256_slli_si256(src3, 3);

    src0 = _mm256_or_si256(src0, src1);
    src2 = _mm256_or_si256(src2, src3);

    __m256i final = _mm256_or_si256(src0, src2);
#if 0

    __m256i perm = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    final = _mm256_permutevar8x32_epi32(final, perm);
#else

    // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
    final = _mm256_permute4x64_epi64(final, 0xD8);
#endif
#endif

    // write the two 128-bit halves to their separate row pointers
    _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
}
797
798 #if USE_8x2_TILE_BACKEND
799 template<SWR_FORMAT DstFormat>
800 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
801 {
802 // swizzle rgba -> bgra while we load
803 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
804 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
805 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
806
807 // clamp
808 const simd16scalar zero = _simd16_setzero_ps();
809 const simd16scalar ones = _simd16_set1_ps(1.0f);
810
811 comp0 = _simd16_max_ps(comp0, zero);
812 comp0 = _simd16_min_ps(comp0, ones);
813
814 comp1 = _simd16_max_ps(comp1, zero);
815 comp1 = _simd16_min_ps(comp1, ones);
816
817 comp2 = _simd16_max_ps(comp2, zero);
818 comp2 = _simd16_min_ps(comp2, ones);
819
820 // gamma-correct only rgb
821 if (FormatTraits<DstFormat>::isSRGB)
822 {
823 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
824 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
825 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
826 }
827
828 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
829 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
830 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
831 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
832
833 // moving to 16 wide integer vector types
834 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
835 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
836 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
837
838 // SOA to AOS conversion
839 src1 = _simd16_slli_epi32(src1, 8);
840 src2 = _simd16_slli_epi32(src2, 16);
841
842 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
843
844 // de-swizzle conversion
845 #if 1
846 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
847 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
848
849 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
850
851 #else
852 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
853
854 #endif
855 // store 8x2 memory order:
856 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
857 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
858 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
859 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
860 }
861
862 #endif
863 template<SWR_FORMAT DstFormat>
864 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
865 {
866 static const uint32_t offset = sizeof(simdscalar);
867
868 // swizzle rgba -> bgra while we load
869 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
870 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
871 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
872 // clamp
873 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
874 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
875
876 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
877 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
878
879 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
880 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
881
882 if (FormatTraits<DstFormat>::isSRGB)
883 {
884 // Gamma-correct only rgb
885 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
886 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
887 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
888 }
889
890 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
891 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
892 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
893 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
894
895 // moving to 8 wide integer vector types
896 __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
897 __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
898 __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
899
900 #if KNOB_ARCH == KNOB_ARCH_AVX
901
902 // splitting into two sets of 4 wide integer vector types
903 // because AVX doesn't have instructions to support this operation at 8 wide
904 __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
905 __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
906 __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
907
908 __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
909 __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
910 __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
911
912 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
913 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
914 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
915 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
916
917 srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
918
919 srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
920
921 srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
922 srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
923
924 // unpack into rows that get the tiling order correct
925 __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
926 __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
927
928 __m256i final = _mm256_castsi128_si256(vRow00);
929 final = _mm256_insertf128_si256(final, vRow10, 1);
930
931 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
932
933 // logic is as above, only wider
934 src1 = _mm256_slli_si256(src1, 1);
935 src2 = _mm256_slli_si256(src2, 2);
936
937 src0 = _mm256_or_si256(src0, src1);
938
939 __m256i final = _mm256_or_si256(src0, src2);
940
941 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
942 final = _mm256_permute4x64_epi64(final, 0xD8);
943
944 #endif
945
946 _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
947 }
948
949 template<>
950 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
951 {
952 template <size_t NumDests>
953 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
954 {
955 #if USE_8x2_TILE_BACKEND
956 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
957 #else
958 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
959 #endif
960 }
961 };
962
963 template<>
964 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
965 {
966 template <size_t NumDests>
967 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
968 {
969 #if USE_8x2_TILE_BACKEND
970 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
971 #else
972 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
973 #endif
974 }
975 };
976
977 template<>
978 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
979 {
980 template <size_t NumDests>
981 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
982 {
983 #if USE_8x2_TILE_BACKEND
984 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
985 #else
986 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
987 #endif
988 }
989 };
990
991 template<>
992 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
993 {
994 template <size_t NumDests>
995 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
996 {
997 #if USE_8x2_TILE_BACKEND
998 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
999 #else
1000 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1001 #endif
1002 }
1003 };
1004
1005 template<>
1006 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
1007 {
1008 template <size_t NumDests>
1009 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1010 {
1011 #if USE_8x2_TILE_BACKEND
1012 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1013 #else
1014 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1015 #endif
1016 }
1017 };
1018
1019 template<>
1020 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
1021 {
1022 template <size_t NumDests>
1023 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1024 {
1025 #if USE_8x2_TILE_BACKEND
1026 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1027 #else
1028 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1029 #endif
1030 }
1031 };
1032
1033 template<>
1034 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
1035 {
1036 template <size_t NumDests>
1037 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1038 {
1039 #if USE_8x2_TILE_BACKEND
1040 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1041 #else
1042 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1043 #endif
1044 }
1045 };
1046
1047 template<>
1048 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
1049 {
1050 template <size_t NumDests>
1051 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1052 {
1053 #if USE_8x2_TILE_BACKEND
1054 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1055 #else
1056 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1057 #endif
1058 }
1059 };
1060
1061 //////////////////////////////////////////////////////////////////////////
1062 /// StoreRasterTile
1063 //////////////////////////////////////////////////////////////////////////
1064 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1065 struct StoreRasterTile
1066 {
1067 //////////////////////////////////////////////////////////////////////////
1068 /// @brief Retrieve color from hot tile source which is always float.
1069 /// @param pSrc - Pointer to raster tile.
1070 /// @param x, y - Coordinates to raster tile.
1071 /// @param output - output color
1072 INLINE static void GetSwizzledSrcColor(
1073 uint8_t* pSrc,
1074 uint32_t x, uint32_t y,
1075 float outputColor[4])
1076 {
1077 #if USE_8x2_TILE_BACKEND
1078 typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
1079
1080 SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
1081
1082 // Compute which simd tile we're accessing within 8x8 tile.
1083 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1084 uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
1085
1086 SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
1087
1088 uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
1089
1090 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1091 #else
1092 typedef SimdTile<SrcFormat, DstFormat> SimdT;
1093
1094 SimdT* pSrcSimdTiles = (SimdT*)pSrc;
1095
1096 // Compute which simd tile we're accessing within 8x8 tile.
1097 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1098 uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
1099
1100 SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
1101
1102 uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
1103
1104 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1105 #endif
1106 }
1107
1108 //////////////////////////////////////////////////////////////////////////
1109 /// @brief Stores an 8x8 raster tile to the destination surface.
1110 /// @param pSrc - Pointer to raster tile.
1111 /// @param pDstSurface - Destination surface state
1112 /// @param x, y - Coordinates to raster tile.
1113 INLINE static void Store(
1114 uint8_t *pSrc,
1115 SWR_SURFACE_STATE* pDstSurface,
1116 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
1117 {
1118 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1119 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1120
1121 // For each raster tile pixel (rx, ry)
1122 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
1123 {
1124 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
1125 {
1126 // Perform bounds checking.
1127 if (((x + rx) < lodWidth) &&
1128 ((y + ry) < lodHeight))
1129 {
1130 float srcColor[4];
1131 GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
1132
1133 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1134 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
1135 sampleNum, pDstSurface->lod, pDstSurface);
1136 {
1137 ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
1138 }
1139 }
1140 }
1141 }
1142 }
1143 };
1144
1145 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1146 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
1147 {};
1148
1149 //////////////////////////////////////////////////////////////////////////
1150 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1151 //////////////////////////////////////////////////////////////////////////
1152 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1153 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1154 {
1155 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1156 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1157 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1158
1159 //////////////////////////////////////////////////////////////////////////
1160 /// @brief Stores an 8x8 raster tile to the destination surface.
1161 /// @param pSrc - Pointer to raster tile.
1162 /// @param pDstSurface - Destination surface state
1163 /// @param x, y - Coordinates to raster tile.
1164 INLINE static void Store(
1165 uint8_t *pSrc,
1166 SWR_SURFACE_STATE* pDstSurface,
1167 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1168 {
1169 // Punt non-full tiles to generic store
1170 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1171 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1172
1173 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1174 {
1175 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1176 }
1177
1178 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1179 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1180 #if USE_8x2_TILE_BACKEND
1181
1182 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1183 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1184
1185 uint8_t* ppDsts[] =
1186 {
1187 pDst, // row 0, col 0
1188 pDst + pDstSurface->pitch, // row 1, col 0
1189 pDst + dx / 2, // row 0, col 1
1190 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1191 };
1192
1193 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1194 {
1195 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1196 {
1197 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1198
1199 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1200
1201 ppDsts[0] += dx;
1202 ppDsts[1] += dx;
1203 ppDsts[2] += dx;
1204 ppDsts[3] += dx;
1205 }
1206
1207 ppDsts[0] += dy;
1208 ppDsts[1] += dy;
1209 ppDsts[2] += dy;
1210 ppDsts[3] += dy;
1211 }
1212 #else
1213 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1214
1215 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1216 {
1217 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1218
1219 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1220 {
1221 // Format conversion and convert from SOA to AOS, and store the rows.
1222 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1223
1224 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1225 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1226 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1227 }
1228
1229 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1230 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1231 }
1232 #endif
1233 }
1234 };
1235
1236 //////////////////////////////////////////////////////////////////////////
1237 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1238 //////////////////////////////////////////////////////////////////////////
1239 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1240 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1241 {
1242 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1243 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1244 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1245
1246 //////////////////////////////////////////////////////////////////////////
1247 /// @brief Stores an 8x8 raster tile to the destination surface.
1248 /// @param pSrc - Pointer to raster tile.
1249 /// @param pDstSurface - Destination surface state
1250 /// @param x, y - Coordinates to raster tile.
1251 INLINE static void Store(
1252 uint8_t *pSrc,
1253 SWR_SURFACE_STATE* pDstSurface,
1254 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1255 {
1256 // Punt non-full tiles to generic store
1257 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1258 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1259
1260 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1261 {
1262 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1263 }
1264
1265 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1266 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1267 #if USE_8x2_TILE_BACKEND
1268
1269 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1270 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1271
1272 uint8_t* ppDsts[] =
1273 {
1274 pDst, // row 0, col 0
1275 pDst + pDstSurface->pitch, // row 1, col 0
1276 pDst + dx / 2, // row 0, col 1
1277 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1278 };
1279
1280 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1281 {
1282 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1283 {
1284 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1285
1286 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1287
1288 ppDsts[0] += dx;
1289 ppDsts[1] += dx;
1290 ppDsts[2] += dx;
1291 ppDsts[3] += dx;
1292 }
1293
1294 ppDsts[0] += dy;
1295 ppDsts[1] += dy;
1296 ppDsts[2] += dy;
1297 ppDsts[3] += dy;
1298 }
1299 #else
1300 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1301
1302 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1303 {
1304 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1305
1306 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1307 {
1308 // Format conversion and convert from SOA to AOS, and store the rows.
1309 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1310
1311 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1312 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1313 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1314 }
1315
1316 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1317 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1318 }
1319 #endif
1320 }
1321 };
1322
1323 //////////////////////////////////////////////////////////////////////////
1324 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1325 //////////////////////////////////////////////////////////////////////////
1326 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1327 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1328 {
1329 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1330 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1331 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1332
1333 //////////////////////////////////////////////////////////////////////////
1334 /// @brief Stores an 8x8 raster tile to the destination surface.
1335 /// @param pSrc - Pointer to raster tile.
1336 /// @param pDstSurface - Destination surface state
1337 /// @param x, y - Coordinates to raster tile.
1338 INLINE static void Store(
1339 uint8_t *pSrc,
1340 SWR_SURFACE_STATE* pDstSurface,
1341 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1342 {
1343 // Punt non-full tiles to generic store
1344 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1345 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1346
1347 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1348 {
1349 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1350 }
1351
1352 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1353 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1354 #if USE_8x2_TILE_BACKEND
1355
1356 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1357 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1358
1359 uint8_t* ppDsts[] =
1360 {
1361 pDst, // row 0, col 0
1362 pDst + pDstSurface->pitch, // row 1, col 0
1363 pDst + dx / 2, // row 0, col 1
1364 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1365 };
1366
1367 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1368 {
1369 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1370 {
1371 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1372
1373 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1374
1375 ppDsts[0] += dx;
1376 ppDsts[1] += dx;
1377 ppDsts[2] += dx;
1378 ppDsts[3] += dx;
1379 }
1380
1381 ppDsts[0] += dy;
1382 ppDsts[1] += dy;
1383 ppDsts[2] += dy;
1384 ppDsts[3] += dy;
1385 }
1386 #else
1387 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1388
1389 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1390 {
1391 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1392
1393 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1394 {
1395 // Format conversion and convert from SOA to AOS, and store the rows.
1396 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1397
1398 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1399 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1400 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1401 }
1402
1403 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1404 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1405 }
1406 #endif
1407 }
1408 };
1409
1410 //////////////////////////////////////////////////////////////////////////
1411 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1412 //////////////////////////////////////////////////////////////////////////
1413 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1414 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1415 {
1416 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1417 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1418 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1419 static const size_t MAX_DST_COLUMN_BYTES = 16;
1420 #if !USE_8x2_TILE_BACKEND
1421 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1422 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1423 #endif
1424
1425 //////////////////////////////////////////////////////////////////////////
1426 /// @brief Stores an 8x8 raster tile to the destination surface.
1427 /// @param pSrc - Pointer to raster tile.
1428 /// @param pDstSurface - Destination surface state
1429 /// @param x, y - Coordinates to raster tile.
1430 INLINE static void Store(
1431 uint8_t *pSrc,
1432 SWR_SURFACE_STATE* pDstSurface,
1433 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1434 {
1435 // Punt non-full tiles to generic store
1436 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1437 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1438
1439 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1440 {
1441 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1442 }
1443
1444 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1445 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1446 #if USE_8x2_TILE_BACKEND
1447
1448 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1449 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1450
1451 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1452 static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1453
1454 uint8_t *ppDsts[] =
1455 {
1456 pDst, // row 0, col 0
1457 pDst + pDstSurface->pitch, // row 1, col 0
1458 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1459 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1460 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1461 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1462 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1463 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
1464 };
1465
1466 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1467 {
1468 // Raster tile width is same as simd16 tile width
1469 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1470
1471 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1472
1473 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1474
1475 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1476 {
1477 ppDsts[i] += dy;
1478 }
1479 }
1480 #else
1481 uint8_t* ppDsts[] =
1482 {
1483 pDst, // row 0, col 0
1484 pDst + pDstSurface->pitch, // row 1, col 0
1485 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1486 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1487 };
1488
1489 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1490 {
1491 uint8_t* ppStartRows[] =
1492 {
1493 ppDsts[0],
1494 ppDsts[1],
1495 ppDsts[2],
1496 ppDsts[3],
1497 };
1498
1499 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1500 {
1501 // Format conversion and convert from SOA to AOS, and store the rows.
1502 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1503
1504 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1505 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1506 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1507 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1508 pSrc += SRC_COLUMN_BYTES;
1509 }
1510
1511 ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1512 ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1513 ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
1514 ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
1515 }
1516 #endif
1517 }
1518 };
1519
1520 //////////////////////////////////////////////////////////////////////////
1521 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1522 //////////////////////////////////////////////////////////////////////////
1523 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1524 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1525 {
1526 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1527 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1528 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1529 static const size_t MAX_DST_COLUMN_BYTES = 16;
1530 #if !USE_8x2_TILE_BACKEND
1531 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1532 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1533 #endif
1534
1535 //////////////////////////////////////////////////////////////////////////
1536 /// @brief Stores an 8x8 raster tile to the destination surface.
1537 /// @param pSrc - Pointer to raster tile.
1538 /// @param pDstSurface - Destination surface state
1539 /// @param x, y - Coordinates to raster tile.
1540 INLINE static void Store(
1541 uint8_t *pSrc,
1542 SWR_SURFACE_STATE* pDstSurface,
1543 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1544 {
1545 // Punt non-full tiles to generic store
1546 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1547 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1548
1549 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1550 {
1551 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1552 }
1553
1554 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1555 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1556 #if USE_8x2_TILE_BACKEND
1557
1558 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1559 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1560
1561 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1562 static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1563
1564 uint8_t* ppDsts[] =
1565 {
1566 pDst, // row 0, col 0
1567 pDst + pDstSurface->pitch, // row 1, col 0
1568 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1569 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1570 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1571 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1572 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1573 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
1574 pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4
1575 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4
1576 pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5
1577 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5
1578 pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6
1579 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6
1580 pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7
1581 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7
1582 };
1583
1584 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1585 {
1586 // Raster tile width is same as simd16 tile width
1587 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1588
1589 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1590
1591 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1592
1593 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1594 {
1595 ppDsts[i] += dy;
1596 }
1597 }
1598 #else
1599 struct DstPtrs
1600 {
1601 uint8_t* ppDsts[8];
1602 } ptrs;
1603
1604 // Need 8 pointers, 4 columns of 2 rows each
1605 for (uint32_t y = 0; y < 2; ++y)
1606 {
1607 for (uint32_t x = 0; x < 4; ++x)
1608 {
1609 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1610 }
1611 }
1612
1613 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1614 {
1615 DstPtrs startPtrs = ptrs;
1616
1617 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1618 {
1619 // Format conversion and convert from SOA to AOS, and store the rows.
1620 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1621
1622 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1623 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1624 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1625 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1626 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1627 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1628 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1629 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1630 pSrc += SRC_COLUMN_BYTES;
1631 }
1632
1633 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
1634 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
1635 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
1636 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
1637 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
1638 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
1639 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
1640 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
1641 }
1642 #endif
1643 }
1644 };
1645
1646 //////////////////////////////////////////////////////////////////////////
1647 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1648 //////////////////////////////////////////////////////////////////////////
1649 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1650 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1651 {
1652 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1653 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1654
1655 //////////////////////////////////////////////////////////////////////////
1656 /// @brief Stores an 8x8 raster tile to the destination surface.
1657 /// @param pSrc - Pointer to raster tile.
1658 /// @param pDstSurface - Destination surface state
1659 /// @param x, y - Coordinates to raster tile.
1660 INLINE static void Store(
1661 uint8_t *pSrc,
1662 SWR_SURFACE_STATE* pDstSurface,
1663 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1664 {
1665 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1666
1667 // Punt non-full tiles to generic store
1668 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1669 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1670
1671 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1672 {
1673 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1674 }
1675
1676 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1677 // We can compute the offsets to each column within the raster tile once and increment from these.
1678 #if USE_8x2_TILE_BACKEND
1679 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1680 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1681 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1682
1683 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1684
1685 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1686 uint8_t *ppDsts[] =
1687 {
1688 pDst,
1689 pDst + DestRowWidthBytes,
1690 pDst + DestRowWidthBytes / 4,
1691 pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1692 };
1693
1694 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1695 {
1696 // Raster tile width is same as simd16 tile width
1697 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1698
1699 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1700
1701 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1702
1703 ppDsts[0] += dy;
1704 ppDsts[1] += dy;
1705 ppDsts[2] += dy;
1706 ppDsts[3] += dy;
1707 }
1708 #else
1709 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1710 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1711 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1712
1713 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1714 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1715
1716 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1717 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1718 {
1719 uint32_t rowOffset = row * DestRowWidthBytes;
1720
1721 uint8_t* pRow = pCol0 + rowOffset;
1722 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1723
1724 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1725 pSrc += pSrcInc;
1726
1727 ppDsts[0] += DestRowWidthBytes / 4;
1728 ppDsts[1] += DestRowWidthBytes / 4;
1729
1730 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1731 pSrc += pSrcInc;
1732 }
1733 #endif
1734 }
1735 };
1736
1737 //////////////////////////////////////////////////////////////////////////
1738 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1739 //////////////////////////////////////////////////////////////////////////
1740 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1741 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1742 {
1743 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1744 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1745
1746 //////////////////////////////////////////////////////////////////////////
1747 /// @brief Stores an 8x8 raster tile to the destination surface.
1748 /// @param pSrc - Pointer to raster tile.
1749 /// @param pDstSurface - Destination surface state
1750 /// @param x, y - Coordinates to raster tile.
1751 INLINE static void Store(
1752 uint8_t *pSrc,
1753 SWR_SURFACE_STATE* pDstSurface,
1754 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1755 {
1756 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1757
1758 // Punt non-full tiles to generic store
1759 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1760 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1761
1762 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1763 {
1764 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1765 }
1766
1767 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1768 // We can compute the offsets to each column within the raster tile once and increment from these.
1769 #if USE_8x2_TILE_BACKEND
1770 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1771 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1772 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1773
1774 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1775
1776 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1777 uint8_t *ppDsts[] =
1778 {
1779 pDst,
1780 pDst + DestRowWidthBytes,
1781 pDst + DestRowWidthBytes / 2,
1782 pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1783 };
1784
1785 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1786 {
1787 // Raster tile width is same as simd16 tile width
1788 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1789
1790 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1791
1792 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1793
1794 ppDsts[0] += dy;
1795 ppDsts[1] += dy;
1796 ppDsts[2] += dy;
1797 ppDsts[3] += dy;
1798 }
1799 #else
1800 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1801 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1802 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1803
1804 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1805 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1806
1807 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1808 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1809 {
1810 uint32_t rowOffset = row * DestRowWidthBytes;
1811
1812 uint8_t* pRow = pCol0 + rowOffset;
1813 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1814
1815 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1816 pSrc += pSrcInc;
1817
1818 ppDsts[0] += DestRowWidthBytes / 2;
1819 ppDsts[1] += DestRowWidthBytes / 2;
1820
1821 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1822 pSrc += pSrcInc;
1823 }
1824 #endif
1825 }
1826 };
1827
1828 //////////////////////////////////////////////////////////////////////////
1829 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1830 //////////////////////////////////////////////////////////////////////////
1831 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1832 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1833 {
1834 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1835 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1836 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1837
1838 //////////////////////////////////////////////////////////////////////////
1839 /// @brief Stores an 8x8 raster tile to the destination surface.
1840 /// @param pSrc - Pointer to raster tile.
1841 /// @param pDstSurface - Destination surface state
1842 /// @param x, y - Coordinates to raster tile.
1843 INLINE static void Store(
1844 uint8_t *pSrc,
1845 SWR_SURFACE_STATE* pDstSurface,
1846 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1847 {
1848 static const uint32_t DestRowWidthBytes = 512; // 512B rows
1849
1850 // Punt non-full tiles to generic store
1851 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1852 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1853
1854 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1855 {
1856 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1857 }
1858
1859 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1860 // We can compute the offsets to each column within the raster tile once and increment from these.
1861 #if USE_8x2_TILE_BACKEND
1862 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1863 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1864
1865 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1866 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1867
1868 uint8_t* ppDsts[] =
1869 {
1870 pDst, // row 0, col 0
1871 pDst + DestRowWidthBytes, // row 1, col 0
1872 pDst + dx / 2, // row 0, col 1
1873 pDst + DestRowWidthBytes + dx / 2 // row 1, col 1
1874 };
1875
1876 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1877 {
1878 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1879 {
1880 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1881
1882 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1883
1884 ppDsts[0] += dx;
1885 ppDsts[1] += dx;
1886 ppDsts[2] += dx;
1887 ppDsts[3] += dx;
1888 }
1889
1890 ppDsts[0] += dy;
1891 ppDsts[1] += dy;
1892 ppDsts[2] += dy;
1893 ppDsts[3] += dy;
1894 }
1895 #else
1896 uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1897 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1898 uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
1899
1900 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1901 {
1902 for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
1903 {
1904 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
1905
1906 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
1907 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1908
1909 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1910 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1911 }
1912
1913 pRow0 += (DestRowWidthBytes * 2);
1914 pRow1 += (DestRowWidthBytes * 2);
1915 }
1916 #endif
1917 }
1918 };
1919
1920 //////////////////////////////////////////////////////////////////////////
1921 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1922 //////////////////////////////////////////////////////////////////////////
1923 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1924 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1925 {
1926 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1927 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1928
1929 //////////////////////////////////////////////////////////////////////////
1930 /// @brief Stores an 8x8 raster tile to the destination surface.
1931 /// @param pSrc - Pointer to raster tile.
1932 /// @param pDstSurface - Destination surface state
1933 /// @param x, y - Coordinates to raster tile.
1934 INLINE static void Store(
1935 uint8_t *pSrc,
1936 SWR_SURFACE_STATE* pDstSurface,
1937 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1938 {
1939 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1940 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1941
1942 // Punt non-full tiles to generic store
1943 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1944 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1945
1946 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1947 {
1948 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1949 }
1950
1951 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1952 // We can compute the offsets to each column within the raster tile once and increment from these.
1953 #if USE_8x2_TILE_BACKEND
1954 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1955 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1956 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1957
1958 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1959 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1960
1961 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1962 uint8_t *ppDsts[] =
1963 {
1964 pDst, // row 0, col 0
1965 pDst + DestRowWidthBytes, // row 1, col 0
1966 pDst + DestColumnBytes, // row 0, col 1
1967 pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1
1968 };
1969
1970 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1971 {
1972 // Raster tile width is same as simd16 tile width
1973 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1974
1975 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1976
1977 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1978
1979 ppDsts[0] += dy;
1980 ppDsts[1] += dy;
1981 ppDsts[2] += dy;
1982 ppDsts[3] += dy;
1983 }
1984 #else
1985 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1986 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1987 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1988
1989 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1990 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1991
1992 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1993 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1994 {
1995 uint32_t rowOffset = row * DestRowWidthBytes;
1996
1997 uint8_t* pRow = pCol0 + rowOffset;
1998 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1999
2000 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2001 pSrc += pSrcInc;
2002
2003 ppDsts[0] += DestColumnBytes;
2004 ppDsts[1] += DestColumnBytes;
2005
2006 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2007 pSrc += pSrcInc;
2008 }
2009 #endif
2010 }
2011 };
2012
2013 //////////////////////////////////////////////////////////////////////////
2014 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
2015 //////////////////////////////////////////////////////////////////////////
2016 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2017 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
2018 {
2019 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
2020 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2021
2022 //////////////////////////////////////////////////////////////////////////
2023 /// @brief Stores an 8x8 raster tile to the destination surface.
2024 /// @param pSrc - Pointer to raster tile.
2025 /// @param pDstSurface - Destination surface state
2026 /// @param x, y - Coordinates to raster tile.
2027 INLINE static void Store(
2028 uint8_t *pSrc,
2029 SWR_SURFACE_STATE* pDstSurface,
2030 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2031 {
2032 static const uint32_t DestRowWidthBytes = 16; // 16B rows
2033 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
2034
2035 // Punt non-full tiles to generic store
2036 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2037 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2038
2039 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2040 {
2041 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2042 }
2043
2044 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2045 // We can compute the offsets to each column within the raster tile once and increment from these.
2046 #if USE_8x2_TILE_BACKEND
2047 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2048 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2049 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2050
2051 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2052 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2053
2054 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2055 uint8_t *ppDsts[] =
2056 {
2057 pDst, // row 0, col 0
2058 pDst + DestRowWidthBytes, // row 1, col 0
2059 pDst + DestColumnBytes, // row 0, col 1
2060 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
2061 pDst + DestColumnBytes * 2, // row 0, col 2
2062 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2063 pDst + DestColumnBytes * 3, // row 0, col 3
2064 pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3
2065 };
2066
2067 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2068 {
2069 // Raster tile width is same as simd16 tile width
2070 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2071
2072 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2073
2074 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2075
2076 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2077 {
2078 ppDsts[i] += dy;
2079 }
2080 }
2081 #else
2082 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2083 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2084 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2085 uint8_t* pCol1 = pCol0 + DestColumnBytes;
2086
2087 // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
2088 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2089 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2090
2091 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2092 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2093 {
2094 uint32_t rowOffset = row * DestRowWidthBytes;
2095 uint8_t* ppDsts[] =
2096 {
2097 pCol0 + rowOffset,
2098 pCol0 + rowOffset + DestRowWidthBytes,
2099 pCol1 + rowOffset,
2100 pCol1 + rowOffset + DestRowWidthBytes,
2101 };
2102
2103 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2104 pSrc += pSrcInc;
2105
2106 ppDsts[0] += DestColumnBytes * 2;
2107 ppDsts[1] += DestColumnBytes * 2;
2108 ppDsts[2] += DestColumnBytes * 2;
2109 ppDsts[3] += DestColumnBytes * 2;
2110
2111 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2112 pSrc += pSrcInc;
2113 }
2114 #endif
2115 }
2116 };
2117
2118 //////////////////////////////////////////////////////////////////////////
2119 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
2120 //////////////////////////////////////////////////////////////////////////
2121 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2122 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
2123 {
2124 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
2125 #if USE_8x2_TILE_BACKEND
2126 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2127
2128 #else
2129 static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
2130 static const size_t TILE_Y_ROWS = 32;
2131 static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
2132
2133 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
2134 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2135 static const size_t MAX_DST_COLUMN_BYTES = 16;
2136
2137 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
2138 static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
2139
2140 #endif
2141 //////////////////////////////////////////////////////////////////////////
2142 /// @brief Stores an 8x8 raster tile to the destination surface.
2143 /// @param pSrc - Pointer to raster tile.
2144 /// @param pDstSurface - Destination surface state
2145 /// @param x, y - Coordinates to raster tile.
2146 INLINE static void Store(
2147 uint8_t *pSrc,
2148 SWR_SURFACE_STATE* pDstSurface,
2149 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2150 {
2151 #if USE_8x2_TILE_BACKEND
2152 static const uint32_t DestRowWidthBytes = 16; // 16B rows
2153 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
2154 #endif
2155
2156 // Punt non-full tiles to generic store
2157 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2158 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2159
2160 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2161 {
2162 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2163 }
2164
2165 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2166 // We can compute the offsets to each column within the raster tile once and increment from these.
2167 #if USE_8x2_TILE_BACKEND
2168 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2169 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2170 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2171
2172 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2173 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2174
2175 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2176 uint8_t *ppDsts[] =
2177 {
2178 pDst, // row 0, col 0
2179 pDst + DestRowWidthBytes, // row 1, col 0
2180 pDst + DestColumnBytes, // row 0, col 1
2181 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
2182 pDst + DestColumnBytes * 2, // row 0, col 2
2183 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2184 pDst + DestColumnBytes * 3, // row 0, col 3
2185 pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
2186 pDst + DestColumnBytes * 4, // row 0, col 4
2187 pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
2188 pDst + DestColumnBytes * 5, // row 0, col 5
2189 pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
2190 pDst + DestColumnBytes * 6, // row 0, col 6
2191 pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
2192 pDst + DestColumnBytes * 7, // row 0, col 7
2193 pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7
2194 };
2195
2196 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2197 {
2198 // Raster tile width is same as simd16 tile width
2199 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2200
2201 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2202
2203 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2204
2205 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2206 {
2207 ppDsts[i] += dy;
2208 }
2209 }
2210 #else
2211 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2212 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2213 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2214 struct DstPtrs
2215 {
2216 uint8_t* ppDsts[8];
2217 } ptrs;
2218
2219 // Need 8 pointers, 4 columns of 2 rows each
2220 for (uint32_t y = 0; y < 2; ++y)
2221 {
2222 for (uint32_t x = 0; x < 4; ++x)
2223 {
2224 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
2225 }
2226 }
2227
2228 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
2229 {
2230 DstPtrs startPtrs = ptrs;
2231
2232 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
2233 {
2234 // Format conversion and convert from SOA to AOS, and store the rows.
2235 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
2236
2237 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
2238 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
2239 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
2240 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
2241 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
2242 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
2243 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
2244 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
2245 pSrc += SRC_COLUMN_BYTES;
2246 }
2247
2248 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
2249 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
2250 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
2251 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
2252 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
2253 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
2254 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
2255 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
2256 }
2257 #endif
2258 }
2259 };
2260
2261 //////////////////////////////////////////////////////////////////////////
2262 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
2263 //////////////////////////////////////////////////////////////////////////
2264 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2265 struct StoreMacroTile
2266 {
2267 //////////////////////////////////////////////////////////////////////////
2268 /// @brief Stores a macrotile to the destination surface using safe implementation.
2269 /// @param pSrc - Pointer to macro tile.
2270 /// @param pDstSurface - Destination surface state
2271 /// @param x, y - Coordinates to macro tile
2272 static void StoreGeneric(
2273 uint8_t *pSrcHotTile,
2274 SWR_SURFACE_STATE* pDstSurface,
2275 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2276 {
2277 PFN_STORE_TILES_INTERNAL pfnStore;
2278 pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2279
2280 // Store each raster tile from the hot tile to the destination surface.
2281 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2282 {
2283 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2284 {
2285 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2286 {
2287 pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2288 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2289 }
2290 }
2291 }
2292
2293 }
2294
2295 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
2296 //////////////////////////////////////////////////////////////////////////
2297 /// @brief Stores a macrotile to the destination surface.
2298 /// @param pSrc - Pointer to macro tile.
2299 /// @param pDstSurface - Destination surface state
2300 /// @param x, y - Coordinates to macro tile
2301 static void Store(
2302 uint8_t *pSrcHotTile,
2303 SWR_SURFACE_STATE* pDstSurface,
2304 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2305 {
2306 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
2307
2308 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2309 {
2310 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
2311 0,
2312 0,
2313 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
2314 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
2315 sampleNum,
2316 pDstSurface->lod,
2317 pDstSurface);
2318
2319 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
2320 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
2321 (pDstSurface->bInterleavedSamples);
2322
2323 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2324 }
2325
2326 // Store each raster tile from the hot tile to the destination surface.
2327 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2328 {
2329 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2330 {
2331 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2332 {
2333 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2334 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2335 }
2336 }
2337 }
2338 }
2339 };
2340
2341 //////////////////////////////////////////////////////////////////////////
2342 /// InitStoreTilesTable - Helper for setting up the tables.
2343 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2344 void InitStoreTilesTableColor_Half1(
2345 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
2346 {
2347 table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
2348 table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
2349 table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
2350 table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
2351 table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
2352 table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
2353 table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
2354 table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
2355 table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
2356 table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
2357 table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
2358 table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
2359 table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
2360 table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
2361 table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
2362 table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
2363 table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
2364 table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
2365 table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
2366 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2367 table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
2368 table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
2369 table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
2370 table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
2371 table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
2372 table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
2373 table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
2374 table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
2375 table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
2376 table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
2377 table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
2378 table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
2379 table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
2380 table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
2381 table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
2382 table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
2383 table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
2384 table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
2385 table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
2386 table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
2387 table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
2388 table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
2389 table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
2390 table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
2391 table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
2392 table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
2393 table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
2394 table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
2395 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
2396 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
2397 table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
2398 table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
2399 table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
2400 table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
2401 table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
2402 table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
2403 }
2404
2405 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2406 void InitStoreTilesTableColor_Half2(
2407 PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
2408 {
2409 table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
2410 table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
2411 table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
2412 table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
2413 table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
2414 table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
2415 table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
2416 table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
2417 table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
2418 table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
2419 table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
2420 table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
2421 table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
2422 table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
2423 table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
2424 table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
2425 table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
2426 table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
2427 table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
2428 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
2429 table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
2430 table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
2431 table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
2432 table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
2433 table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
2434 table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
2435 table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
2436 table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
2437 table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
2438 table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
2439 table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
2440 table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
2441 table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
2442 table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
2443 table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
2444 table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
2445 table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
2446 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
2447 table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
2448 table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
2449 table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
2450 table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
2451 table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
2452 table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
2453 table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
2454 table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
2455 table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
2456 table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
2457 table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
2458 table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2459 table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2460 table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2461 table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2462 table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2463 table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2464 table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2465 table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2466 table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2467 table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2468 table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2469 table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2470 table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2471 table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2472 table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2473 }
2474
2475 //////////////////////////////////////////////////////////////////////////
2476 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2477 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2478 void InitStoreTilesTableDepth(
2479 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2480 {
2481 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2482 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2483 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2484 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2485 }
2486
2487 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2488 void InitStoreTilesTableStencil(
2489 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2490 {
2491 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2492 }