swr: [rasterizer core/memory] Move intrinsics to _simd functions
[mesa.git] / src / gallium / drivers / swr / rasterizer / memory / StoreTile.h
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.h
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
35
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "core/multisample.h"
39
40 #include <array>
41 #include <sstream>
42
43 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
44 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
45
46 //////////////////////////////////////////////////////////////////////////
47 /// Store Raster Tile Function Tables.
48 //////////////////////////////////////////////////////////////////////////
49 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
50 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
51 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
52
53 void InitStoreTilesTable_Linear_1();
54 void InitStoreTilesTable_Linear_2();
55 void InitStoreTilesTable_TileX_1();
56 void InitStoreTilesTable_TileX_2();
57 void InitStoreTilesTable_TileY_1();
58 void InitStoreTilesTable_TileY_2();
59 void InitStoreTilesTable_TileW();
60 void InitStoreTilesTable();
61
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for writing one converted (AOS) SIMD raster
///        tile out through an array of destination pointers. Only the
///        explicit specializations are usable: the generic Store is
///        deleted, so calling it for an unsupported
///        <PixelSize, NumDests> pair is a compile-time error.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of destination pointers; how many bytes each
///        pointer receives is defined by the chosen specialization.
/// @tparam PixelSize - Pixel size selector; the converters in this file
///        pass FormatTraits<...>::bpp.
/// @tparam NumDests - Number of destination pointers.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
76
77 //////////////////////////////////////////////////////////////////////////
78 /// StorePixels (32-bit pixel specialization)
79 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
80 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
81 /// @param ppDsts - Array of destination pointers. Each pointer is
82 /// to a single row of at most 16B.
83 /// @tparam NumDests - Number of destination pointers. Each pair of
84 /// pointers is for a 16-byte column of two rows.
85 //////////////////////////////////////////////////////////////////////////
86 template <>
87 struct StorePixels<8, 2>
88 {
89 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
90 {
91 // Each 4-pixel row is 4 bytes.
92 const uint16_t* pPixSrc = (const uint16_t*)pSrc;
93
94 // Unswizzle from SWR-Z order
95 uint16_t* pRow = (uint16_t*)ppDsts[0];
96 pRow[0] = pPixSrc[0];
97 pRow[1] = pPixSrc[2];
98
99 pRow = (uint16_t*)ppDsts[1];
100 pRow[0] = pPixSrc[1];
101 pRow[1] = pPixSrc[3];
102 }
103 };
104
105 #if USE_8x2_TILE_BACKEND
106 template <>
107 struct StorePixels<8, 4>
108 {
109 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
110 {
111 // 8 x 2 bytes = 16 bytes, 16 pixels
112 const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
113
114 uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
115
116 // Unswizzle from SWR-Z order
117 ppDsts16[0][0] = pSrc16[0]; // 0 1
118 ppDsts16[0][1] = pSrc16[2]; // 4 5
119
120 ppDsts16[1][0] = pSrc16[1]; // 2 3
121 ppDsts16[1][1] = pSrc16[3]; // 6 7
122
123 ppDsts16[2][0] = pSrc16[4]; // 8 9
124 ppDsts16[2][1] = pSrc16[6]; // C D
125
126 ppDsts16[3][0] = pSrc16[5]; // A B
127 ppDsts16[3][1] = pSrc16[7]; // E F
128 }
129 };
130
131 #endif
132 //////////////////////////////////////////////////////////////////////////
133 /// StorePixels (32-bit pixel specialization)
134 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
135 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
136 /// @param ppDsts - Array of destination pointers. Each pointer is
137 /// to a single row of at most 16B.
138 /// @tparam NumDests - Number of destination pointers. Each pair of
139 /// pointers is for a 16-byte column of two rows.
140 //////////////////////////////////////////////////////////////////////////
141 template <>
142 struct StorePixels<16, 2>
143 {
144 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
145 {
146 // Each 4-pixel row is 8 bytes.
147 const uint32_t* pPixSrc = (const uint32_t*)pSrc;
148
149 // Unswizzle from SWR-Z order
150 uint32_t* pRow = (uint32_t*)ppDsts[0];
151 pRow[0] = pPixSrc[0];
152 pRow[1] = pPixSrc[2];
153
154 pRow = (uint32_t*)ppDsts[1];
155 pRow[0] = pPixSrc[1];
156 pRow[1] = pPixSrc[3];
157 }
158 };
159
160 #if USE_8x2_TILE_BACKEND
161 template <>
162 struct StorePixels<16, 4>
163 {
164 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
165 {
166 // 8 x 4 bytes = 32 bytes, 16 pixels
167 const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
168
169 uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
170
171 // Unswizzle from SWR-Z order
172 ppDsts32[0][0] = pSrc32[0]; // 0 1
173 ppDsts32[0][1] = pSrc32[2]; // 4 5
174
175 ppDsts32[1][0] = pSrc32[1]; // 2 3
176 ppDsts32[1][1] = pSrc32[3]; // 6 7
177
178 ppDsts32[2][0] = pSrc32[4]; // 8 9
179 ppDsts32[2][1] = pSrc32[6]; // C D
180
181 ppDsts32[3][0] = pSrc32[5]; // A B
182 ppDsts32[3][1] = pSrc32[7]; // E F
183 }
184 };
185
186 #endif
187 //////////////////////////////////////////////////////////////////////////
188 /// StorePixels (32-bit pixel specialization)
189 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
190 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
191 /// @param ppDsts - Array of destination pointers. Each pointer is
192 /// to a single row of at most 16B.
193 /// @tparam NumDests - Number of destination pointers. Each pair of
194 /// pointers is for a 16-byte column of two rows.
195 //////////////////////////////////////////////////////////////////////////
196 template <>
197 struct StorePixels<32, 2>
198 {
199 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
200 {
201 // Each 4-pixel row is 16-bytes
202 __m128i *pZRow01 = (__m128i*)pSrc;
203 __m128i vQuad00 = _mm_load_si128(pZRow01);
204 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
205
206 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
207 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
208
209 _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
210 _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
211 }
212 };
213
214 #if USE_8x2_TILE_BACKEND
215 template <>
216 struct StorePixels<32, 4>
217 {
218 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
219 {
220 // 4 x 16 bytes = 64 bytes, 16 pixels
221 const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
222
223 __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
224
225 // Unswizzle from SWR-Z order
226 __m128i quad0 = _mm_load_si128(&pSrc128[0]); // 0 1 2 3
227 __m128i quad1 = _mm_load_si128(&pSrc128[1]); // 4 5 6 7
228 __m128i quad2 = _mm_load_si128(&pSrc128[2]); // 8 9 A B
229 __m128i quad3 = _mm_load_si128(&pSrc128[3]); // C D E F
230
231 _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1)); // 0 1 4 5
232 _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1)); // 2 3 6 7
233 _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3)); // 8 9 C D
234 _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3)); // A B E F
235 }
236 };
237
238 #endif
239 //////////////////////////////////////////////////////////////////////////
240 /// StorePixels (32-bit pixel specialization)
241 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
242 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
243 /// @param ppDsts - Array of destination pointers. Each pointer is
244 /// to a single row of at most 16B.
245 /// @tparam NumDests - Number of destination pointers. Each pair of
246 /// pointers is for a 16-byte column of two rows.
247 //////////////////////////////////////////////////////////////////////////
248 template <>
249 struct StorePixels<64, 4>
250 {
251 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
252 {
253 // Each 4-pixel row is 32 bytes.
254 const __m128i* pPixSrc = (const __m128i*)pSrc;
255
256 // order of pointers match SWR-Z layout
257 __m128i** pvDsts = (__m128i**)&ppDsts[0];
258 *pvDsts[0] = pPixSrc[0];
259 *pvDsts[1] = pPixSrc[1];
260 *pvDsts[2] = pPixSrc[2];
261 *pvDsts[3] = pPixSrc[3];
262 }
263 };
264
265 #if USE_8x2_TILE_BACKEND
266 template <>
267 struct StorePixels<64, 8>
268 {
269 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
270 {
271 // 8 x 16 bytes = 128 bytes, 16 pixels
272 const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
273
274 __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
275
276 // order of pointers match SWR-Z layout
277 *ppDsts128[0] = pSrc128[0]; // 0 1
278 *ppDsts128[1] = pSrc128[1]; // 2 3
279 *ppDsts128[2] = pSrc128[2]; // 4 5
280 *ppDsts128[3] = pSrc128[3]; // 6 7
281 *ppDsts128[4] = pSrc128[4]; // 8 9
282 *ppDsts128[5] = pSrc128[5]; // A B
283 *ppDsts128[6] = pSrc128[6]; // C D
284 *ppDsts128[7] = pSrc128[7]; // E F
285 }
286 };
287
288 #endif
289 //////////////////////////////////////////////////////////////////////////
290 /// StorePixels (32-bit pixel specialization)
291 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
292 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
293 /// @param ppDsts - Array of destination pointers. Each pointer is
294 /// to a single row of at most 16B.
295 /// @tparam NumDests - Number of destination pointers. Each pair of
296 /// pointers is for a 16-byte column of two rows.
297 //////////////////////////////////////////////////////////////////////////
298 template <>
299 struct StorePixels<128, 8>
300 {
301 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
302 {
303 // Each 4-pixel row is 64 bytes.
304 const __m128i* pPixSrc = (const __m128i*)pSrc;
305
306 // Unswizzle from SWR-Z order
307 __m128i** pvDsts = (__m128i**)&ppDsts[0];
308 *pvDsts[0] = pPixSrc[0];
309 *pvDsts[1] = pPixSrc[2];
310 *pvDsts[2] = pPixSrc[1];
311 *pvDsts[3] = pPixSrc[3];
312 *pvDsts[4] = pPixSrc[4];
313 *pvDsts[5] = pPixSrc[6];
314 *pvDsts[6] = pPixSrc[5];
315 *pvDsts[7] = pPixSrc[7];
316 }
317 };
318
319 #if USE_8x2_TILE_BACKEND
320 template <>
321 struct StorePixels<128, 16>
322 {
323 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
324 {
325 // 16 x 16 bytes = 256 bytes, 16 pixels
326 const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
327
328 __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
329
330 for (uint32_t i = 0; i < 16; i += 4)
331 {
332 *ppDsts128[i + 0] = pSrc128[i + 0];
333 *ppDsts128[i + 1] = pSrc128[i + 2];
334 *ppDsts128[i + 2] = pSrc128[i + 1];
335 *ppDsts128[i + 3] = pSrc128[i + 3];
336 }
337 }
338 };
339
340 #endif
341 //////////////////////////////////////////////////////////////////////////
342 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
343 //////////////////////////////////////////////////////////////////////////
344 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
345 struct ConvertPixelsSOAtoAOS
346 {
347 //////////////////////////////////////////////////////////////////////////
348 /// @brief Converts a SIMD from the Hot Tile to the destination format
349 /// and converts from SOA to AOS.
350 /// @param pSrc - Pointer to raster tile.
351 /// @param pDst - Pointer to destination surface or deswizzling buffer.
352 template <size_t NumDests>
353 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
354 {
355 #if USE_8x2_TILE_BACKEND
356 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
357
358 OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
359 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
360
361 // Convert from SrcFormat --> DstFormat
362 simd16vector src;
363 LoadSOA<SrcFormat>(pSrc, src);
364 StoreSOA<DstFormat>(src, soaTile);
365
366 // Convert from SOA --> AOS
367 FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
368
369 #else
370 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
371
372 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
373 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
374
375 // Convert from SrcFormat --> DstFormat
376 simdvector src;
377 LoadSOA<SrcFormat>(pSrc, src);
378 StoreSOA<DstFormat>(src, soaTile);
379
380 // Convert from SOA --> AOS
381 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
382
383 #endif
384 // Store data into destination
385 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
386 }
387 };
388
389 //////////////////////////////////////////////////////////////////////////
390 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
391 /// Specialization for no format conversion
392 //////////////////////////////////////////////////////////////////////////
393 template<SWR_FORMAT Format>
394 struct ConvertPixelsSOAtoAOS<Format, Format>
395 {
396 //////////////////////////////////////////////////////////////////////////
397 /// @brief Converts a SIMD from the Hot Tile to the destination format
398 /// and converts from SOA to AOS.
399 /// @param pSrc - Pointer to raster tile.
400 /// @param pDst - Pointer to destination surface or deswizzling buffer.
401 template <size_t NumDests>
402 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
403 {
404 #if USE_8x2_TILE_BACKEND
405 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
406
407 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
408
409 // Convert from SOA --> AOS
410 FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
411
412 #else
413 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
414
415 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
416
417 // Convert from SOA --> AOS
418 FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
419
420 #endif
421 // Store data into destination
422 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
423 }
424 };
425
426 //////////////////////////////////////////////////////////////////////////
427 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
428 //////////////////////////////////////////////////////////////////////////
429 template<>
430 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
431 {
432 //////////////////////////////////////////////////////////////////////////
433 /// @brief Converts a SIMD from the Hot Tile to the destination format
434 /// and converts from SOA to AOS.
435 /// @param pSrc - Pointer to raster tile.
436 /// @param pDst - Pointer to destination surface or deswizzling buffer.
437 template <size_t NumDests>
438 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
439 {
440 #if USE_8x2_TILE_BACKEND
441 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
442 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
443
444 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
445
446 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
447
448 // Load hot-tile
449 simd16vector src, dst;
450 LoadSOA<SrcFormat>(pSrc, src);
451
452 // deswizzle
453 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
454 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
455 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
456
457 // clamp
458 dst.x = Clamp<DstFormat>(dst.x, 0);
459 dst.y = Clamp<DstFormat>(dst.y, 1);
460 dst.z = Clamp<DstFormat>(dst.z, 2);
461
462 // normalize
463 dst.x = Normalize<DstFormat>(dst.x, 0);
464 dst.y = Normalize<DstFormat>(dst.y, 1);
465 dst.z = Normalize<DstFormat>(dst.z, 2);
466
467 // pack
468 simd16scalari packed = _simd16_castps_si(dst.x);
469
470 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
471 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
472
473 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
474 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
475
476 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
477 uint32_t *pPacked = (uint32_t*)&packed;
478 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
479 for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
480 {
481 *pAosTile++ = *pPacked++;
482 }
483
484 #else
485 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
486 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
487 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
488
489 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
490
491 // Load hot-tile
492 simdvector src, dst;
493 LoadSOA<SrcFormat>(pSrc, src);
494
495 // deswizzle
496 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
497 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
498 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
499
500 // clamp
501 dst.x = Clamp<DstFormat>(dst.x, 0);
502 dst.y = Clamp<DstFormat>(dst.y, 1);
503 dst.z = Clamp<DstFormat>(dst.z, 2);
504
505 // normalize
506 dst.x = Normalize<DstFormat>(dst.x, 0);
507 dst.y = Normalize<DstFormat>(dst.y, 1);
508 dst.z = Normalize<DstFormat>(dst.z, 2);
509
510 // pack
511 simdscalari packed = _simd_castps_si(dst.x);
512 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetConstBPC(0)));
513 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetConstBPC(0) +
514 FormatTraits<DstFormat>::GetConstBPC(1)));
515
516 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
517 uint32_t *pPacked = (uint32_t*)&packed;
518 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
519 for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
520 {
521 *pAosTile++ = *pPacked++;
522 }
523
524 #endif
525 // Store data into destination
526 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
527 }
528 };
529
530 //////////////////////////////////////////////////////////////////////////
531 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
532 //////////////////////////////////////////////////////////////////////////
533 template<>
534 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
535 {
536 static const SWR_FORMAT SrcFormat = R32_FLOAT;
537 static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
538
539 //////////////////////////////////////////////////////////////////////////
540 /// @brief Converts a SIMD from the Hot Tile to the destination format
541 /// and converts from SOA to AOS.
542 /// @param pSrc - Pointer to raster tile.
543 /// @param pDst - Pointer to destination surface or deswizzling buffer.
544 template <size_t NumDests>
545 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
546 {
547 #if USE_8x2_TILE_BACKEND
548 simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
549
550 // clamp
551 const simd16scalar zero = _simd16_setzero_ps();
552 const simd16scalar ones = _simd16_set1_ps(1.0f);
553
554 comp = _simd16_max_ps(comp, zero);
555 comp = _simd16_min_ps(comp, ones);
556
557 // normalize
558 comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
559
560 simd16scalari temp = _simd16_cvtps_epi32(comp);
561
562 // swizzle
563 temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
564
565 // merge/store data into destination but don't overwrite the X8 bits
566 simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
567 simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
568
569 simd16scalari dest = _simd16_setzero_si();
570
571 dest = _simd16_insert_si(dest, destlo, 0);
572 dest = _simd16_insert_si(dest, desthi, 1);
573
574 simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
575
576 dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
577
578 _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
579 _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
580 #else
581 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
582
583 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
584 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
585
586 // Convert from SrcFormat --> DstFormat
587 simdvector src;
588 LoadSOA<SrcFormat>(pSrc, src);
589 StoreSOA<DstFormat>(src, soaTile);
590
591 // Convert from SOA --> AOS
592 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
593
594 // Store data into destination but don't overwrite the X8 bits
595 // Each 4-pixel row is 16-bytes
596 __m128i *pZRow01 = (__m128i*)aosTile;
597 __m128i vQuad00 = _mm_load_si128(pZRow01);
598 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
599
600 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
601 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
602
603 __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
604 __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
605
606 __m128i vMask = _mm_set1_epi32(0xFFFFFF);
607
608 vDst0 = _mm_andnot_si128(vMask, vDst0);
609 vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
610 vDst1 = _mm_andnot_si128(vMask, vDst1);
611 vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
612
613 _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
614 _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
615 #endif
616 }
617 };
618
619 #if USE_8x2_TILE_BACKEND
620 template<SWR_FORMAT DstFormat>
621 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
622 {
623 // swizzle rgba -> bgra while we load
624 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
625 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
626 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
627 simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
628
629 // clamp
630 const simd16scalar zero = _simd16_setzero_ps();
631 const simd16scalar ones = _simd16_set1_ps(1.0f);
632
633 comp0 = _simd16_max_ps(comp0, zero);
634 comp0 = _simd16_min_ps(comp0, ones);
635
636 comp1 = _simd16_max_ps(comp1, zero);
637 comp1 = _simd16_min_ps(comp1, ones);
638
639 comp2 = _simd16_max_ps(comp2, zero);
640 comp2 = _simd16_min_ps(comp2, ones);
641
642 comp3 = _simd16_max_ps(comp3, zero);
643 comp3 = _simd16_min_ps(comp3, ones);
644
645 // gamma-correct only rgb
646 if (FormatTraits<DstFormat>::isSRGB)
647 {
648 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
649 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
650 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
651 }
652
653 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
654 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
655 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
656 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
657 comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
658
659 // moving to 16 wide integer vector types
660 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
661 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
662 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
663 simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
664
665 // SOA to AOS conversion
666 src1 = _simd16_slli_epi32(src1, 8);
667 src2 = _simd16_slli_epi32(src2, 16);
668 src3 = _simd16_slli_epi32(src3, 24);
669
670 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
671
672 // de-swizzle conversion
673 #if 1
674 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
675 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
676
677 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
678
679 #else
680 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
681
682 #endif
683 // store 8x2 memory order:
684 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
685 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
686 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
687 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
688 }
689
690 #endif
691 template<SWR_FORMAT DstFormat>
692 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
693 {
694 static const uint32_t offset = sizeof(simdscalar);
695
696 // swizzle rgba -> bgra while we load
697 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
698 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
699 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
700 simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
701
702 // clamp
703 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
704 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
705
706 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
707 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
708
709 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
710 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
711
712 vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
713 vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
714
715 if (FormatTraits<DstFormat>::isSRGB)
716 {
717 // Gamma-correct only rgb
718 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
719 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
720 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
721 }
722
723 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
724 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
725 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
726 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
727 vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
728
729 // moving to 8 wide integer vector types
730 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
731 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
732 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
733 simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
734
735 #if KNOB_ARCH <= KNOB_ARCH_AVX
736
737 // splitting into two sets of 4 wide integer vector types
738 // because AVX doesn't have instructions to support this operation at 8 wide
739 __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
740 __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
741 __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
742 __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
743
744 __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
745 __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
746 __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
747 __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
748
749 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
750 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
751 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
752 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
753 srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
754 srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
755
756 srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
757 srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
758
759 srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
760 srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
761
762 srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
763 srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
764
765 // unpack into rows that get the tiling order correct
766 __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
767 __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
768
769 simdscalari final = _mm256_castsi128_si256(vRow00);
770 final = _mm256_insertf128_si256(final, vRow10, 1);
771
772 #else
773
774 // logic is as above, only wider
775 src1 = _mm256_slli_si256(src1, 1);
776 src2 = _mm256_slli_si256(src2, 2);
777 src3 = _mm256_slli_si256(src3, 3);
778
779 src0 = _mm256_or_si256(src0, src1);
780 src2 = _mm256_or_si256(src2, src3);
781
782 simdscalari final = _mm256_or_si256(src0, src2);
783
784 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
785 final = _mm256_permute4x64_epi64(final, 0xD8);
786 #endif
787
788 _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
789 }
790
791 #if USE_8x2_TILE_BACKEND
792 template<SWR_FORMAT DstFormat>
793 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
794 {
795 // swizzle rgba -> bgra while we load
796 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
797 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
798 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
799
800 // clamp
801 const simd16scalar zero = _simd16_setzero_ps();
802 const simd16scalar ones = _simd16_set1_ps(1.0f);
803
804 comp0 = _simd16_max_ps(comp0, zero);
805 comp0 = _simd16_min_ps(comp0, ones);
806
807 comp1 = _simd16_max_ps(comp1, zero);
808 comp1 = _simd16_min_ps(comp1, ones);
809
810 comp2 = _simd16_max_ps(comp2, zero);
811 comp2 = _simd16_min_ps(comp2, ones);
812
813 // gamma-correct only rgb
814 if (FormatTraits<DstFormat>::isSRGB)
815 {
816 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
817 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
818 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
819 }
820
821 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
822 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
823 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
824 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
825
826 // moving to 16 wide integer vector types
827 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
828 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
829 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
830
831 // SOA to AOS conversion
832 src1 = _simd16_slli_epi32(src1, 8);
833 src2 = _simd16_slli_epi32(src2, 16);
834
835 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
836
837 // de-swizzle conversion
838 #if 1
839 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
840 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
841
842 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
843
844 #else
845 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
846
847 #endif
848 // store 8x2 memory order:
849 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
850 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
851 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
852 _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
853 }
854
855 #endif
856 template<SWR_FORMAT DstFormat>
857 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
858 {
859 static const uint32_t offset = sizeof(simdscalar);
860
861 // swizzle rgba -> bgra while we load
862 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
863 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
864 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
865 // clamp
866 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
867 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
868
869 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
870 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
871
872 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
873 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
874
875 if (FormatTraits<DstFormat>::isSRGB)
876 {
877 // Gamma-correct only rgb
878 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
879 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
880 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
881 }
882
883 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
884 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
885 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
886 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
887
888 // moving to 8 wide integer vector types
889 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
890 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
891 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
892
893 #if KNOB_ARCH <= KNOB_ARCH_AVX
894
895 // splitting into two sets of 4 wide integer vector types
896 // because AVX doesn't have instructions to support this operation at 8 wide
897 __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
898 __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
899 __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
900
901 __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
902 __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
903 __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
904
905 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
906 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
907 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
908 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
909
910 srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
911
912 srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
913
914 srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
915 srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
916
917 // unpack into rows that get the tiling order correct
918 __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
919 __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
920
921 simdscalari final = _mm256_castsi128_si256(vRow00);
922 final = _mm256_insertf128_si256(final, vRow10, 1);
923
924 #else
925
926 // logic is as above, only wider
927 src1 = _mm256_slli_si256(src1, 1);
928 src2 = _mm256_slli_si256(src2, 2);
929
930 src0 = _mm256_or_si256(src0, src1);
931
932 simdscalari final = _mm256_or_si256(src0, src2);
933
934 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
935 final = _mm256_permute4x64_epi64(final, 0xD8);
936
937 #endif
938
939 _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
940 }
941
942 template<>
943 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
944 {
945 template <size_t NumDests>
946 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
947 {
948 #if USE_8x2_TILE_BACKEND
949 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
950 #else
951 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
952 #endif
953 }
954 };
955
956 template<>
957 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
958 {
959 template <size_t NumDests>
960 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
961 {
962 #if USE_8x2_TILE_BACKEND
963 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
964 #else
965 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
966 #endif
967 }
968 };
969
970 template<>
971 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
972 {
973 template <size_t NumDests>
974 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
975 {
976 #if USE_8x2_TILE_BACKEND
977 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
978 #else
979 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
980 #endif
981 }
982 };
983
984 template<>
985 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
986 {
987 template <size_t NumDests>
988 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
989 {
990 #if USE_8x2_TILE_BACKEND
991 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
992 #else
993 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
994 #endif
995 }
996 };
997
998 template<>
999 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
1000 {
1001 template <size_t NumDests>
1002 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1003 {
1004 #if USE_8x2_TILE_BACKEND
1005 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1006 #else
1007 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1008 #endif
1009 }
1010 };
1011
1012 template<>
1013 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
1014 {
1015 template <size_t NumDests>
1016 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1017 {
1018 #if USE_8x2_TILE_BACKEND
1019 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1020 #else
1021 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1022 #endif
1023 }
1024 };
1025
1026 template<>
1027 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
1028 {
1029 template <size_t NumDests>
1030 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1031 {
1032 #if USE_8x2_TILE_BACKEND
1033 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1034 #else
1035 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1036 #endif
1037 }
1038 };
1039
1040 template<>
1041 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
1042 {
1043 template <size_t NumDests>
1044 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1045 {
1046 #if USE_8x2_TILE_BACKEND
1047 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1048 #else
1049 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1050 #endif
1051 }
1052 };
1053
1054 //////////////////////////////////////////////////////////////////////////
1055 /// StoreRasterTile
1056 //////////////////////////////////////////////////////////////////////////
1057 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1058 struct StoreRasterTile
1059 {
1060 //////////////////////////////////////////////////////////////////////////
1061 /// @brief Retrieve color from hot tile source which is always float.
1062 /// @param pSrc - Pointer to raster tile.
1063 /// @param x, y - Coordinates to raster tile.
1064 /// @param output - output color
1065 INLINE static void GetSwizzledSrcColor(
1066 uint8_t* pSrc,
1067 uint32_t x, uint32_t y,
1068 float outputColor[4])
1069 {
1070 #if USE_8x2_TILE_BACKEND
1071 typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
1072
1073 SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
1074
1075 // Compute which simd tile we're accessing within 8x8 tile.
1076 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1077 uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
1078
1079 SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
1080
1081 uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
1082
1083 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1084 #else
1085 typedef SimdTile<SrcFormat, DstFormat> SimdT;
1086
1087 SimdT* pSrcSimdTiles = (SimdT*)pSrc;
1088
1089 // Compute which simd tile we're accessing within 8x8 tile.
1090 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1091 uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
1092
1093 SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
1094
1095 uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
1096
1097 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1098 #endif
1099 }
1100
1101 //////////////////////////////////////////////////////////////////////////
1102 /// @brief Stores an 8x8 raster tile to the destination surface.
1103 /// @param pSrc - Pointer to raster tile.
1104 /// @param pDstSurface - Destination surface state
1105 /// @param x, y - Coordinates to raster tile.
1106 INLINE static void Store(
1107 uint8_t *pSrc,
1108 SWR_SURFACE_STATE* pDstSurface,
1109 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
1110 {
1111 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1112 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1113
1114 // For each raster tile pixel (rx, ry)
1115 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
1116 {
1117 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
1118 {
1119 // Perform bounds checking.
1120 if (((x + rx) < lodWidth) &&
1121 ((y + ry) < lodHeight))
1122 {
1123 float srcColor[4];
1124 GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
1125
1126 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1127 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
1128 sampleNum, pDstSurface->lod, pDstSurface);
1129 {
1130 ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
1131 }
1132 }
1133 }
1134 }
1135 }
1136 };
1137
1138 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1139 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
1140 {};
1141
1142 //////////////////////////////////////////////////////////////////////////
1143 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1144 //////////////////////////////////////////////////////////////////////////
1145 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1146 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1147 {
1148 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1149 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1150 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1151
1152 //////////////////////////////////////////////////////////////////////////
1153 /// @brief Stores an 8x8 raster tile to the destination surface.
1154 /// @param pSrc - Pointer to raster tile.
1155 /// @param pDstSurface - Destination surface state
1156 /// @param x, y - Coordinates to raster tile.
1157 INLINE static void Store(
1158 uint8_t *pSrc,
1159 SWR_SURFACE_STATE* pDstSurface,
1160 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1161 {
1162 // Punt non-full tiles to generic store
1163 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1164 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1165
1166 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1167 {
1168 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1169 }
1170
1171 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1172 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1173 #if USE_8x2_TILE_BACKEND
1174
1175 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1176 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1177
1178 uint8_t* ppDsts[] =
1179 {
1180 pDst, // row 0, col 0
1181 pDst + pDstSurface->pitch, // row 1, col 0
1182 pDst + dx / 2, // row 0, col 1
1183 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1184 };
1185
1186 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1187 {
1188 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1189 {
1190 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1191
1192 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1193
1194 ppDsts[0] += dx;
1195 ppDsts[1] += dx;
1196 ppDsts[2] += dx;
1197 ppDsts[3] += dx;
1198 }
1199
1200 ppDsts[0] += dy;
1201 ppDsts[1] += dy;
1202 ppDsts[2] += dy;
1203 ppDsts[3] += dy;
1204 }
1205 #else
1206 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1207
1208 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1209 {
1210 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1211
1212 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1213 {
1214 // Format conversion and convert from SOA to AOS, and store the rows.
1215 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1216
1217 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1218 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1219 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1220 }
1221
1222 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1223 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1224 }
1225 #endif
1226 }
1227 };
1228
1229 //////////////////////////////////////////////////////////////////////////
1230 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1231 //////////////////////////////////////////////////////////////////////////
1232 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1233 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1234 {
1235 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1236 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1237 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1238
1239 //////////////////////////////////////////////////////////////////////////
1240 /// @brief Stores an 8x8 raster tile to the destination surface.
1241 /// @param pSrc - Pointer to raster tile.
1242 /// @param pDstSurface - Destination surface state
1243 /// @param x, y - Coordinates to raster tile.
1244 INLINE static void Store(
1245 uint8_t *pSrc,
1246 SWR_SURFACE_STATE* pDstSurface,
1247 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1248 {
1249 // Punt non-full tiles to generic store
1250 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1251 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1252
1253 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1254 {
1255 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1256 }
1257
1258 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1259 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1260 #if USE_8x2_TILE_BACKEND
1261
1262 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1263 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1264
1265 uint8_t* ppDsts[] =
1266 {
1267 pDst, // row 0, col 0
1268 pDst + pDstSurface->pitch, // row 1, col 0
1269 pDst + dx / 2, // row 0, col 1
1270 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1271 };
1272
1273 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1274 {
1275 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1276 {
1277 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1278
1279 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1280
1281 ppDsts[0] += dx;
1282 ppDsts[1] += dx;
1283 ppDsts[2] += dx;
1284 ppDsts[3] += dx;
1285 }
1286
1287 ppDsts[0] += dy;
1288 ppDsts[1] += dy;
1289 ppDsts[2] += dy;
1290 ppDsts[3] += dy;
1291 }
1292 #else
1293 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1294
1295 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1296 {
1297 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1298
1299 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1300 {
1301 // Format conversion and convert from SOA to AOS, and store the rows.
1302 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1303
1304 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1305 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1306 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1307 }
1308
1309 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1310 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1311 }
1312 #endif
1313 }
1314 };
1315
1316 //////////////////////////////////////////////////////////////////////////
1317 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1318 //////////////////////////////////////////////////////////////////////////
1319 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1320 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1321 {
1322 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1323 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1324 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1325
1326 //////////////////////////////////////////////////////////////////////////
1327 /// @brief Stores an 8x8 raster tile to the destination surface.
1328 /// @param pSrc - Pointer to raster tile.
1329 /// @param pDstSurface - Destination surface state
1330 /// @param x, y - Coordinates to raster tile.
1331 INLINE static void Store(
1332 uint8_t *pSrc,
1333 SWR_SURFACE_STATE* pDstSurface,
1334 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1335 {
1336 // Punt non-full tiles to generic store
1337 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1338 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1339
1340 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1341 {
1342 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1343 }
1344
1345 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1346 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1347 #if USE_8x2_TILE_BACKEND
1348
1349 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1350 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1351
1352 uint8_t* ppDsts[] =
1353 {
1354 pDst, // row 0, col 0
1355 pDst + pDstSurface->pitch, // row 1, col 0
1356 pDst + dx / 2, // row 0, col 1
1357 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1358 };
1359
1360 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1361 {
1362 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1363 {
1364 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1365
1366 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1367
1368 ppDsts[0] += dx;
1369 ppDsts[1] += dx;
1370 ppDsts[2] += dx;
1371 ppDsts[3] += dx;
1372 }
1373
1374 ppDsts[0] += dy;
1375 ppDsts[1] += dy;
1376 ppDsts[2] += dy;
1377 ppDsts[3] += dy;
1378 }
1379 #else
1380 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1381
1382 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1383 {
1384 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1385
1386 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1387 {
1388 // Format conversion and convert from SOA to AOS, and store the rows.
1389 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1390
1391 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1392 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1393 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1394 }
1395
1396 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1397 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1398 }
1399 #endif
1400 }
1401 };
1402
1403 //////////////////////////////////////////////////////////////////////////
1404 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1405 //////////////////////////////////////////////////////////////////////////
1406 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1407 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1408 {
1409 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1410 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1411 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1412 static const size_t MAX_DST_COLUMN_BYTES = 16;
1413 #if !USE_8x2_TILE_BACKEND
1414 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1415 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1416 #endif
1417
1418 //////////////////////////////////////////////////////////////////////////
1419 /// @brief Stores an 8x8 raster tile to the destination surface.
1420 /// @param pSrc - Pointer to raster tile.
1421 /// @param pDstSurface - Destination surface state
1422 /// @param x, y - Coordinates to raster tile.
1423 INLINE static void Store(
1424 uint8_t *pSrc,
1425 SWR_SURFACE_STATE* pDstSurface,
1426 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1427 {
1428 // Punt non-full tiles to generic store
1429 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1430 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1431
1432 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1433 {
1434 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1435 }
1436
1437 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1438 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1439 #if USE_8x2_TILE_BACKEND
1440
1441 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1442 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1443
1444 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1445 static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1446
1447 uint8_t *ppDsts[] =
1448 {
1449 pDst, // row 0, col 0
1450 pDst + pDstSurface->pitch, // row 1, col 0
1451 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1452 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1453 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1454 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1455 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1456 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
1457 };
1458
1459 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1460 {
1461 // Raster tile width is same as simd16 tile width
1462 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1463
1464 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1465
1466 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1467
1468 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1469 {
1470 ppDsts[i] += dy;
1471 }
1472 }
1473 #else
1474 uint8_t* ppDsts[] =
1475 {
1476 pDst, // row 0, col 0
1477 pDst + pDstSurface->pitch, // row 1, col 0
1478 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1479 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1480 };
1481
1482 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1483 {
1484 uint8_t* ppStartRows[] =
1485 {
1486 ppDsts[0],
1487 ppDsts[1],
1488 ppDsts[2],
1489 ppDsts[3],
1490 };
1491
1492 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1493 {
1494 // Format conversion and convert from SOA to AOS, and store the rows.
1495 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1496
1497 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1498 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1499 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1500 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1501 pSrc += SRC_COLUMN_BYTES;
1502 }
1503
1504 ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1505 ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1506 ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
1507 ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
1508 }
1509 #endif
1510 }
1511 };
1512
1513 //////////////////////////////////////////////////////////////////////////
1514 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1515 //////////////////////////////////////////////////////////////////////////
1516 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1517 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1518 {
1519 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1520 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1521 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1522 static const size_t MAX_DST_COLUMN_BYTES = 16;
1523 #if !USE_8x2_TILE_BACKEND
1524 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1525 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1526 #endif
1527
1528 //////////////////////////////////////////////////////////////////////////
1529 /// @brief Stores an 8x8 raster tile to the destination surface.
1530 /// @param pSrc - Pointer to raster tile.
1531 /// @param pDstSurface - Destination surface state
1532 /// @param x, y - Coordinates to raster tile.
1533 INLINE static void Store(
1534 uint8_t *pSrc,
1535 SWR_SURFACE_STATE* pDstSurface,
1536 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1537 {
1538 // Punt non-full tiles to generic store
1539 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1540 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1541
1542 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1543 {
1544 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1545 }
1546
1547 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1548 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1549 #if USE_8x2_TILE_BACKEND
1550
1551 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1552 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1553
1554 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1555 static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1556
1557 uint8_t* ppDsts[] =
1558 {
1559 pDst, // row 0, col 0
1560 pDst + pDstSurface->pitch, // row 1, col 0
1561 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1562 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1563 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1564 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1565 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1566 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
1567 pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4
1568 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4
1569 pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5
1570 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5
1571 pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6
1572 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6
1573 pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7
1574 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7
1575 };
1576
1577 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1578 {
1579 // Raster tile width is same as simd16 tile width
1580 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1581
1582 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1583
1584 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1585
1586 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1587 {
1588 ppDsts[i] += dy;
1589 }
1590 }
1591 #else
1592 struct DstPtrs
1593 {
1594 uint8_t* ppDsts[8];
1595 } ptrs;
1596
1597 // Need 8 pointers, 4 columns of 2 rows each
1598 for (uint32_t y = 0; y < 2; ++y)
1599 {
1600 for (uint32_t x = 0; x < 4; ++x)
1601 {
1602 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1603 }
1604 }
1605
1606 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1607 {
1608 DstPtrs startPtrs = ptrs;
1609
1610 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1611 {
1612 // Format conversion and convert from SOA to AOS, and store the rows.
1613 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1614
1615 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1616 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1617 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1618 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1619 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1620 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1621 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1622 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1623 pSrc += SRC_COLUMN_BYTES;
1624 }
1625
1626 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
1627 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
1628 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
1629 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
1630 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
1631 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
1632 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
1633 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
1634 }
1635 #endif
1636 }
1637 };
1638
1639 //////////////////////////////////////////////////////////////////////////
1640 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1641 //////////////////////////////////////////////////////////////////////////
1642 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1643 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1644 {
1645 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1646 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1647
1648 //////////////////////////////////////////////////////////////////////////
1649 /// @brief Stores an 8x8 raster tile to the destination surface.
1650 /// @param pSrc - Pointer to raster tile.
1651 /// @param pDstSurface - Destination surface state
1652 /// @param x, y - Coordinates to raster tile.
1653 INLINE static void Store(
1654 uint8_t *pSrc,
1655 SWR_SURFACE_STATE* pDstSurface,
1656 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1657 {
1658 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1659
1660 // Punt non-full tiles to generic store
1661 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1662 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1663
1664 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1665 {
1666 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1667 }
1668
1669 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1670 // We can compute the offsets to each column within the raster tile once and increment from these.
1671 #if USE_8x2_TILE_BACKEND
1672 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1673 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1674 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1675
1676 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1677
1678 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1679 uint8_t *ppDsts[] =
1680 {
1681 pDst,
1682 pDst + DestRowWidthBytes,
1683 pDst + DestRowWidthBytes / 4,
1684 pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1685 };
1686
1687 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1688 {
1689 // Raster tile width is same as simd16 tile width
1690 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1691
1692 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1693
1694 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1695
1696 ppDsts[0] += dy;
1697 ppDsts[1] += dy;
1698 ppDsts[2] += dy;
1699 ppDsts[3] += dy;
1700 }
1701 #else
1702 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1703 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1704 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1705
1706 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1707 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1708
1709 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1710 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1711 {
1712 uint32_t rowOffset = row * DestRowWidthBytes;
1713
1714 uint8_t* pRow = pCol0 + rowOffset;
1715 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1716
1717 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1718 pSrc += pSrcInc;
1719
1720 ppDsts[0] += DestRowWidthBytes / 4;
1721 ppDsts[1] += DestRowWidthBytes / 4;
1722
1723 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1724 pSrc += pSrcInc;
1725 }
1726 #endif
1727 }
1728 };
1729
1730 //////////////////////////////////////////////////////////////////////////
1731 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1732 //////////////////////////////////////////////////////////////////////////
1733 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1734 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1735 {
1736 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1737 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1738
1739 //////////////////////////////////////////////////////////////////////////
1740 /// @brief Stores an 8x8 raster tile to the destination surface.
1741 /// @param pSrc - Pointer to raster tile.
1742 /// @param pDstSurface - Destination surface state
1743 /// @param x, y - Coordinates to raster tile.
1744 INLINE static void Store(
1745 uint8_t *pSrc,
1746 SWR_SURFACE_STATE* pDstSurface,
1747 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1748 {
1749 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1750
1751 // Punt non-full tiles to generic store
1752 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1753 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1754
1755 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1756 {
1757 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1758 }
1759
1760 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1761 // We can compute the offsets to each column within the raster tile once and increment from these.
1762 #if USE_8x2_TILE_BACKEND
1763 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1764 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1765 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1766
1767 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1768
1769 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1770 uint8_t *ppDsts[] =
1771 {
1772 pDst,
1773 pDst + DestRowWidthBytes,
1774 pDst + DestRowWidthBytes / 2,
1775 pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1776 };
1777
1778 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1779 {
1780 // Raster tile width is same as simd16 tile width
1781 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1782
1783 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1784
1785 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1786
1787 ppDsts[0] += dy;
1788 ppDsts[1] += dy;
1789 ppDsts[2] += dy;
1790 ppDsts[3] += dy;
1791 }
1792 #else
1793 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1794 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1795 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1796
1797 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1798 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1799
1800 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1801 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1802 {
1803 uint32_t rowOffset = row * DestRowWidthBytes;
1804
1805 uint8_t* pRow = pCol0 + rowOffset;
1806 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1807
1808 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1809 pSrc += pSrcInc;
1810
1811 ppDsts[0] += DestRowWidthBytes / 2;
1812 ppDsts[1] += DestRowWidthBytes / 2;
1813
1814 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1815 pSrc += pSrcInc;
1816 }
1817 #endif
1818 }
1819 };
1820
1821 //////////////////////////////////////////////////////////////////////////
1822 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1823 //////////////////////////////////////////////////////////////////////////
1824 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1825 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1826 {
1827 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1828 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1829 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1830
1831 //////////////////////////////////////////////////////////////////////////
1832 /// @brief Stores an 8x8 raster tile to the destination surface.
1833 /// @param pSrc - Pointer to raster tile.
1834 /// @param pDstSurface - Destination surface state
1835 /// @param x, y - Coordinates to raster tile.
1836 INLINE static void Store(
1837 uint8_t *pSrc,
1838 SWR_SURFACE_STATE* pDstSurface,
1839 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1840 {
1841 static const uint32_t DestRowWidthBytes = 512; // 512B rows
1842
1843 // Punt non-full tiles to generic store
1844 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1845 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1846
1847 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1848 {
1849 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1850 }
1851
1852 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1853 // We can compute the offsets to each column within the raster tile once and increment from these.
1854 #if USE_8x2_TILE_BACKEND
1855 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1856 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1857
1858 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1859 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1860
1861 uint8_t* ppDsts[] =
1862 {
1863 pDst, // row 0, col 0
1864 pDst + DestRowWidthBytes, // row 1, col 0
1865 pDst + dx / 2, // row 0, col 1
1866 pDst + DestRowWidthBytes + dx / 2 // row 1, col 1
1867 };
1868
1869 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1870 {
1871 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1872 {
1873 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1874
1875 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1876
1877 ppDsts[0] += dx;
1878 ppDsts[1] += dx;
1879 ppDsts[2] += dx;
1880 ppDsts[3] += dx;
1881 }
1882
1883 ppDsts[0] += dy;
1884 ppDsts[1] += dy;
1885 ppDsts[2] += dy;
1886 ppDsts[3] += dy;
1887 }
1888 #else
1889 uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1890 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1891 uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
1892
1893 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1894 {
1895 for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
1896 {
1897 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
1898
1899 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
1900 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1901
1902 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1903 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1904 }
1905
1906 pRow0 += (DestRowWidthBytes * 2);
1907 pRow1 += (DestRowWidthBytes * 2);
1908 }
1909 #endif
1910 }
1911 };
1912
1913 //////////////////////////////////////////////////////////////////////////
1914 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1915 //////////////////////////////////////////////////////////////////////////
1916 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1917 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1918 {
1919 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1920 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1921
1922 //////////////////////////////////////////////////////////////////////////
1923 /// @brief Stores an 8x8 raster tile to the destination surface.
1924 /// @param pSrc - Pointer to raster tile.
1925 /// @param pDstSurface - Destination surface state
1926 /// @param x, y - Coordinates to raster tile.
1927 INLINE static void Store(
1928 uint8_t *pSrc,
1929 SWR_SURFACE_STATE* pDstSurface,
1930 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1931 {
1932 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1933 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1934
1935 // Punt non-full tiles to generic store
1936 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1937 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1938
1939 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1940 {
1941 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1942 }
1943
1944 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1945 // We can compute the offsets to each column within the raster tile once and increment from these.
1946 #if USE_8x2_TILE_BACKEND
1947 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1948 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1949 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1950
1951 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1952 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1953
1954 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1955 uint8_t *ppDsts[] =
1956 {
1957 pDst, // row 0, col 0
1958 pDst + DestRowWidthBytes, // row 1, col 0
1959 pDst + DestColumnBytes, // row 0, col 1
1960 pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1
1961 };
1962
1963 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1964 {
1965 // Raster tile width is same as simd16 tile width
1966 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1967
1968 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1969
1970 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1971
1972 ppDsts[0] += dy;
1973 ppDsts[1] += dy;
1974 ppDsts[2] += dy;
1975 ppDsts[3] += dy;
1976 }
1977 #else
1978 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1979 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1980 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1981
1982 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1983 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1984
1985 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1986 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1987 {
1988 uint32_t rowOffset = row * DestRowWidthBytes;
1989
1990 uint8_t* pRow = pCol0 + rowOffset;
1991 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1992
1993 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1994 pSrc += pSrcInc;
1995
1996 ppDsts[0] += DestColumnBytes;
1997 ppDsts[1] += DestColumnBytes;
1998
1999 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2000 pSrc += pSrcInc;
2001 }
2002 #endif
2003 }
2004 };
2005
2006 //////////////////////////////////////////////////////////////////////////
2007 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
2008 //////////////////////////////////////////////////////////////////////////
2009 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2010 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
2011 {
2012 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
2013 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2014
2015 //////////////////////////////////////////////////////////////////////////
2016 /// @brief Stores an 8x8 raster tile to the destination surface.
2017 /// @param pSrc - Pointer to raster tile.
2018 /// @param pDstSurface - Destination surface state
2019 /// @param x, y - Coordinates to raster tile.
2020 INLINE static void Store(
2021 uint8_t *pSrc,
2022 SWR_SURFACE_STATE* pDstSurface,
2023 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2024 {
2025 static const uint32_t DestRowWidthBytes = 16; // 16B rows
2026 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
2027
2028 // Punt non-full tiles to generic store
2029 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2030 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2031
2032 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2033 {
2034 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2035 }
2036
2037 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2038 // We can compute the offsets to each column within the raster tile once and increment from these.
2039 #if USE_8x2_TILE_BACKEND
2040 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2041 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2042 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2043
2044 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2045 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2046
2047 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2048 uint8_t *ppDsts[] =
2049 {
2050 pDst, // row 0, col 0
2051 pDst + DestRowWidthBytes, // row 1, col 0
2052 pDst + DestColumnBytes, // row 0, col 1
2053 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
2054 pDst + DestColumnBytes * 2, // row 0, col 2
2055 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2056 pDst + DestColumnBytes * 3, // row 0, col 3
2057 pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3
2058 };
2059
2060 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2061 {
2062 // Raster tile width is same as simd16 tile width
2063 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2064
2065 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2066
2067 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2068
2069 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2070 {
2071 ppDsts[i] += dy;
2072 }
2073 }
2074 #else
2075 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2076 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2077 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2078 uint8_t* pCol1 = pCol0 + DestColumnBytes;
2079
2080 // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
2081 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2082 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2083
2084 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2085 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2086 {
2087 uint32_t rowOffset = row * DestRowWidthBytes;
2088 uint8_t* ppDsts[] =
2089 {
2090 pCol0 + rowOffset,
2091 pCol0 + rowOffset + DestRowWidthBytes,
2092 pCol1 + rowOffset,
2093 pCol1 + rowOffset + DestRowWidthBytes,
2094 };
2095
2096 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2097 pSrc += pSrcInc;
2098
2099 ppDsts[0] += DestColumnBytes * 2;
2100 ppDsts[1] += DestColumnBytes * 2;
2101 ppDsts[2] += DestColumnBytes * 2;
2102 ppDsts[3] += DestColumnBytes * 2;
2103
2104 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2105 pSrc += pSrcInc;
2106 }
2107 #endif
2108 }
2109 };
2110
2111 //////////////////////////////////////////////////////////////////////////
2112 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
2113 //////////////////////////////////////////////////////////////////////////
2114 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2115 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
2116 {
2117 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
2118 #if USE_8x2_TILE_BACKEND
2119 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2120
2121 #else
2122 static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
2123 static const size_t TILE_Y_ROWS = 32;
2124 static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
2125
2126 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
2127 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2128 static const size_t MAX_DST_COLUMN_BYTES = 16;
2129
2130 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
2131 static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
2132
2133 #endif
2134 //////////////////////////////////////////////////////////////////////////
2135 /// @brief Stores an 8x8 raster tile to the destination surface.
2136 /// @param pSrc - Pointer to raster tile.
2137 /// @param pDstSurface - Destination surface state
2138 /// @param x, y - Coordinates to raster tile.
2139 INLINE static void Store(
2140 uint8_t *pSrc,
2141 SWR_SURFACE_STATE* pDstSurface,
2142 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2143 {
2144 #if USE_8x2_TILE_BACKEND
2145 static const uint32_t DestRowWidthBytes = 16; // 16B rows
2146 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
2147 #endif
2148
2149 // Punt non-full tiles to generic store
2150 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2151 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2152
2153 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2154 {
2155 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2156 }
2157
2158 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2159 // We can compute the offsets to each column within the raster tile once and increment from these.
2160 #if USE_8x2_TILE_BACKEND
2161 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2162 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2163 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2164
2165 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2166 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2167
2168 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2169 uint8_t *ppDsts[] =
2170 {
2171 pDst, // row 0, col 0
2172 pDst + DestRowWidthBytes, // row 1, col 0
2173 pDst + DestColumnBytes, // row 0, col 1
2174 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
2175 pDst + DestColumnBytes * 2, // row 0, col 2
2176 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2177 pDst + DestColumnBytes * 3, // row 0, col 3
2178 pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
2179 pDst + DestColumnBytes * 4, // row 0, col 4
2180 pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
2181 pDst + DestColumnBytes * 5, // row 0, col 5
2182 pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
2183 pDst + DestColumnBytes * 6, // row 0, col 6
2184 pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
2185 pDst + DestColumnBytes * 7, // row 0, col 7
2186 pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7
2187 };
2188
2189 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2190 {
2191 // Raster tile width is same as simd16 tile width
2192 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2193
2194 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2195
2196 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2197
2198 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2199 {
2200 ppDsts[i] += dy;
2201 }
2202 }
2203 #else
2204 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2205 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2206 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2207 struct DstPtrs
2208 {
2209 uint8_t* ppDsts[8];
2210 } ptrs;
2211
2212 // Need 8 pointers, 4 columns of 2 rows each
2213 for (uint32_t y = 0; y < 2; ++y)
2214 {
2215 for (uint32_t x = 0; x < 4; ++x)
2216 {
2217 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
2218 }
2219 }
2220
2221 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
2222 {
2223 DstPtrs startPtrs = ptrs;
2224
2225 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
2226 {
2227 // Format conversion and convert from SOA to AOS, and store the rows.
2228 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
2229
2230 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
2231 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
2232 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
2233 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
2234 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
2235 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
2236 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
2237 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
2238 pSrc += SRC_COLUMN_BYTES;
2239 }
2240
2241 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
2242 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
2243 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
2244 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
2245 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
2246 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
2247 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
2248 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
2249 }
2250 #endif
2251 }
2252 };
2253
2254 //////////////////////////////////////////////////////////////////////////
2255 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
2256 //////////////////////////////////////////////////////////////////////////
2257 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2258 struct StoreMacroTile
2259 {
2260 //////////////////////////////////////////////////////////////////////////
2261 /// @brief Stores a macrotile to the destination surface using safe implementation.
2262 /// @param pSrc - Pointer to macro tile.
2263 /// @param pDstSurface - Destination surface state
2264 /// @param x, y - Coordinates to macro tile
2265 static void StoreGeneric(
2266 uint8_t *pSrcHotTile,
2267 SWR_SURFACE_STATE* pDstSurface,
2268 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2269 {
2270 PFN_STORE_TILES_INTERNAL pfnStore;
2271 pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2272
2273 // Store each raster tile from the hot tile to the destination surface.
2274 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2275 {
2276 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2277 {
2278 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2279 {
2280 pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2281 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2282 }
2283 }
2284 }
2285
2286 }
2287
2288 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
2289 //////////////////////////////////////////////////////////////////////////
2290 /// @brief Stores a macrotile to the destination surface.
2291 /// @param pSrc - Pointer to macro tile.
2292 /// @param pDstSurface - Destination surface state
2293 /// @param x, y - Coordinates to macro tile
2294 static void Store(
2295 uint8_t *pSrcHotTile,
2296 SWR_SURFACE_STATE* pDstSurface,
2297 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2298 {
2299 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
2300
2301 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2302 {
2303 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
2304 0,
2305 0,
2306 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
2307 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
2308 sampleNum,
2309 pDstSurface->lod,
2310 pDstSurface);
2311
2312 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
2313 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
2314 (pDstSurface->bInterleavedSamples);
2315
2316 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2317 }
2318
2319 // Store each raster tile from the hot tile to the destination surface.
2320 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2321 {
2322 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2323 {
2324 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2325 {
2326 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2327 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2328 }
2329 }
2330 }
2331 }
2332 };
2333
2334 //////////////////////////////////////////////////////////////////////////
2335 /// InitStoreTilesTable - Helper for setting up the tables.
2336 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2337 void InitStoreTilesTableColor_Half1(
2338 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
2339 {
2340 table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
2341 table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
2342 table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
2343 table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
2344 table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
2345 table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
2346 table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
2347 table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
2348 table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
2349 table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
2350 table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
2351 table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
2352 table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
2353 table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
2354 table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
2355 table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
2356 table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
2357 table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
2358 table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
2359 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2360 table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
2361 table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
2362 table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
2363 table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
2364 table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
2365 table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
2366 table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
2367 table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
2368 table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
2369 table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
2370 table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
2371 table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
2372 table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
2373 table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
2374 table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
2375 table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
2376 table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
2377 table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
2378 table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
2379 table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
2380 table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
2381 table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
2382 table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
2383 table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
2384 table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
2385 table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
2386 table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
2387 table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
2388 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
2389 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
2390 table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
2391 table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
2392 table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
2393 table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
2394 table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
2395 table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
2396 }
2397
2398 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2399 void InitStoreTilesTableColor_Half2(
2400 PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
2401 {
2402 table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
2403 table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
2404 table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
2405 table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
2406 table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
2407 table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
2408 table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
2409 table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
2410 table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
2411 table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
2412 table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
2413 table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
2414 table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
2415 table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
2416 table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
2417 table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
2418 table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
2419 table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
2420 table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
2421 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
2422 table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
2423 table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
2424 table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
2425 table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
2426 table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
2427 table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
2428 table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
2429 table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
2430 table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
2431 table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
2432 table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
2433 table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
2434 table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
2435 table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
2436 table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
2437 table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
2438 table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
2439 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
2440 table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
2441 table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
2442 table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
2443 table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
2444 table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
2445 table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
2446 table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
2447 table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
2448 table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
2449 table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
2450 table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
2451 table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2452 table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2453 table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2454 table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2455 table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2456 table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2457 table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2458 table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2459 table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2460 table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2461 table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2462 table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2463 table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2464 table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2465 table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2466 }
2467
2468 //////////////////////////////////////////////////////////////////////////
2469 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2470 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2471 void InitStoreTilesTableDepth(
2472 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2473 {
2474 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2475 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2476 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2477 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2478 }
2479
2480 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2481 void InitStoreTilesTableStencil(
2482 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2483 {
2484 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2485 }