512c33802702bddc9079823dada5e9cf264040b1
[mesa.git] / src / gallium / drivers / swr / rasterizer / memory / StoreTile.h
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.h
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
35
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "core/multisample.h"
39
40 #include <array>
41 #include <sstream>
42
43 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
44 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
45
46 //////////////////////////////////////////////////////////////////////////
47 /// Store Raster Tile Function Tables.
48 //////////////////////////////////////////////////////////////////////////
49 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
50 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
51 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
52
53 void InitStoreTilesTable_Linear_1();
54 void InitStoreTilesTable_Linear_2();
55 void InitStoreTilesTable_TileX_1();
56 void InitStoreTilesTable_TileX_2();
57 void InitStoreTilesTable_TileY_1();
58 void InitStoreTilesTable_TileY_2();
59 void InitStoreTilesTable_TileW();
60 void InitStoreTilesTable();
61
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for writing one AOS raster tile out through a
///        set of destination pointers. Its Store is deleted, so only the
///        explicit <PixelSize, NumDests> specializations can be called;
///        any other combination is rejected at compile time.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of destination pointers. Each pointer is
///                 to a single row of at most 16B.
/// @tparam PixelSize - Pixel size in bits (callers in this file pass
///                     FormatTraits<...>::bpp).
/// @tparam NumDests - Number of destination pointers. Each pair of
///                    pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
76
77 //////////////////////////////////////////////////////////////////////////
78 /// StorePixels (32-bit pixel specialization)
79 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
80 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
81 /// @param ppDsts - Array of destination pointers. Each pointer is
82 /// to a single row of at most 16B.
83 /// @tparam NumDests - Number of destination pointers. Each pair of
84 /// pointers is for a 16-byte column of two rows.
85 //////////////////////////////////////////////////////////////////////////
86 template <>
87 struct StorePixels<8, 2>
88 {
89 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
90 {
91 // Each 4-pixel row is 4 bytes.
92 const uint16_t* pPixSrc = (const uint16_t*)pSrc;
93
94 // Unswizzle from SWR-Z order
95 uint16_t* pRow = (uint16_t*)ppDsts[0];
96 pRow[0] = pPixSrc[0];
97 pRow[1] = pPixSrc[2];
98
99 pRow = (uint16_t*)ppDsts[1];
100 pRow[0] = pPixSrc[1];
101 pRow[1] = pPixSrc[3];
102 }
103 };
104
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (8-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel, 8bpp raster-tile through four destination
///        pointers, 4 bytes each.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of four destination pointers.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<8, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 8 x 2 bytes = 16 bytes, 16 pixels
        const uint16_t* pPairs = reinterpret_cast<const uint16_t*>(pSrc);

        // Unswizzle from SWR-Z order, one group of four 16-bit units
        // (8 pixels) per iteration. Pixel indices written:
        //   half 0: ppDsts[0] <- 0 1 4 5    ppDsts[1] <- 2 3 6 7
        //   half 1: ppDsts[2] <- 8 9 C D    ppDsts[3] <- A B E F
        for (uint32_t half = 0; half < 2; ++half)
        {
            const uint16_t* pQuad = pPairs + 4 * half;

            uint16_t* pRow0 = reinterpret_cast<uint16_t*>(ppDsts[2 * half + 0]);
            uint16_t* pRow1 = reinterpret_cast<uint16_t*>(ppDsts[2 * half + 1]);

            pRow0[0] = pQuad[0];
            pRow0[1] = pQuad[2];

            pRow1[0] = pQuad[1];
            pRow1[1] = pQuad[3];
        }
    }
};

#endif
132 //////////////////////////////////////////////////////////////////////////
133 /// StorePixels (32-bit pixel specialization)
134 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
135 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
136 /// @param ppDsts - Array of destination pointers. Each pointer is
137 /// to a single row of at most 16B.
138 /// @tparam NumDests - Number of destination pointers. Each pair of
139 /// pointers is for a 16-byte column of two rows.
140 //////////////////////////////////////////////////////////////////////////
141 template <>
142 struct StorePixels<16, 2>
143 {
144 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
145 {
146 // Each 4-pixel row is 8 bytes.
147 const uint32_t* pPixSrc = (const uint32_t*)pSrc;
148
149 // Unswizzle from SWR-Z order
150 uint32_t* pRow = (uint32_t*)ppDsts[0];
151 pRow[0] = pPixSrc[0];
152 pRow[1] = pPixSrc[2];
153
154 pRow = (uint32_t*)ppDsts[1];
155 pRow[0] = pPixSrc[1];
156 pRow[1] = pPixSrc[3];
157 }
158 };
159
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (16-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel, 16bpp raster-tile through four destination
///        pointers, 8 bytes each.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of four destination pointers.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<16, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 8 x 4 bytes = 32 bytes, 16 pixels
        const uint32_t* pPairs = reinterpret_cast<const uint32_t*>(pSrc);

        // Unswizzle from SWR-Z order, one group of four 32-bit units
        // (8 pixels) per iteration. Pixel indices written:
        //   half 0: ppDsts[0] <- 0 1 4 5    ppDsts[1] <- 2 3 6 7
        //   half 1: ppDsts[2] <- 8 9 C D    ppDsts[3] <- A B E F
        for (uint32_t half = 0; half < 2; ++half)
        {
            const uint32_t* pQuad = pPairs + 4 * half;

            uint32_t* pRow0 = reinterpret_cast<uint32_t*>(ppDsts[2 * half + 0]);
            uint32_t* pRow1 = reinterpret_cast<uint32_t*>(ppDsts[2 * half + 1]);

            pRow0[0] = pQuad[0];
            pRow0[1] = pQuad[2];

            pRow1[0] = pQuad[1];
            pRow1[1] = pQuad[3];
        }
    }
};

#endif
187 //////////////////////////////////////////////////////////////////////////
188 /// StorePixels (32-bit pixel specialization)
189 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
190 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
191 /// @param ppDsts - Array of destination pointers. Each pointer is
192 /// to a single row of at most 16B.
193 /// @tparam NumDests - Number of destination pointers. Each pair of
194 /// pointers is for a 16-byte column of two rows.
195 //////////////////////////////////////////////////////////////////////////
196 template <>
197 struct StorePixels<32, 2>
198 {
199 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
200 {
201 // Each 4-pixel row is 16-bytes
202 simd4scalari *pZRow01 = (simd4scalari*)pSrc;
203 simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
204 simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
205
206 simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
207 simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
208
209 SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
210 SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
211 }
212 };
213
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel, 32bpp raster-tile through four destination
///        pointers, 16 bytes each (unaligned stores).
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order; read
///               with aligned 16-byte loads.
/// @param ppDsts - Array of four destination pointers.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<32, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 4 x 16 bytes = 64 bytes, 16 pixels
        const simd4scalari* pQuads = reinterpret_cast<const simd4scalari*>(pSrc);

        // Read the whole tile first: quads hold pixels 0-3, 4-7, 8-B, C-F.
        simd4scalari quad[4];
        for (uint32_t i = 0; i < 4; ++i)
        {
            quad[i] = SIMD128::load_si(&pQuads[i]);
        }

        // Unswizzle from SWR-Z order. Pixel indices written:
        //   ppDsts[0] <- 0 1 4 5    ppDsts[1] <- 2 3 6 7
        //   ppDsts[2] <- 8 9 C D    ppDsts[3] <- A B E F
        for (uint32_t half = 0; half < 2; ++half)
        {
            const simd4scalari& first  = quad[2 * half + 0];
            const simd4scalari& second = quad[2 * half + 1];

            SIMD128::storeu_si(reinterpret_cast<simd4scalari*>(ppDsts[2 * half + 0]), SIMD128::unpacklo_epi64(first, second));
            SIMD128::storeu_si(reinterpret_cast<simd4scalari*>(ppDsts[2 * half + 1]), SIMD128::unpackhi_epi64(first, second));
        }
    }
};

#endif
239 //////////////////////////////////////////////////////////////////////////
240 /// StorePixels (32-bit pixel specialization)
241 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
242 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
243 /// @param ppDsts - Array of destination pointers. Each pointer is
244 /// to a single row of at most 16B.
245 /// @tparam NumDests - Number of destination pointers. Each pair of
246 /// pointers is for a 16-byte column of two rows.
247 //////////////////////////////////////////////////////////////////////////
248 template <>
249 struct StorePixels<64, 4>
250 {
251 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
252 {
253 // Each 4-pixel row is 32 bytes.
254 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
255
256 // order of pointers match SWR-Z layout
257 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
258 *pvDsts[0] = pPixSrc[0];
259 *pvDsts[1] = pPixSrc[1];
260 *pvDsts[2] = pPixSrc[2];
261 *pvDsts[3] = pPixSrc[3];
262 }
263 };
264
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (64-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel, 64bpp raster-tile through eight destination
///        pointers, 16 bytes (two pixels) each.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order; read
///               with aligned 16-byte loads.
/// @param ppDsts - Array of eight destination pointers. Their order
///                 already matches the SWR-Z layout, so 16-byte chunk i
///                 of the source goes to ppDsts[i].
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<64, 8>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    {
        // 8 x 16 bytes = 128 bytes, 16 pixels
        const simd4scalari* pChunks = reinterpret_cast<const simd4scalari*>(pSrc);

        // order of pointers match SWR-Z layout:
        // chunk i carries pixels (2i, 2i+1) and is written to ppDsts[i].
        // Nothing visible here guarantees that the destination pointers are
        // 16-byte aligned, so write with explicit unaligned stores (as the
        // 32bpp specializations do) instead of assigning through a
        // dereferenced vector pointer, which implies an aligned store.
        for (uint32_t i = 0; i < 8; ++i)
        {
            SIMD128::storeu_si(reinterpret_cast<simd4scalari*>(ppDsts[i]), SIMD128::load_si(&pChunks[i]));
        }
    }
};

#endif
289 //////////////////////////////////////////////////////////////////////////
290 /// StorePixels (32-bit pixel specialization)
291 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
292 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
293 /// @param ppDsts - Array of destination pointers. Each pointer is
294 /// to a single row of at most 16B.
295 /// @tparam NumDests - Number of destination pointers. Each pair of
296 /// pointers is for a 16-byte column of two rows.
297 //////////////////////////////////////////////////////////////////////////
298 template <>
299 struct StorePixels<128, 8>
300 {
301 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
302 {
303 // Each 4-pixel row is 64 bytes.
304 const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
305
306 // Unswizzle from SWR-Z order
307 simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
308 *pvDsts[0] = pPixSrc[0];
309 *pvDsts[1] = pPixSrc[2];
310 *pvDsts[2] = pPixSrc[1];
311 *pvDsts[3] = pPixSrc[3];
312 *pvDsts[4] = pPixSrc[4];
313 *pvDsts[5] = pPixSrc[6];
314 *pvDsts[6] = pPixSrc[5];
315 *pvDsts[7] = pPixSrc[7];
316 }
317 };
318
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (128-bit pixel specialization, 8x2 tile backend)
/// @brief Stores a 16-pixel, 128bpp raster-tile through sixteen
///        destination pointers, one 16-byte pixel each.
/// @param pSrc - Pointer to source raster tile in SWRZ pixel order; read
///               with aligned 16-byte loads.
/// @param ppDsts - Array of sixteen destination pointers. Within every
///                 group of four, source pixels 1 and 2 are exchanged
///                 on the way out.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<128, 16>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
    {
        // 16 x 16 bytes = 256 bytes, 16 pixels
        const simd4scalari* pPixels = reinterpret_cast<const simd4scalari*>(pSrc);

        // Per group of four the source order 0 1 2 3 is written as 0 2 1 3.
        // Nothing visible here guarantees that the destination pointers are
        // 16-byte aligned, so write with explicit unaligned stores (as the
        // 32bpp specializations do) instead of assigning through a
        // dereferenced vector pointer, which implies an aligned store.
        for (uint32_t i = 0; i < 16; i += 4)
        {
            SIMD128::storeu_si(reinterpret_cast<simd4scalari*>(ppDsts[i + 0]), SIMD128::load_si(&pPixels[i + 0]));
            SIMD128::storeu_si(reinterpret_cast<simd4scalari*>(ppDsts[i + 1]), SIMD128::load_si(&pPixels[i + 2]));
            SIMD128::storeu_si(reinterpret_cast<simd4scalari*>(ppDsts[i + 2]), SIMD128::load_si(&pPixels[i + 1]));
            SIMD128::storeu_si(reinterpret_cast<simd4scalari*>(ppDsts[i + 3]), SIMD128::load_si(&pPixels[i + 3]));
        }
    }
};

#endif
341 //////////////////////////////////////////////////////////////////////////
342 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
343 //////////////////////////////////////////////////////////////////////////
344 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
345 struct ConvertPixelsSOAtoAOS
346 {
347 //////////////////////////////////////////////////////////////////////////
348 /// @brief Converts a SIMD from the Hot Tile to the destination format
349 /// and converts from SOA to AOS.
350 /// @param pSrc - Pointer to raster tile.
351 /// @param pDst - Pointer to destination surface or deswizzling buffer.
352 template <size_t NumDests>
353 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
354 {
355 #if USE_8x2_TILE_BACKEND
356 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
357
358 OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
359 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
360
361 // Convert from SrcFormat --> DstFormat
362 simd16vector src;
363 LoadSOA<SrcFormat>(pSrc, src);
364 StoreSOA<DstFormat>(src, soaTile);
365
366 // Convert from SOA --> AOS
367 FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
368
369 #else
370 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
371
372 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
373 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
374
375 // Convert from SrcFormat --> DstFormat
376 simdvector src;
377 LoadSOA<SrcFormat>(pSrc, src);
378 StoreSOA<DstFormat>(src, soaTile);
379
380 // Convert from SOA --> AOS
381 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
382
383 #endif
384 // Store data into destination
385 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
386 }
387 };
388
389 //////////////////////////////////////////////////////////////////////////
390 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
391 /// Specialization for no format conversion
392 //////////////////////////////////////////////////////////////////////////
393 template<SWR_FORMAT Format>
394 struct ConvertPixelsSOAtoAOS<Format, Format>
395 {
396 //////////////////////////////////////////////////////////////////////////
397 /// @brief Converts a SIMD from the Hot Tile to the destination format
398 /// and converts from SOA to AOS.
399 /// @param pSrc - Pointer to raster tile.
400 /// @param pDst - Pointer to destination surface or deswizzling buffer.
401 template <size_t NumDests>
402 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
403 {
404 #if USE_8x2_TILE_BACKEND
405 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
406
407 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
408
409 // Convert from SOA --> AOS
410 FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
411
412 #else
413 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
414
415 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
416
417 // Convert from SOA --> AOS
418 FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
419
420 #endif
421 // Store data into destination
422 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
423 }
424 };
425
426 //////////////////////////////////////////////////////////////////////////
427 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
428 //////////////////////////////////////////////////////////////////////////
429 template<>
430 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
431 {
432 //////////////////////////////////////////////////////////////////////////
433 /// @brief Converts a SIMD from the Hot Tile to the destination format
434 /// and converts from SOA to AOS.
435 /// @param pSrc - Pointer to raster tile.
436 /// @param pDst - Pointer to destination surface or deswizzling buffer.
437 template <size_t NumDests>
438 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
439 {
440 #if USE_8x2_TILE_BACKEND
441 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
442 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
443
444 static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
445
446 OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
447
448 // Load hot-tile
449 simd16vector src, dst;
450 LoadSOA<SrcFormat>(pSrc, src);
451
452 // deswizzle
453 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
454 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
455 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
456
457 // clamp
458 dst.x = Clamp<DstFormat>(dst.x, 0);
459 dst.y = Clamp<DstFormat>(dst.y, 1);
460 dst.z = Clamp<DstFormat>(dst.z, 2);
461
462 // normalize
463 dst.x = Normalize<DstFormat>(dst.x, 0);
464 dst.y = Normalize<DstFormat>(dst.y, 1);
465 dst.z = Normalize<DstFormat>(dst.z, 2);
466
467 // pack
468 simd16scalari packed = _simd16_castps_si(dst.x);
469
470 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
471 SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
472
473 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
474 packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
475
476 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
477 uint32_t *pPacked = (uint32_t*)&packed;
478 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
479 for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
480 {
481 *pAosTile++ = *pPacked++;
482 }
483
484 #else
485 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
486 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
487 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
488
489 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
490
491 // Load hot-tile
492 simdvector src, dst;
493 LoadSOA<SrcFormat>(pSrc, src);
494
495 // deswizzle
496 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
497 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
498 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
499
500 // clamp
501 dst.x = Clamp<DstFormat>(dst.x, 0);
502 dst.y = Clamp<DstFormat>(dst.y, 1);
503 dst.z = Clamp<DstFormat>(dst.z, 2);
504
505 // normalize
506 dst.x = Normalize<DstFormat>(dst.x, 0);
507 dst.y = Normalize<DstFormat>(dst.y, 1);
508 dst.z = Normalize<DstFormat>(dst.z, 2);
509
510 // pack
511 simdscalari packed = _simd_castps_si(dst.x);
512 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetConstBPC(0)));
513 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetConstBPC(0) +
514 FormatTraits<DstFormat>::GetConstBPC(1)));
515
516 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
517 uint32_t *pPacked = (uint32_t*)&packed;
518 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
519 for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
520 {
521 *pAosTile++ = *pPacked++;
522 }
523
524 #endif
525 // Store data into destination
526 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
527 }
528 };
529
530 //////////////////////////////////////////////////////////////////////////
531 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
532 //////////////////////////////////////////////////////////////////////////
533 template<>
534 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
535 {
536 static const SWR_FORMAT SrcFormat = R32_FLOAT;
537 static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
538
539 //////////////////////////////////////////////////////////////////////////
540 /// @brief Converts a SIMD from the Hot Tile to the destination format
541 /// and converts from SOA to AOS.
542 /// @param pSrc - Pointer to raster tile.
543 /// @param pDst - Pointer to destination surface or deswizzling buffer.
544 template <size_t NumDests>
545 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
546 {
547 #if USE_8x2_TILE_BACKEND
548 simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
549
550 // clamp
551 const simd16scalar zero = _simd16_setzero_ps();
552 const simd16scalar ones = _simd16_set1_ps(1.0f);
553
554 comp = _simd16_max_ps(comp, zero);
555 comp = _simd16_min_ps(comp, ones);
556
557 // normalize
558 comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
559
560 simd16scalari temp = _simd16_cvtps_epi32(comp);
561
562 // swizzle
563 temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
564
565 // merge/store data into destination but don't overwrite the X8 bits
566 simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
567 simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
568
569 simd16scalari dest = _simd16_setzero_si();
570
571 dest = _simd16_insert_si(dest, destlo, 0);
572 dest = _simd16_insert_si(dest, desthi, 1);
573
574 simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
575
576 dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
577
578 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
579 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
580 #else
581 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
582
583 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
584 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
585
586 // Convert from SrcFormat --> DstFormat
587 simdvector src;
588 LoadSOA<SrcFormat>(pSrc, src);
589 StoreSOA<DstFormat>(src, soaTile);
590
591 // Convert from SOA --> AOS
592 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
593
594 // Store data into destination but don't overwrite the X8 bits
595 // Each 4-pixel row is 16-bytes
596 simd4scalari *pZRow01 = (simd4scalari*)aosTile;
597 simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
598 simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
599
600 simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
601 simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
602
603 simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
604 simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);
605
606 simd4scalari vMask = _mm_set1_epi32(0xFFFFFF);
607
608 vDst0 = SIMD128::andnot_si(vMask, vDst0);
609 vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
610 vDst1 = SIMD128::andnot_si(vMask, vDst1);
611 vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));
612
613 SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
614 SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
615 #endif
616 }
617 };
618
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// FlatConvert (16-wide, four destination pointers)
/// @brief Reads four float channel planes from the hot tile, clamps them
///        to [0, 1], optionally gamma-corrects the first three, scales
///        and converts them to integers, packs them into one 32-bit
///        pixel per lane (8 bits per channel) and stores the pixels
///        through four destination pointers, 16 bytes each.
/// @param pSrc - Pointer to raster tile (SOA; one simd16scalar per channel).
/// @param pDst0 - Destination for pixels 0 1 4 5 (row 0).
/// @param pDst1 - Destination for pixels 2 3 6 7 (row 1).
/// @param pDst2 - Destination for pixels 8 9 C D (row 0).
/// @param pDst3 - Destination for pixels A B E F (row 1).
//////////////////////////////////////////////////////////////////////////
template <SWR_FORMAT DstFormat>
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
{
    // swizzle rgba -> bgra while we load: plane i is picked by swizzle(i)
    const float* pPlane0 = reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar));
    const float* pPlane1 = reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar));
    const float* pPlane2 = reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar));
    const float* pPlane3 = reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar));

    simd16scalar chan0 = _simd16_load_ps(pPlane0); // float32 rrrrrrrrrrrrrrrr
    simd16scalar chan1 = _simd16_load_ps(pPlane1); // float32 gggggggggggggggg
    simd16scalar chan2 = _simd16_load_ps(pPlane2); // float32 bbbbbbbbbbbbbbbb
    simd16scalar chan3 = _simd16_load_ps(pPlane3); // float32 aaaaaaaaaaaaaaaa

    // clamp every channel to [0, 1]
    const simd16scalar lowerBound = _simd16_setzero_ps();
    const simd16scalar upperBound = _simd16_set1_ps(1.0f);

    chan0 = _simd16_min_ps(_simd16_max_ps(chan0, lowerBound), upperBound);
    chan1 = _simd16_min_ps(_simd16_max_ps(chan1, lowerBound), upperBound);
    chan2 = _simd16_min_ps(_simd16_max_ps(chan2, lowerBound), upperBound);
    chan3 = _simd16_min_ps(_simd16_max_ps(chan3, lowerBound), upperBound);

    // gamma-correct only rgb
    if (FormatTraits<DstFormat>::isSRGB)
    {
        chan0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, chan0);
        chan1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, chan1);
        chan2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, chan2);
    }

    // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
    chan0 = _simd16_mul_ps(chan0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    chan1 = _simd16_mul_ps(chan1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    chan2 = _simd16_mul_ps(chan2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    chan3 = _simd16_mul_ps(chan3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));

    // moving to 16 wide integer vector types
    simd16scalari byte0 = _simd16_cvtps_epi32(chan0); // padded byte rrrrrrrrrrrrrrrr
    simd16scalari byte1 = _simd16_cvtps_epi32(chan1); // padded byte gggggggggggggggg
    simd16scalari byte2 = _simd16_cvtps_epi32(chan2); // padded byte bbbbbbbbbbbbbbbb
    simd16scalari byte3 = _simd16_cvtps_epi32(chan3); // padded byte aaaaaaaaaaaaaaaa

    // SOA to AOS conversion: move channel i into byte i of each lane, then merge
    byte1 = _simd16_slli_epi32(byte1, 8);
    byte2 = _simd16_slli_epi32(byte2, 16);
    byte3 = _simd16_slli_epi32(byte3, 24);

    simd16scalari packed = _simd16_or_si(_simd16_or_si(byte0, byte1), _simd16_or_si(byte2, byte3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F

    // de-swizzle conversion
#if 1
    simd16scalari evenQuads = _simd16_permute2f128_si(packed, packed, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
    simd16scalari oddQuads  = _simd16_permute2f128_si(packed, packed, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F

    packed = _simd16_shuffle_epi64(evenQuads, oddQuads, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F

#else
    packed = _simd16_permute_epi32(packed, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));

#endif
    // store 8x2 memory order:
    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
    simd4scalari* pOut0 = reinterpret_cast<simd4scalari*>(pDst0);
    simd4scalari* pOut1 = reinterpret_cast<simd4scalari*>(pDst1);
    simd4scalari* pOut2 = reinterpret_cast<simd4scalari*>(pDst2);
    simd4scalari* pOut3 = reinterpret_cast<simd4scalari*>(pDst3);

    _simd_storeu2_si(pOut1, pOut0, _simd16_extract_si(packed, 0));
    _simd_storeu2_si(pOut3, pOut2, _simd16_extract_si(packed, 1));
}

#endif
691 template<SWR_FORMAT DstFormat>
692 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
693 {
694 static const uint32_t offset = sizeof(simdscalar);
695
696 // swizzle rgba -> bgra while we load
697 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
698 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
699 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
700 simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
701
702 // clamp
703 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
704 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
705
706 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
707 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
708
709 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
710 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
711
712 vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
713 vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
714
715 if (FormatTraits<DstFormat>::isSRGB)
716 {
717 // Gamma-correct only rgb
718 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
719 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
720 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
721 }
722
723 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
724 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
725 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
726 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
727 vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
728
729 // moving to 8 wide integer vector types
730 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
731 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
732 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
733 simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
734
735 #if KNOB_ARCH <= KNOB_ARCH_AVX
736
737 // splitting into two sets of 4 wide integer vector types
738 // because AVX doesn't have instructions to support this operation at 8 wide
739 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
740 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
741 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
742 simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
743
744 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
745 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
746 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
747 simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
748
749 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
750 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
751 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
752 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
753 srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
754 srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
755
756 srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
757 srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
758
759 srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
760 srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
761
762 srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
763 srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
764
765 // unpack into rows that get the tiling order correct
766 simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
767 simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
768
769 simdscalari final = _mm256_castsi128_si256(vRow00);
770 final = _mm256_insertf128_si256(final, vRow10, 1);
771
772 #else
773
774 // logic is as above, only wider
775 src1 = _mm256_slli_si256(src1, 1);
776 src2 = _mm256_slli_si256(src2, 2);
777 src3 = _mm256_slli_si256(src3, 3);
778
779 src0 = _mm256_or_si256(src0, src1);
780 src2 = _mm256_or_si256(src2, src3);
781
782 simdscalari final = _mm256_or_si256(src0, src2);
783
784 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
785 final = _mm256_permute4x64_epi64(final, 0xD8);
786 #endif
787
788 _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
789 }
790
791 #if USE_8x2_TILE_BACKEND
792 template<SWR_FORMAT DstFormat>
793 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
794 {
795 // swizzle rgba -> bgra while we load
796 simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
797 simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
798 simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
799
800 // clamp
801 const simd16scalar zero = _simd16_setzero_ps();
802 const simd16scalar ones = _simd16_set1_ps(1.0f);
803
804 comp0 = _simd16_max_ps(comp0, zero);
805 comp0 = _simd16_min_ps(comp0, ones);
806
807 comp1 = _simd16_max_ps(comp1, zero);
808 comp1 = _simd16_min_ps(comp1, ones);
809
810 comp2 = _simd16_max_ps(comp2, zero);
811 comp2 = _simd16_min_ps(comp2, ones);
812
813 // gamma-correct only rgb
814 if (FormatTraits<DstFormat>::isSRGB)
815 {
816 comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
817 comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
818 comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
819 }
820
821 // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
822 comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
823 comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
824 comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
825
826 // moving to 16 wide integer vector types
827 simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
828 simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
829 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
830
831 // SOA to AOS conversion
832 src1 = _simd16_slli_epi32(src1, 8);
833 src2 = _simd16_slli_epi32(src2, 16);
834
835 simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
836
837 // de-swizzle conversion
838 #if 1
839 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
840 simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
841
842 final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
843
844 #else
845 final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
846
847 #endif
848 // store 8x2 memory order:
849 // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
850 // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
851 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
852 _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
853 }
854
855 #endif
856 template<SWR_FORMAT DstFormat>
857 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
858 {
859 static const uint32_t offset = sizeof(simdscalar);
860
861 // swizzle rgba -> bgra while we load
862 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
863 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
864 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
865 // clamp
866 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
867 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
868
869 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
870 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
871
872 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
873 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
874
875 if (FormatTraits<DstFormat>::isSRGB)
876 {
877 // Gamma-correct only rgb
878 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
879 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
880 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
881 }
882
883 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
884 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
885 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
886 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
887
888 // moving to 8 wide integer vector types
889 simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
890 simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
891 simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
892
893 #if KNOB_ARCH <= KNOB_ARCH_AVX
894
895 // splitting into two sets of 4 wide integer vector types
896 // because AVX doesn't have instructions to support this operation at 8 wide
897 simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
898 simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
899 simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
900
901 simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
902 simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
903 simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
904
905 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
906 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
907 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
908 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
909
910 srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
911
912 srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
913
914 srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
915 srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
916
917 // unpack into rows that get the tiling order correct
918 simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
919 simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
920
921 simdscalari final = _mm256_castsi128_si256(vRow00);
922 final = _mm256_insertf128_si256(final, vRow10, 1);
923
924 #else
925
926 // logic is as above, only wider
927 src1 = _mm256_slli_si256(src1, 1);
928 src2 = _mm256_slli_si256(src2, 2);
929
930 src0 = _mm256_or_si256(src0, src1);
931
932 simdscalari final = _mm256_or_si256(src0, src2);
933
934 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
935 final = _mm256_permute4x64_epi64(final, 0xD8);
936
937 #endif
938
939 _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
940 }
941
942 template<>
943 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
944 {
945 template <size_t NumDests>
946 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
947 {
948 #if USE_8x2_TILE_BACKEND
949 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
950 #else
951 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
952 #endif
953 }
954 };
955
956 template<>
957 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
958 {
959 template <size_t NumDests>
960 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
961 {
962 #if USE_8x2_TILE_BACKEND
963 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
964 #else
965 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
966 #endif
967 }
968 };
969
970 template<>
971 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
972 {
973 template <size_t NumDests>
974 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
975 {
976 #if USE_8x2_TILE_BACKEND
977 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
978 #else
979 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
980 #endif
981 }
982 };
983
984 template<>
985 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
986 {
987 template <size_t NumDests>
988 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
989 {
990 #if USE_8x2_TILE_BACKEND
991 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
992 #else
993 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
994 #endif
995 }
996 };
997
998 template<>
999 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
1000 {
1001 template <size_t NumDests>
1002 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1003 {
1004 #if USE_8x2_TILE_BACKEND
1005 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1006 #else
1007 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1008 #endif
1009 }
1010 };
1011
1012 template<>
1013 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
1014 {
1015 template <size_t NumDests>
1016 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1017 {
1018 #if USE_8x2_TILE_BACKEND
1019 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1020 #else
1021 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1022 #endif
1023 }
1024 };
1025
1026 template<>
1027 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
1028 {
1029 template <size_t NumDests>
1030 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1031 {
1032 #if USE_8x2_TILE_BACKEND
1033 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1034 #else
1035 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1036 #endif
1037 }
1038 };
1039
1040 template<>
1041 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
1042 {
1043 template <size_t NumDests>
1044 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1045 {
1046 #if USE_8x2_TILE_BACKEND
1047 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1048 #else
1049 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1050 #endif
1051 }
1052 };
1053
1054 //////////////////////////////////////////////////////////////////////////
1055 /// StoreRasterTile
1056 //////////////////////////////////////////////////////////////////////////
1057 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1058 struct StoreRasterTile
1059 {
1060 //////////////////////////////////////////////////////////////////////////
1061 /// @brief Retrieve color from hot tile source which is always float.
1062 /// @param pSrc - Pointer to raster tile.
1063 /// @param x, y - Coordinates to raster tile.
1064 /// @param output - output color
1065 INLINE static void GetSwizzledSrcColor(
1066 uint8_t* pSrc,
1067 uint32_t x, uint32_t y,
1068 float outputColor[4])
1069 {
1070 #if USE_8x2_TILE_BACKEND
1071 typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
1072
1073 SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
1074
1075 // Compute which simd tile we're accessing within 8x8 tile.
1076 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1077 uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
1078
1079 SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
1080
1081 uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
1082
1083 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1084 #else
1085 typedef SimdTile<SrcFormat, DstFormat> SimdT;
1086
1087 SimdT* pSrcSimdTiles = (SimdT*)pSrc;
1088
1089 // Compute which simd tile we're accessing within 8x8 tile.
1090 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1091 uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
1092
1093 SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
1094
1095 uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
1096
1097 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1098 #endif
1099 }
1100
1101 //////////////////////////////////////////////////////////////////////////
1102 /// @brief Stores an 8x8 raster tile to the destination surface.
1103 /// @param pSrc - Pointer to raster tile.
1104 /// @param pDstSurface - Destination surface state
1105 /// @param x, y - Coordinates to raster tile.
1106 INLINE static void Store(
1107 uint8_t *pSrc,
1108 SWR_SURFACE_STATE* pDstSurface,
1109 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
1110 {
1111 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1112 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1113
1114 // For each raster tile pixel (rx, ry)
1115 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
1116 {
1117 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
1118 {
1119 // Perform bounds checking.
1120 if (((x + rx) < lodWidth) &&
1121 ((y + ry) < lodHeight))
1122 {
1123 float srcColor[4];
1124 GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
1125
1126 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1127 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
1128 sampleNum, pDstSurface->lod, pDstSurface);
1129 {
1130 ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
1131 }
1132 }
1133 }
1134 }
1135 }
1136
1137 //////////////////////////////////////////////////////////////////////////
1138 /// @brief Resolves an 8x8 raster tile to the resolve destination surface.
1139 /// @param pSrc - Pointer to raster tile.
1140 /// @param pDstSurface - Destination surface state
1141 /// @param x, y - Coordinates to raster tile.
1142 /// @param sampleOffset - Offset between adjacent multisamples
1143 INLINE static void Resolve(
1144 uint8_t *pSrc,
1145 SWR_SURFACE_STATE* pDstSurface,
1146 uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
1147 {
1148 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1149 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1150
1151 float oneOverNumSamples = 1.0f / pDstSurface->numSamples;
1152
1153 // For each raster tile pixel (rx, ry)
1154 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
1155 {
1156 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
1157 {
1158 // Perform bounds checking.
1159 if (((x + rx) < lodWidth) &&
1160 ((y + ry) < lodHeight))
1161 {
1162 // Sum across samples
1163 float resolveColor[4] = {0};
1164 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1165 {
1166 float sampleColor[4] = {0};
1167 uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum;
1168 GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor);
1169 resolveColor[0] += sampleColor[0];
1170 resolveColor[1] += sampleColor[1];
1171 resolveColor[2] += sampleColor[2];
1172 resolveColor[3] += sampleColor[3];
1173 }
1174
1175 // Divide by numSamples to average
1176 resolveColor[0] *= oneOverNumSamples;
1177 resolveColor[1] *= oneOverNumSamples;
1178 resolveColor[2] *= oneOverNumSamples;
1179 resolveColor[3] *= oneOverNumSamples;
1180
1181 // Use the resolve surface state
1182 SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress;
1183 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1184 pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex,
1185 0, pResolveSurface->lod, pResolveSurface);
1186 {
1187 ConvertPixelFromFloat<DstFormat>(pDst, resolveColor);
1188 }
1189 }
1190 }
1191 }
1192 }
1193
1194 };
1195
1196 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1197 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
1198 {};
1199
1200 //////////////////////////////////////////////////////////////////////////
1201 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1202 //////////////////////////////////////////////////////////////////////////
1203 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1204 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1205 {
1206 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1207 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1208 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1209
1210 //////////////////////////////////////////////////////////////////////////
1211 /// @brief Stores an 8x8 raster tile to the destination surface.
1212 /// @param pSrc - Pointer to raster tile.
1213 /// @param pDstSurface - Destination surface state
1214 /// @param x, y - Coordinates to raster tile.
1215 INLINE static void Store(
1216 uint8_t *pSrc,
1217 SWR_SURFACE_STATE* pDstSurface,
1218 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1219 {
1220 // Punt non-full tiles to generic store
1221 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1222 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1223
1224 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1225 {
1226 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1227 }
1228
1229 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1230 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1231 #if USE_8x2_TILE_BACKEND
1232
1233 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1234 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1235
1236 uint8_t* ppDsts[] =
1237 {
1238 pDst, // row 0, col 0
1239 pDst + pDstSurface->pitch, // row 1, col 0
1240 pDst + dx / 2, // row 0, col 1
1241 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1242 };
1243
1244 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1245 {
1246 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1247 {
1248 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1249
1250 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1251
1252 ppDsts[0] += dx;
1253 ppDsts[1] += dx;
1254 ppDsts[2] += dx;
1255 ppDsts[3] += dx;
1256 }
1257
1258 ppDsts[0] += dy;
1259 ppDsts[1] += dy;
1260 ppDsts[2] += dy;
1261 ppDsts[3] += dy;
1262 }
1263 #else
1264 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1265
1266 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1267 {
1268 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1269
1270 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1271 {
1272 // Format conversion and convert from SOA to AOS, and store the rows.
1273 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1274
1275 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1276 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1277 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1278 }
1279
1280 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1281 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1282 }
1283 #endif
1284 }
1285 };
1286
1287 //////////////////////////////////////////////////////////////////////////
1288 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1289 //////////////////////////////////////////////////////////////////////////
1290 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1291 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1292 {
1293 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1294 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1295 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1296
1297 //////////////////////////////////////////////////////////////////////////
1298 /// @brief Stores an 8x8 raster tile to the destination surface.
1299 /// @param pSrc - Pointer to raster tile.
1300 /// @param pDstSurface - Destination surface state
1301 /// @param x, y - Coordinates to raster tile.
1302 INLINE static void Store(
1303 uint8_t *pSrc,
1304 SWR_SURFACE_STATE* pDstSurface,
1305 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1306 {
1307 // Punt non-full tiles to generic store
1308 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1309 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1310
1311 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1312 {
1313 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1314 }
1315
1316 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1317 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1318 #if USE_8x2_TILE_BACKEND
1319
1320 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1321 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1322
1323 uint8_t* ppDsts[] =
1324 {
1325 pDst, // row 0, col 0
1326 pDst + pDstSurface->pitch, // row 1, col 0
1327 pDst + dx / 2, // row 0, col 1
1328 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1329 };
1330
1331 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1332 {
1333 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1334 {
1335 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1336
1337 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1338
1339 ppDsts[0] += dx;
1340 ppDsts[1] += dx;
1341 ppDsts[2] += dx;
1342 ppDsts[3] += dx;
1343 }
1344
1345 ppDsts[0] += dy;
1346 ppDsts[1] += dy;
1347 ppDsts[2] += dy;
1348 ppDsts[3] += dy;
1349 }
1350 #else
1351 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1352
1353 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1354 {
1355 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1356
1357 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1358 {
1359 // Format conversion and convert from SOA to AOS, and store the rows.
1360 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1361
1362 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1363 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1364 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1365 }
1366
1367 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1368 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1369 }
1370 #endif
1371 }
1372 };
1373
1374 //////////////////////////////////////////////////////////////////////////
1375 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1376 //////////////////////////////////////////////////////////////////////////
1377 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1378 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1379 {
1380 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1381 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1382 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1383
1384 //////////////////////////////////////////////////////////////////////////
1385 /// @brief Stores an 8x8 raster tile to the destination surface.
1386 /// @param pSrc - Pointer to raster tile.
1387 /// @param pDstSurface - Destination surface state
1388 /// @param x, y - Coordinates to raster tile.
1389 INLINE static void Store(
1390 uint8_t *pSrc,
1391 SWR_SURFACE_STATE* pDstSurface,
1392 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1393 {
1394 // Punt non-full tiles to generic store
1395 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1396 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1397
1398 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1399 {
1400 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1401 }
1402
1403 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1404 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1405 #if USE_8x2_TILE_BACKEND
1406
1407 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1408 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1409
1410 uint8_t* ppDsts[] =
1411 {
1412 pDst, // row 0, col 0
1413 pDst + pDstSurface->pitch, // row 1, col 0
1414 pDst + dx / 2, // row 0, col 1
1415 pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
1416 };
1417
1418 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1419 {
1420 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1421 {
1422 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1423
1424 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1425
1426 ppDsts[0] += dx;
1427 ppDsts[1] += dx;
1428 ppDsts[2] += dx;
1429 ppDsts[3] += dx;
1430 }
1431
1432 ppDsts[0] += dy;
1433 ppDsts[1] += dy;
1434 ppDsts[2] += dy;
1435 ppDsts[3] += dy;
1436 }
1437 #else
1438 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1439
1440 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1441 {
1442 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1443
1444 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1445 {
1446 // Format conversion and convert from SOA to AOS, and store the rows.
1447 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1448
1449 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1450 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1451 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1452 }
1453
1454 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1455 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1456 }
1457 #endif
1458 }
1459 };
1460
1461 //////////////////////////////////////////////////////////////////////////
1462 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1463 //////////////////////////////////////////////////////////////////////////
1464 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1465 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1466 {
1467 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1468 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1469 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1470 static const size_t MAX_DST_COLUMN_BYTES = 16;
1471 #if !USE_8x2_TILE_BACKEND
1472 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1473 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1474 #endif
1475
1476 //////////////////////////////////////////////////////////////////////////
1477 /// @brief Stores an 8x8 raster tile to the destination surface.
1478 /// @param pSrc - Pointer to raster tile.
1479 /// @param pDstSurface - Destination surface state
1480 /// @param x, y - Coordinates to raster tile.
1481 INLINE static void Store(
1482 uint8_t *pSrc,
1483 SWR_SURFACE_STATE* pDstSurface,
1484 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1485 {
1486 // Punt non-full tiles to generic store
1487 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1488 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1489
1490 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1491 {
1492 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1493 }
1494
1495 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1496 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1497 #if USE_8x2_TILE_BACKEND
1498
1499 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1500 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1501
1502 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1503 static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1504
1505 uint8_t *ppDsts[] =
1506 {
1507 pDst, // row 0, col 0
1508 pDst + pDstSurface->pitch, // row 1, col 0
1509 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1510 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1511 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1512 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1513 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1514 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
1515 };
1516
1517 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1518 {
1519 // Raster tile width is same as simd16 tile width
1520 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1521
1522 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1523
1524 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1525
1526 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1527 {
1528 ppDsts[i] += dy;
1529 }
1530 }
1531 #else
1532 uint8_t* ppDsts[] =
1533 {
1534 pDst, // row 0, col 0
1535 pDst + pDstSurface->pitch, // row 1, col 0
1536 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1537 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1538 };
1539
1540 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1541 {
1542 uint8_t* ppStartRows[] =
1543 {
1544 ppDsts[0],
1545 ppDsts[1],
1546 ppDsts[2],
1547 ppDsts[3],
1548 };
1549
1550 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1551 {
1552 // Format conversion and convert from SOA to AOS, and store the rows.
1553 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1554
1555 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1556 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1557 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1558 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1559 pSrc += SRC_COLUMN_BYTES;
1560 }
1561
1562 ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1563 ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1564 ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
1565 ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
1566 }
1567 #endif
1568 }
1569 };
1570
1571 //////////////////////////////////////////////////////////////////////////
1572 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1573 //////////////////////////////////////////////////////////////////////////
1574 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1575 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1576 {
1577 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1578 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1579 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1580 static const size_t MAX_DST_COLUMN_BYTES = 16;
1581 #if !USE_8x2_TILE_BACKEND
1582 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1583 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1584 #endif
1585
1586 //////////////////////////////////////////////////////////////////////////
1587 /// @brief Stores an 8x8 raster tile to the destination surface.
1588 /// @param pSrc - Pointer to raster tile.
1589 /// @param pDstSurface - Destination surface state
1590 /// @param x, y - Coordinates to raster tile.
1591 INLINE static void Store(
1592 uint8_t *pSrc,
1593 SWR_SURFACE_STATE* pDstSurface,
1594 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1595 {
1596 // Punt non-full tiles to generic store
1597 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1598 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1599
1600 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1601 {
1602 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1603 }
1604
1605 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1606 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1607 #if USE_8x2_TILE_BACKEND
1608
1609 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1610 const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1611
1612 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1613 static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1614
1615 uint8_t* ppDsts[] =
1616 {
1617 pDst, // row 0, col 0
1618 pDst + pDstSurface->pitch, // row 1, col 0
1619 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
1620 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
1621 pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
1622 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
1623 pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
1624 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
1625 pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4
1626 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4
1627 pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5
1628 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5
1629 pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6
1630 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6
1631 pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7
1632 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7
1633 };
1634
1635 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1636 {
1637 // Raster tile width is same as simd16 tile width
1638 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1639
1640 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1641
1642 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1643
1644 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1645 {
1646 ppDsts[i] += dy;
1647 }
1648 }
1649 #else
1650 struct DstPtrs
1651 {
1652 uint8_t* ppDsts[8];
1653 } ptrs;
1654
1655 // Need 8 pointers, 4 columns of 2 rows each
1656 for (uint32_t y = 0; y < 2; ++y)
1657 {
1658 for (uint32_t x = 0; x < 4; ++x)
1659 {
1660 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1661 }
1662 }
1663
1664 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1665 {
1666 DstPtrs startPtrs = ptrs;
1667
1668 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1669 {
1670 // Format conversion and convert from SOA to AOS, and store the rows.
1671 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1672
1673 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1674 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1675 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1676 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1677 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1678 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1679 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1680 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1681 pSrc += SRC_COLUMN_BYTES;
1682 }
1683
1684 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
1685 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
1686 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
1687 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
1688 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
1689 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
1690 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
1691 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
1692 }
1693 #endif
1694 }
1695 };
1696
1697 //////////////////////////////////////////////////////////////////////////
1698 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1699 //////////////////////////////////////////////////////////////////////////
1700 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1701 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1702 {
1703 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1704 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1705
1706 //////////////////////////////////////////////////////////////////////////
1707 /// @brief Stores an 8x8 raster tile to the destination surface.
1708 /// @param pSrc - Pointer to raster tile.
1709 /// @param pDstSurface - Destination surface state
1710 /// @param x, y - Coordinates to raster tile.
1711 INLINE static void Store(
1712 uint8_t *pSrc,
1713 SWR_SURFACE_STATE* pDstSurface,
1714 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1715 {
1716 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1717
1718 // Punt non-full tiles to generic store
1719 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1720 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1721
1722 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1723 {
1724 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1725 }
1726
1727 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1728 // We can compute the offsets to each column within the raster tile once and increment from these.
1729 #if USE_8x2_TILE_BACKEND
1730 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1731 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1732 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1733
1734 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1735
1736 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1737 uint8_t *ppDsts[] =
1738 {
1739 pDst,
1740 pDst + DestRowWidthBytes,
1741 pDst + DestRowWidthBytes / 4,
1742 pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1743 };
1744
1745 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1746 {
1747 // Raster tile width is same as simd16 tile width
1748 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1749
1750 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1751
1752 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1753
1754 ppDsts[0] += dy;
1755 ppDsts[1] += dy;
1756 ppDsts[2] += dy;
1757 ppDsts[3] += dy;
1758 }
1759 #else
1760 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1761 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1762 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1763
1764 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1765 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1766
1767 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1768 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1769 {
1770 uint32_t rowOffset = row * DestRowWidthBytes;
1771
1772 uint8_t* pRow = pCol0 + rowOffset;
1773 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1774
1775 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1776 pSrc += pSrcInc;
1777
1778 ppDsts[0] += DestRowWidthBytes / 4;
1779 ppDsts[1] += DestRowWidthBytes / 4;
1780
1781 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1782 pSrc += pSrcInc;
1783 }
1784 #endif
1785 }
1786 };
1787
1788 //////////////////////////////////////////////////////////////////////////
1789 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1790 //////////////////////////////////////////////////////////////////////////
1791 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1792 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1793 {
1794 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1795 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1796
1797 //////////////////////////////////////////////////////////////////////////
1798 /// @brief Stores an 8x8 raster tile to the destination surface.
1799 /// @param pSrc - Pointer to raster tile.
1800 /// @param pDstSurface - Destination surface state
1801 /// @param x, y - Coordinates to raster tile.
1802 INLINE static void Store(
1803 uint8_t *pSrc,
1804 SWR_SURFACE_STATE* pDstSurface,
1805 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1806 {
1807 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1808
1809 // Punt non-full tiles to generic store
1810 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1811 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1812
1813 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1814 {
1815 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1816 }
1817
1818 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1819 // We can compute the offsets to each column within the raster tile once and increment from these.
1820 #if USE_8x2_TILE_BACKEND
1821 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1822 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1823 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1824
1825 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1826
1827 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1828 uint8_t *ppDsts[] =
1829 {
1830 pDst,
1831 pDst + DestRowWidthBytes,
1832 pDst + DestRowWidthBytes / 2,
1833 pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1834 };
1835
1836 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1837 {
1838 // Raster tile width is same as simd16 tile width
1839 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1840
1841 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1842
1843 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1844
1845 ppDsts[0] += dy;
1846 ppDsts[1] += dy;
1847 ppDsts[2] += dy;
1848 ppDsts[3] += dy;
1849 }
1850 #else
1851 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1852 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1853 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1854
1855 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1856 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1857
1858 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1859 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1860 {
1861 uint32_t rowOffset = row * DestRowWidthBytes;
1862
1863 uint8_t* pRow = pCol0 + rowOffset;
1864 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1865
1866 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1867 pSrc += pSrcInc;
1868
1869 ppDsts[0] += DestRowWidthBytes / 2;
1870 ppDsts[1] += DestRowWidthBytes / 2;
1871
1872 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1873 pSrc += pSrcInc;
1874 }
1875 #endif
1876 }
1877 };
1878
1879 //////////////////////////////////////////////////////////////////////////
1880 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1881 //////////////////////////////////////////////////////////////////////////
1882 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1883 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1884 {
1885 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1886 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1887 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1888
1889 //////////////////////////////////////////////////////////////////////////
1890 /// @brief Stores an 8x8 raster tile to the destination surface.
1891 /// @param pSrc - Pointer to raster tile.
1892 /// @param pDstSurface - Destination surface state
1893 /// @param x, y - Coordinates to raster tile.
1894 INLINE static void Store(
1895 uint8_t *pSrc,
1896 SWR_SURFACE_STATE* pDstSurface,
1897 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1898 {
1899 static const uint32_t DestRowWidthBytes = 512; // 512B rows
1900
1901 // Punt non-full tiles to generic store
1902 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1903 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1904
1905 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1906 {
1907 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1908 }
1909
1910 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1911 // We can compute the offsets to each column within the raster tile once and increment from these.
1912 #if USE_8x2_TILE_BACKEND
1913 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1914 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1915
1916 const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1917 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1918
1919 uint8_t* ppDsts[] =
1920 {
1921 pDst, // row 0, col 0
1922 pDst + DestRowWidthBytes, // row 1, col 0
1923 pDst + dx / 2, // row 0, col 1
1924 pDst + DestRowWidthBytes + dx / 2 // row 1, col 1
1925 };
1926
1927 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1928 {
1929 for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1930 {
1931 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1932
1933 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1934
1935 ppDsts[0] += dx;
1936 ppDsts[1] += dx;
1937 ppDsts[2] += dx;
1938 ppDsts[3] += dx;
1939 }
1940
1941 ppDsts[0] += dy;
1942 ppDsts[1] += dy;
1943 ppDsts[2] += dy;
1944 ppDsts[3] += dy;
1945 }
1946 #else
1947 uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1948 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1949 uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
1950
1951 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1952 {
1953 for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
1954 {
1955 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
1956
1957 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
1958 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1959
1960 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1961 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1962 }
1963
1964 pRow0 += (DestRowWidthBytes * 2);
1965 pRow1 += (DestRowWidthBytes * 2);
1966 }
1967 #endif
1968 }
1969 };
1970
1971 //////////////////////////////////////////////////////////////////////////
1972 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1973 //////////////////////////////////////////////////////////////////////////
1974 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1975 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1976 {
1977 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1978 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1979
1980 //////////////////////////////////////////////////////////////////////////
1981 /// @brief Stores an 8x8 raster tile to the destination surface.
1982 /// @param pSrc - Pointer to raster tile.
1983 /// @param pDstSurface - Destination surface state
1984 /// @param x, y - Coordinates to raster tile.
1985 INLINE static void Store(
1986 uint8_t *pSrc,
1987 SWR_SURFACE_STATE* pDstSurface,
1988 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1989 {
1990 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1991 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1992
1993 // Punt non-full tiles to generic store
1994 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1995 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1996
1997 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1998 {
1999 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2000 }
2001
2002 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2003 // We can compute the offsets to each column within the raster tile once and increment from these.
2004 #if USE_8x2_TILE_BACKEND
2005 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2006 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2007 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2008
2009 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2010 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2011
2012 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2013 uint8_t *ppDsts[] =
2014 {
2015 pDst, // row 0, col 0
2016 pDst + DestRowWidthBytes, // row 1, col 0
2017 pDst + DestColumnBytes, // row 0, col 1
2018 pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1
2019 };
2020
2021 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2022 {
2023 // Raster tile width is same as simd16 tile width
2024 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2025
2026 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2027
2028 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2029
2030 ppDsts[0] += dy;
2031 ppDsts[1] += dy;
2032 ppDsts[2] += dy;
2033 ppDsts[3] += dy;
2034 }
2035 #else
2036 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2037 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2038 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2039
2040 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2041 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2042
2043 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2044 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2045 {
2046 uint32_t rowOffset = row * DestRowWidthBytes;
2047
2048 uint8_t* pRow = pCol0 + rowOffset;
2049 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
2050
2051 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2052 pSrc += pSrcInc;
2053
2054 ppDsts[0] += DestColumnBytes;
2055 ppDsts[1] += DestColumnBytes;
2056
2057 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2058 pSrc += pSrcInc;
2059 }
2060 #endif
2061 }
2062 };
2063
2064 //////////////////////////////////////////////////////////////////////////
2065 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
2066 //////////////////////////////////////////////////////////////////////////
2067 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2068 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
2069 {
2070 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
2071 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2072
2073 //////////////////////////////////////////////////////////////////////////
2074 /// @brief Stores an 8x8 raster tile to the destination surface.
2075 /// @param pSrc - Pointer to raster tile.
2076 /// @param pDstSurface - Destination surface state
2077 /// @param x, y - Coordinates to raster tile.
2078 INLINE static void Store(
2079 uint8_t *pSrc,
2080 SWR_SURFACE_STATE* pDstSurface,
2081 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2082 {
2083 static const uint32_t DestRowWidthBytes = 16; // 16B rows
2084 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
2085
2086 // Punt non-full tiles to generic store
2087 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2088 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2089
2090 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2091 {
2092 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2093 }
2094
2095 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2096 // We can compute the offsets to each column within the raster tile once and increment from these.
2097 #if USE_8x2_TILE_BACKEND
2098 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2099 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2100 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2101
2102 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2103 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2104
2105 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2106 uint8_t *ppDsts[] =
2107 {
2108 pDst, // row 0, col 0
2109 pDst + DestRowWidthBytes, // row 1, col 0
2110 pDst + DestColumnBytes, // row 0, col 1
2111 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
2112 pDst + DestColumnBytes * 2, // row 0, col 2
2113 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2114 pDst + DestColumnBytes * 3, // row 0, col 3
2115 pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3
2116 };
2117
2118 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2119 {
2120 // Raster tile width is same as simd16 tile width
2121 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2122
2123 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2124
2125 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2126
2127 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2128 {
2129 ppDsts[i] += dy;
2130 }
2131 }
2132 #else
2133 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2134 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2135 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2136 uint8_t* pCol1 = pCol0 + DestColumnBytes;
2137
2138 // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
2139 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2140 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2141
2142 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2143 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2144 {
2145 uint32_t rowOffset = row * DestRowWidthBytes;
2146 uint8_t* ppDsts[] =
2147 {
2148 pCol0 + rowOffset,
2149 pCol0 + rowOffset + DestRowWidthBytes,
2150 pCol1 + rowOffset,
2151 pCol1 + rowOffset + DestRowWidthBytes,
2152 };
2153
2154 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2155 pSrc += pSrcInc;
2156
2157 ppDsts[0] += DestColumnBytes * 2;
2158 ppDsts[1] += DestColumnBytes * 2;
2159 ppDsts[2] += DestColumnBytes * 2;
2160 ppDsts[3] += DestColumnBytes * 2;
2161
2162 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2163 pSrc += pSrcInc;
2164 }
2165 #endif
2166 }
2167 };
2168
2169 //////////////////////////////////////////////////////////////////////////
2170 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
2171 //////////////////////////////////////////////////////////////////////////
2172 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2173 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
2174 {
2175 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
2176 #if USE_8x2_TILE_BACKEND
2177 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2178
2179 #else
2180 static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
2181 static const size_t TILE_Y_ROWS = 32;
2182 static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
2183
2184 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
2185 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2186 static const size_t MAX_DST_COLUMN_BYTES = 16;
2187
2188 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
2189 static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
2190
2191 #endif
2192 //////////////////////////////////////////////////////////////////////////
2193 /// @brief Stores an 8x8 raster tile to the destination surface.
2194 /// @param pSrc - Pointer to raster tile.
2195 /// @param pDstSurface - Destination surface state
2196 /// @param x, y - Coordinates to raster tile.
2197 INLINE static void Store(
2198 uint8_t *pSrc,
2199 SWR_SURFACE_STATE* pDstSurface,
2200 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2201 {
2202 #if USE_8x2_TILE_BACKEND
2203 static const uint32_t DestRowWidthBytes = 16; // 16B rows
2204 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
2205 #endif
2206
2207 // Punt non-full tiles to generic store
2208 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2209 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2210
2211 if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2212 {
2213 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2214 }
2215
2216 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2217 // We can compute the offsets to each column within the raster tile once and increment from these.
2218 #if USE_8x2_TILE_BACKEND
2219 // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2220 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2221 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2222
2223 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2224 const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2225
2226 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2227 uint8_t *ppDsts[] =
2228 {
2229 pDst, // row 0, col 0
2230 pDst + DestRowWidthBytes, // row 1, col 0
2231 pDst + DestColumnBytes, // row 0, col 1
2232 pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
2233 pDst + DestColumnBytes * 2, // row 0, col 2
2234 pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2235 pDst + DestColumnBytes * 3, // row 0, col 3
2236 pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
2237 pDst + DestColumnBytes * 4, // row 0, col 4
2238 pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
2239 pDst + DestColumnBytes * 5, // row 0, col 5
2240 pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
2241 pDst + DestColumnBytes * 6, // row 0, col 6
2242 pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
2243 pDst + DestColumnBytes * 7, // row 0, col 7
2244 pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7
2245 };
2246
2247 for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2248 {
2249 // Raster tile width is same as simd16 tile width
2250 static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2251
2252 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2253
2254 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2255
2256 for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2257 {
2258 ppDsts[i] += dy;
2259 }
2260 }
2261 #else
2262 // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2263 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2264 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2265 struct DstPtrs
2266 {
2267 uint8_t* ppDsts[8];
2268 } ptrs;
2269
2270 // Need 8 pointers, 4 columns of 2 rows each
2271 for (uint32_t y = 0; y < 2; ++y)
2272 {
2273 for (uint32_t x = 0; x < 4; ++x)
2274 {
2275 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
2276 }
2277 }
2278
2279 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
2280 {
2281 DstPtrs startPtrs = ptrs;
2282
2283 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
2284 {
2285 // Format conversion and convert from SOA to AOS, and store the rows.
2286 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
2287
2288 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
2289 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
2290 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
2291 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
2292 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
2293 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
2294 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
2295 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
2296 pSrc += SRC_COLUMN_BYTES;
2297 }
2298
2299 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
2300 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
2301 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
2302 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
2303 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
2304 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
2305 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
2306 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
2307 }
2308 #endif
2309 }
2310 };
2311
2312 //////////////////////////////////////////////////////////////////////////
2313 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
2314 //////////////////////////////////////////////////////////////////////////
2315 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2316 struct StoreMacroTile
2317 {
2318 //////////////////////////////////////////////////////////////////////////
2319 /// @brief Stores a macrotile to the destination surface using safe implementation.
2320 /// @param pSrc - Pointer to macro tile.
2321 /// @param pDstSurface - Destination surface state
2322 /// @param x, y - Coordinates to macro tile
2323 static void StoreGeneric(
2324 uint8_t *pSrcHotTile,
2325 SWR_SURFACE_STATE* pDstSurface,
2326 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2327 {
2328 PFN_STORE_TILES_INTERNAL pfnStore;
2329 pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2330
2331 // Store each raster tile from the hot tile to the destination surface.
2332 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2333 {
2334 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2335 {
2336 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2337 {
2338 pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2339 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2340 }
2341 }
2342 }
2343
2344 }
2345
2346 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
2347 //////////////////////////////////////////////////////////////////////////
2348 /// @brief Stores a macrotile to the destination surface.
2349 /// @param pSrc - Pointer to macro tile.
2350 /// @param pDstSurface - Destination surface state
2351 /// @param x, y - Coordinates to macro tile
2352 static void Store(
2353 uint8_t *pSrcHotTile,
2354 SWR_SURFACE_STATE* pDstSurface,
2355 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2356 {
2357 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
2358
2359 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2360 {
2361 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
2362 0,
2363 0,
2364 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
2365 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
2366 sampleNum,
2367 pDstSurface->lod,
2368 pDstSurface);
2369
2370 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
2371 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
2372 (pDstSurface->bInterleavedSamples);
2373
2374 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2375 }
2376
2377 // Save original for pSrcHotTile resolve.
2378 uint8_t *pResolveSrcHotTile = pSrcHotTile;
2379
2380 // Store each raster tile from the hot tile to the destination surface.
2381 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2382 {
2383 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2384 {
2385 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2386 {
2387 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2388 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2389 }
2390 }
2391 }
2392
2393 if (pDstSurface->xpAuxBaseAddress)
2394 {
2395 uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2396 // Store each raster tile from the hot tile to the destination surface.
2397 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2398 {
2399 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2400 {
2401 StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex);
2402 pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples;
2403 }
2404 }
2405 }
2406 }
2407 };
2408
2409 //////////////////////////////////////////////////////////////////////////
2410 /// InitStoreTilesTable - Helper for setting up the tables.
2411 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2412 void InitStoreTilesTableColor_Half1(
2413 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
2414 {
2415 table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
2416 table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
2417 table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
2418 table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
2419 table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
2420 table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
2421 table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
2422 table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
2423 table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
2424 table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
2425 table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
2426 table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
2427 table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
2428 table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
2429 table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
2430 table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
2431 table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
2432 table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
2433 table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
2434 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2435 table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
2436 table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
2437 table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
2438 table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
2439 table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
2440 table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
2441 table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
2442 table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
2443 table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
2444 table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
2445 table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
2446 table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
2447 table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
2448 table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
2449 table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
2450 table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
2451 table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
2452 table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
2453 table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
2454 table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
2455 table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
2456 table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
2457 table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
2458 table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
2459 table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
2460 table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
2461 table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
2462 table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
2463 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
2464 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
2465 table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
2466 table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
2467 table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
2468 table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
2469 table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
2470 table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
2471 }
2472
2473 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2474 void InitStoreTilesTableColor_Half2(
2475 PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
2476 {
2477 table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
2478 table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
2479 table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
2480 table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
2481 table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
2482 table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
2483 table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
2484 table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
2485 table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
2486 table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
2487 table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
2488 table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
2489 table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
2490 table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
2491 table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
2492 table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
2493 table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
2494 table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
2495 table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
2496 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
2497 table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
2498 table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
2499 table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
2500 table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
2501 table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
2502 table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
2503 table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
2504 table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
2505 table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
2506 table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
2507 table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
2508 table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
2509 table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
2510 table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
2511 table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
2512 table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
2513 table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
2514 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
2515 table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
2516 table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
2517 table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
2518 table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
2519 table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
2520 table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
2521 table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
2522 table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
2523 table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
2524 table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
2525 table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
2526 table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2527 table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2528 table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2529 table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2530 table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2531 table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2532 table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2533 table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2534 table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2535 table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2536 table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2537 table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2538 table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2539 table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2540 table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2541 }
2542
2543 //////////////////////////////////////////////////////////////////////////
2544 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2545 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2546 void InitStoreTilesTableDepth(
2547 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2548 {
2549 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2550 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2551 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2552 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2553 }
2554
2555 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2556 void InitStoreTilesTableStencil(
2557 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2558 {
2559 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2560 }