95a1adbc8db09c4b6eff94330e5178e177f71497
[mesa.git] / src / gallium / drivers / swr / rasterizer / memory / StoreTile.h
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.h
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
35
36 #include "memory/TilingFunctions.h"
37 #include "memory/tilingtraits.h"
38 #include "memory/Convert.h"
39 #include "core/multisample.h"
40
41 #include <array>
42 #include <sstream>
43
44 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
45 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
46
47 //////////////////////////////////////////////////////////////////////////
48 /// Store Raster Tile Function Tables.
49 //////////////////////////////////////////////////////////////////////////
50 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
51 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
52 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
53
54 void InitStoreTilesTable_Linear_1();
55 void InitStoreTilesTable_Linear_2();
56 void InitStoreTilesTable_TileX_1();
57 void InitStoreTilesTable_TileX_2();
58 void InitStoreTilesTable_TileY_1();
59 void InitStoreTilesTable_TileY_2();
60 void InitStoreTilesTable_TileW();
61 void InitStoreTilesTable();
62
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for writing one SIMD raster-tile worth of AOS
///        pixels to its destination rows. Store() is deleted here, so
///        only the explicit specializations for supported
///        (PixelSize, NumDests) pairs can actually be called.
/// @tparam PixelSize - Size of one destination pixel, in bits.
/// @tparam NumDests  - Number of destination pointers.
/// @param pSrc   - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts - Array of destination pointers.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    // Deliberately deleted: an unsupported combination fails at compile time.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
77
78 //////////////////////////////////////////////////////////////////////////
79 /// StorePixels (32-bit pixel specialization)
80 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
81 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
82 /// @param ppDsts - Array of destination pointers. Each pointer is
83 /// to a single row of at most 16B.
84 /// @tparam NumDests - Number of destination pointers. Each pair of
85 /// pointers is for a 16-byte column of two rows.
86 //////////////////////////////////////////////////////////////////////////
87 template <>
88 struct StorePixels<8, 2>
89 {
90 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
91 {
92 // Each 4-pixel row is 4 bytes.
93 const uint16_t* pPixSrc = (const uint16_t*)pSrc;
94
95 // Unswizzle from SWR-Z order
96 uint16_t* pRow = (uint16_t*)ppDsts[0];
97 pRow[0] = pPixSrc[0];
98 pRow[1] = pPixSrc[2];
99
100 pRow = (uint16_t*)ppDsts[1];
101 pRow[0] = pPixSrc[1];
102 pRow[1] = pPixSrc[3];
103 }
104 };
105
106 //////////////////////////////////////////////////////////////////////////
107 /// StorePixels (32-bit pixel specialization)
108 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
109 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
110 /// @param ppDsts - Array of destination pointers. Each pointer is
111 /// to a single row of at most 16B.
112 /// @tparam NumDests - Number of destination pointers. Each pair of
113 /// pointers is for a 16-byte column of two rows.
114 //////////////////////////////////////////////////////////////////////////
115 template <>
116 struct StorePixels<16, 2>
117 {
118 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
119 {
120 // Each 4-pixel row is 8 bytes.
121 const uint32_t* pPixSrc = (const uint32_t*)pSrc;
122
123 // Unswizzle from SWR-Z order
124 uint32_t* pRow = (uint32_t*)ppDsts[0];
125 pRow[0] = pPixSrc[0];
126 pRow[1] = pPixSrc[2];
127
128 pRow = (uint32_t*)ppDsts[1];
129 pRow[0] = pPixSrc[1];
130 pRow[1] = pPixSrc[3];
131 }
132 };
133
134 //////////////////////////////////////////////////////////////////////////
135 /// StorePixels (32-bit pixel specialization)
136 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
137 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
138 /// @param ppDsts - Array of destination pointers. Each pointer is
139 /// to a single row of at most 16B.
140 /// @tparam NumDests - Number of destination pointers. Each pair of
141 /// pointers is for a 16-byte column of two rows.
142 //////////////////////////////////////////////////////////////////////////
143 template <>
144 struct StorePixels<32, 2>
145 {
146 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
147 {
148 // Each 4-pixel row is 16-bytes
149 __m128i *pZRow01 = (__m128i*)pSrc;
150 __m128i vQuad00 = _mm_load_si128(pZRow01);
151 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
152
153 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
154 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
155
156 _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
157 _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
158 }
159 };
160
161 //////////////////////////////////////////////////////////////////////////
162 /// StorePixels (32-bit pixel specialization)
163 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
164 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
165 /// @param ppDsts - Array of destination pointers. Each pointer is
166 /// to a single row of at most 16B.
167 /// @tparam NumDests - Number of destination pointers. Each pair of
168 /// pointers is for a 16-byte column of two rows.
169 //////////////////////////////////////////////////////////////////////////
170 template <>
171 struct StorePixels<64, 4>
172 {
173 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
174 {
175 // Each 4-pixel row is 32 bytes.
176 const __m128i* pPixSrc = (const __m128i*)pSrc;
177
178 // order of pointers match SWR-Z layout
179 __m128i** pvDsts = (__m128i**)&ppDsts[0];
180 *pvDsts[0] = pPixSrc[0];
181 *pvDsts[1] = pPixSrc[1];
182 *pvDsts[2] = pPixSrc[2];
183 *pvDsts[3] = pPixSrc[3];
184 }
185 };
186
187 //////////////////////////////////////////////////////////////////////////
188 /// StorePixels (32-bit pixel specialization)
189 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
190 /// @param pSrc - Pointer to source raster tile in SWRZ pixel order
191 /// @param ppDsts - Array of destination pointers. Each pointer is
192 /// to a single row of at most 16B.
193 /// @tparam NumDests - Number of destination pointers. Each pair of
194 /// pointers is for a 16-byte column of two rows.
195 //////////////////////////////////////////////////////////////////////////
196 template <>
197 struct StorePixels<128, 8>
198 {
199 static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
200 {
201 // Each 4-pixel row is 64 bytes.
202 const __m128i* pPixSrc = (const __m128i*)pSrc;
203
204 // Unswizzle from SWR-Z order
205 __m128i** pvDsts = (__m128i**)&ppDsts[0];
206 *pvDsts[0] = pPixSrc[0];
207 *pvDsts[1] = pPixSrc[2];
208 *pvDsts[2] = pPixSrc[1];
209 *pvDsts[3] = pPixSrc[3];
210 *pvDsts[4] = pPixSrc[4];
211 *pvDsts[5] = pPixSrc[6];
212 *pvDsts[6] = pPixSrc[5];
213 *pvDsts[7] = pPixSrc[7];
214 }
215 };
216
217 //////////////////////////////////////////////////////////////////////////
218 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
219 //////////////////////////////////////////////////////////////////////////
220 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
221 struct ConvertPixelsSOAtoAOS
222 {
223 //////////////////////////////////////////////////////////////////////////
224 /// @brief Converts a SIMD from the Hot Tile to the destination format
225 /// and converts from SOA to AOS.
226 /// @param pSrc - Pointer to raster tile.
227 /// @param pDst - Pointer to destination surface or deswizzling buffer.
228 template <size_t NumDests>
229 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
230 {
231 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
232
233 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
234 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
235
236 // Convert from SrcFormat --> DstFormat
237 simdvector src;
238 LoadSOA<SrcFormat>(pSrc, src);
239 StoreSOA<DstFormat>(src, soaTile);
240
241 // Convert from SOA --> AOS
242 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
243
244 // Store data into destination
245 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
246 }
247 };
248
249 //////////////////////////////////////////////////////////////////////////
250 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
251 /// Specialization for no format conversion
252 //////////////////////////////////////////////////////////////////////////
253 template<SWR_FORMAT Format>
254 struct ConvertPixelsSOAtoAOS<Format, Format>
255 {
256 //////////////////////////////////////////////////////////////////////////
257 /// @brief Converts a SIMD from the Hot Tile to the destination format
258 /// and converts from SOA to AOS.
259 /// @param pSrc - Pointer to raster tile.
260 /// @param pDst - Pointer to destination surface or deswizzling buffer.
261 template <size_t NumDests>
262 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
263 {
264 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
265
266 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
267
268 // Convert from SOA --> AOS
269 FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
270
271 // Store data into destination
272 StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
273 }
274 };
275
276 //////////////////////////////////////////////////////////////////////////
277 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
278 //////////////////////////////////////////////////////////////////////////
279 template<>
280 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
281 {
282 //////////////////////////////////////////////////////////////////////////
283 /// @brief Converts a SIMD from the Hot Tile to the destination format
284 /// and converts from SOA to AOS.
285 /// @param pSrc - Pointer to raster tile.
286 /// @param pDst - Pointer to destination surface or deswizzling buffer.
287 template <size_t NumDests>
288 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
289 {
290 static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
291 static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
292 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
293
294 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
295
296 // Load hot-tile
297 simdvector src, dst;
298 LoadSOA<SrcFormat>(pSrc, src);
299
300 // deswizzle
301 dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
302 dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
303 dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
304
305 // clamp
306 dst.x = Clamp<DstFormat>(dst.x, 0);
307 dst.y = Clamp<DstFormat>(dst.y, 1);
308 dst.z = Clamp<DstFormat>(dst.z, 2);
309
310 // normalize
311 dst.x = Normalize<DstFormat>(dst.x, 0);
312 dst.y = Normalize<DstFormat>(dst.y, 1);
313 dst.z = Normalize<DstFormat>(dst.z, 2);
314
315 // pack
316 simdscalari packed = _simd_castps_si(dst.x);
317 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
318 packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
319 FormatTraits<DstFormat>::GetBPC(1)));
320
321 // pack low 16 bits of each 32 bit lane to low 128 bits of dst
322 uint32_t *pPacked = (uint32_t*)&packed;
323 uint16_t *pAosTile = (uint16_t*)&aosTile[0];
324 for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
325 {
326 *pAosTile++ = *pPacked++;
327 }
328
329 // Store data into destination
330 StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
331 }
332 };
333
334 //////////////////////////////////////////////////////////////////////////
335 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
336 //////////////////////////////////////////////////////////////////////////
337 template<>
338 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
339 {
340 static const SWR_FORMAT SrcFormat = R32_FLOAT;
341 static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
342
343 //////////////////////////////////////////////////////////////////////////
344 /// @brief Converts a SIMD from the Hot Tile to the destination format
345 /// and converts from SOA to AOS.
346 /// @param pSrc - Pointer to raster tile.
347 /// @param pDst - Pointer to destination surface or deswizzling buffer.
348 template <size_t NumDests>
349 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
350 {
351 static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
352
353 OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
354 OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
355
356 // Convert from SrcFormat --> DstFormat
357 simdvector src;
358 LoadSOA<SrcFormat>(pSrc, src);
359 StoreSOA<DstFormat>(src, soaTile);
360
361 // Convert from SOA --> AOS
362 FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
363
364 // Store data into destination but don't overwrite the X8 bits
365 // Each 4-pixel row is 16-bytes
366 __m128i *pZRow01 = (__m128i*)aosTile;
367 __m128i vQuad00 = _mm_load_si128(pZRow01);
368 __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
369
370 __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
371 __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
372
373 __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
374 __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
375
376 __m128i vMask = _mm_set1_epi32(0xFFFFFF);
377
378 vDst0 = _mm_andnot_si128(vMask, vDst0);
379 vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
380 vDst1 = _mm_andnot_si128(vMask, vDst1);
381 vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
382
383 _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
384 _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
385 }
386 };
387
388 template<SWR_FORMAT DstFormat>
389 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
390 {
391 static const uint32_t offset = sizeof(simdscalar);
392
393 // swizzle rgba -> bgra while we load
394 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
395 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
396 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
397 simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
398
399 // clamp
400 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
401 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
402
403 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
404 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
405
406 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
407 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
408
409 vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
410 vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
411
412 if (FormatTraits<DstFormat>::isSRGB)
413 {
414 // Gamma-correct only rgb
415 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
416 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
417 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
418 }
419
420 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
421 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
422 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
423 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
424 vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
425
426 // moving to 8 wide integer vector types
427 __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
428 __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
429 __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
430 __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
431
432 #if KNOB_ARCH == KNOB_ARCH_AVX
433
434 // splitting into two sets of 4 wide integer vector types
435 // because AVX doesn't have instructions to support this operation at 8 wide
436 __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
437 __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
438 __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
439 __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
440
441 __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
442 __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
443 __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
444 __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
445
446 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
447 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
448 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
449 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
450 srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
451 srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
452
453 srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
454 srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
455
456 srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
457 srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
458
459 srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
460 srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
461
462 // unpack into rows that get the tiling order correct
463 __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
464 __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
465
466 __m256i final = _mm256_castsi128_si256(vRow00);
467 final = _mm256_insertf128_si256(final, vRow10, 1);
468
469 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
470
471 // logic is as above, only wider
472 src1 = _mm256_slli_si256(src1, 1);
473 src2 = _mm256_slli_si256(src2, 2);
474 src3 = _mm256_slli_si256(src3, 3);
475
476 src0 = _mm256_or_si256(src0, src1);
477 src2 = _mm256_or_si256(src2, src3);
478
479 __m256i final = _mm256_or_si256(src0, src2);
480
481 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
482 final = _mm256_permute4x64_epi64(final, 0xD8);
483
484 #endif
485
486 _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
487 }
488
489 template<SWR_FORMAT DstFormat>
490 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
491 {
492 static const uint32_t offset = sizeof(simdscalar);
493
494 // swizzle rgba -> bgra while we load
495 simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
496 simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
497 simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
498 // clamp
499 vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
500 vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
501
502 vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
503 vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
504
505 vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
506 vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
507
508 if (FormatTraits<DstFormat>::isSRGB)
509 {
510 // Gamma-correct only rgb
511 vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
512 vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
513 vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
514 }
515
516 // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
517 vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
518 vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
519 vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
520
521 // moving to 8 wide integer vector types
522 __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
523 __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
524 __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
525
526 #if KNOB_ARCH == KNOB_ARCH_AVX
527
528 // splitting into two sets of 4 wide integer vector types
529 // because AVX doesn't have instructions to support this operation at 8 wide
530 __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
531 __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
532 __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
533
534 __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
535 __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
536 __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
537
538 srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
539 srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
540 srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
541 srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
542
543 srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
544
545 srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
546
547 srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
548 srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
549
550 // unpack into rows that get the tiling order correct
551 __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
552 __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
553
554 __m256i final = _mm256_castsi128_si256(vRow00);
555 final = _mm256_insertf128_si256(final, vRow10, 1);
556
557 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
558
559 // logic is as above, only wider
560 src1 = _mm256_slli_si256(src1, 1);
561 src2 = _mm256_slli_si256(src2, 2);
562
563 src0 = _mm256_or_si256(src0, src1);
564
565 __m256i final = _mm256_or_si256(src0, src2);
566
567 // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
568 final = _mm256_permute4x64_epi64(final, 0xD8);
569
570 #endif
571
572 _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
573 }
574
575 template<>
576 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
577 {
578 template <size_t NumDests>
579 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
580 {
581 FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
582 }
583 };
584
585 template<>
586 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
587 {
588 template <size_t NumDests>
589 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
590 {
591 FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
592 }
593 };
594
595 template<>
596 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
597 {
598 template <size_t NumDests>
599 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
600 {
601 FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
602 }
603 };
604
605 template<>
606 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
607 {
608 template <size_t NumDests>
609 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
610 {
611 FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
612 }
613 };
614
615 template<>
616 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
617 {
618 template <size_t NumDests>
619 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
620 {
621 FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
622 }
623 };
624
625 template<>
626 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
627 {
628 template <size_t NumDests>
629 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
630 {
631 FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
632 }
633 };
634
635 template<>
636 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
637 {
638 template <size_t NumDests>
639 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
640 {
641 FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
642 }
643 };
644
645 template<>
646 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
647 {
648 template <size_t NumDests>
649 INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
650 {
651 FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
652 }
653 };
654
655 //////////////////////////////////////////////////////////////////////////
656 /// StoreRasterTile
657 //////////////////////////////////////////////////////////////////////////
658 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
659 struct StoreRasterTile
660 {
661 //////////////////////////////////////////////////////////////////////////
662 /// @brief Retrieve color from hot tile source which is always float.
663 /// @param pSrc - Pointer to raster tile.
664 /// @param x, y - Coordinates to raster tile.
665 /// @param output - output color
666 INLINE static void GetSwizzledSrcColor(
667 uint8_t* pSrc,
668 uint32_t x, uint32_t y,
669 float outputColor[4])
670 {
671 typedef SimdTile<SrcFormat, DstFormat> SimdT;
672
673 SimdT* pSrcSimdTiles = (SimdT*)pSrc;
674
675 // Compute which simd tile we're accessing within 8x8 tile.
676 // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
677 uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
678
679 SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
680
681 uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
682
683 pSimdTile->GetSwizzledColor(simdOffset, outputColor);
684 }
685
686 //////////////////////////////////////////////////////////////////////////
687 /// @brief Stores an 8x8 raster tile to the destination surface.
688 /// @param pSrc - Pointer to raster tile.
689 /// @param pDstSurface - Destination surface state
690 /// @param x, y - Coordinates to raster tile.
691 INLINE static void Store(
692 uint8_t *pSrc,
693 SWR_SURFACE_STATE* pDstSurface,
694 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
695 {
696 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
697 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
698
699 // For each raster tile pixel (rx, ry)
700 for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
701 {
702 for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
703 {
704 // Perform bounds checking.
705 if (((x + rx) < lodWidth) &&
706 ((y + ry) < lodHeight))
707 {
708 float srcColor[4];
709 GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
710
711 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
712 pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
713 sampleNum, pDstSurface->lod, pDstSurface);
714 {
715 ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
716 }
717 }
718 }
719 }
720 }
721 };
722
723 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
724 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
725 {};
726
727 //////////////////////////////////////////////////////////////////////////
728 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
729 //////////////////////////////////////////////////////////////////////////
730 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
731 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
732 {
733 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
734 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
735 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
736
737 //////////////////////////////////////////////////////////////////////////
738 /// @brief Stores an 8x8 raster tile to the destination surface.
739 /// @param pSrc - Pointer to raster tile.
740 /// @param pDstSurface - Destination surface state
741 /// @param x, y - Coordinates to raster tile.
742 INLINE static void Store(
743 uint8_t *pSrc,
744 SWR_SURFACE_STATE* pDstSurface,
745 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
746 {
747 // Punt non-full tiles to generic store
748 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
749 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
750 if (x + KNOB_TILE_X_DIM > lodWidth ||
751 y + KNOB_TILE_Y_DIM > lodHeight)
752 {
753 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
754 }
755
756 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
757 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
758 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
759
760 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
761 {
762 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
763
764 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
765 {
766 // Format conversion and convert from SOA to AOS, and store the rows.
767 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
768
769 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
770 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
771 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
772 }
773
774 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
775 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
776 }
777 }
778 };
779
780 //////////////////////////////////////////////////////////////////////////
781 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
782 //////////////////////////////////////////////////////////////////////////
783 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
784 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
785 {
786 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
787 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
788 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
789
790 //////////////////////////////////////////////////////////////////////////
791 /// @brief Stores an 8x8 raster tile to the destination surface.
792 /// @param pSrc - Pointer to raster tile.
793 /// @param pDstSurface - Destination surface state
794 /// @param x, y - Coordinates to raster tile.
795 INLINE static void Store(
796 uint8_t *pSrc,
797 SWR_SURFACE_STATE* pDstSurface,
798 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
799 {
800 // Punt non-full tiles to generic store
801 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
802 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
803 if (x + KNOB_TILE_X_DIM > lodWidth ||
804 y + KNOB_TILE_Y_DIM > lodHeight)
805 {
806 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
807 }
808
809 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
810 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
811 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
812
813 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
814 {
815 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
816
817 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
818 {
819 // Format conversion and convert from SOA to AOS, and store the rows.
820 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
821
822 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
823 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
824 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
825 }
826
827 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
828 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
829 }
830 }
831 };
832
833 //////////////////////////////////////////////////////////////////////////
834 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
835 //////////////////////////////////////////////////////////////////////////
836 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
837 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
838 {
839 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
840 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
841 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
842
843 //////////////////////////////////////////////////////////////////////////
844 /// @brief Stores an 8x8 raster tile to the destination surface.
845 /// @param pSrc - Pointer to raster tile.
846 /// @param pDstSurface - Destination surface state
847 /// @param x, y - Coordinates to raster tile.
848 INLINE static void Store(
849 uint8_t *pSrc,
850 SWR_SURFACE_STATE* pDstSurface,
851 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
852 {
853 // Punt non-full tiles to generic store
854 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
855 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
856 if (x + KNOB_TILE_X_DIM > lodWidth ||
857 y + KNOB_TILE_Y_DIM > lodHeight)
858 {
859 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
860 }
861
862 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
863 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
864 uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
865
866 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
867 {
868 uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
869
870 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
871 {
872 // Format conversion and convert from SOA to AOS, and store the rows.
873 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
874
875 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
876 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
877 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
878 }
879
880 ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
881 ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
882 }
883 }
884 };
885
886 //////////////////////////////////////////////////////////////////////////
887 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
888 //////////////////////////////////////////////////////////////////////////
889 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
890 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
891 {
892 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
893 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
894 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
895 static const size_t MAX_DST_COLUMN_BYTES = 16;
896 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
897 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
898
899 //////////////////////////////////////////////////////////////////////////
900 /// @brief Stores an 8x8 raster tile to the destination surface.
901 /// @param pSrc - Pointer to raster tile.
902 /// @param pDstSurface - Destination surface state
903 /// @param x, y - Coordinates to raster tile.
904 INLINE static void Store(
905 uint8_t *pSrc,
906 SWR_SURFACE_STATE* pDstSurface,
907 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
908 {
909 // Punt non-full tiles to generic store
910 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
911 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
912 if (x + KNOB_TILE_X_DIM > lodWidth ||
913 y + KNOB_TILE_Y_DIM > lodHeight)
914 {
915 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
916 }
917
918 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
919 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
920 uint8_t* ppDsts[] =
921 {
922 pDst, // row 0, col 0
923 pDst + pDstSurface->pitch, // row 1, col 0
924 pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
925 pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
926 };
927
928 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
929 {
930 uint8_t* ppStartRows[] =
931 {
932 ppDsts[0],
933 ppDsts[1],
934 ppDsts[2],
935 ppDsts[3],
936 };
937
938 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
939 {
940 // Format conversion and convert from SOA to AOS, and store the rows.
941 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
942
943 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
944 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
945 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
946 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
947 pSrc += SRC_COLUMN_BYTES;
948 }
949
950 ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
951 ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
952 ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
953 ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
954 }
955 }
956 };
957
958 //////////////////////////////////////////////////////////////////////////
959 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
960 //////////////////////////////////////////////////////////////////////////
961 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
962 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
963 {
964 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
965 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
966 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
967 static const size_t MAX_DST_COLUMN_BYTES = 16;
968 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
969 static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
970
971 //////////////////////////////////////////////////////////////////////////
972 /// @brief Stores an 8x8 raster tile to the destination surface.
973 /// @param pSrc - Pointer to raster tile.
974 /// @param pDstSurface - Destination surface state
975 /// @param x, y - Coordinates to raster tile.
976 INLINE static void Store(
977 uint8_t *pSrc,
978 SWR_SURFACE_STATE* pDstSurface,
979 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
980 {
981 // Punt non-full tiles to generic store
982 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
983 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
984 if (x + KNOB_TILE_X_DIM > lodWidth ||
985 y + KNOB_TILE_Y_DIM > lodHeight)
986 {
987 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
988 }
989
990 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
991 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
992 struct DstPtrs
993 {
994 uint8_t* ppDsts[8];
995 } ptrs;
996
997 // Need 8 pointers, 4 columns of 2 rows each
998 for (uint32_t y = 0; y < 2; ++y)
999 {
1000 for (uint32_t x = 0; x < 4; ++x)
1001 {
1002 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1003 }
1004 }
1005
1006 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1007 {
1008 DstPtrs startPtrs = ptrs;
1009
1010 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1011 {
1012 // Format conversion and convert from SOA to AOS, and store the rows.
1013 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1014
1015 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1016 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1017 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1018 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1019 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1020 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1021 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1022 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1023 pSrc += SRC_COLUMN_BYTES;
1024 }
1025
1026 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
1027 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
1028 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
1029 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
1030 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
1031 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
1032 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
1033 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
1034 }
1035 }
1036 };
1037
1038 //////////////////////////////////////////////////////////////////////////
1039 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1040 //////////////////////////////////////////////////////////////////////////
1041 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1042 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1043 {
1044 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1045
1046 //////////////////////////////////////////////////////////////////////////
1047 /// @brief Stores an 8x8 raster tile to the destination surface.
1048 /// @param pSrc - Pointer to raster tile.
1049 /// @param pDstSurface - Destination surface state
1050 /// @param x, y - Coordinates to raster tile.
1051 INLINE static void Store(
1052 uint8_t *pSrc,
1053 SWR_SURFACE_STATE* pDstSurface,
1054 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1055 {
1056 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1057
1058 // Punt non-full tiles to generic store
1059 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1060 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1061 if (x + KNOB_TILE_X_DIM > lodWidth ||
1062 y + KNOB_TILE_Y_DIM > lodHeight)
1063 {
1064 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1065 }
1066
1067 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1068 // We can compute the offsets to each column within the raster tile once and increment from these.
1069 // There will be 2 x 4-wide columns in an 8x8 raster tile.
1070 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1071 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1072
1073 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1074 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1075
1076 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1077 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1078 {
1079 uint32_t rowOffset = row * DestRowWidthBytes;
1080
1081 uint8_t* pRow = pCol0 + rowOffset;
1082 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1083
1084 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1085 pSrc += pSrcInc;
1086
1087 ppDsts[0] += DestRowWidthBytes / 4;
1088 ppDsts[1] += DestRowWidthBytes / 4;
1089
1090 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1091 pSrc += pSrcInc;
1092 }
1093 }
1094 };
1095
1096 //////////////////////////////////////////////////////////////////////////
1097 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1098 //////////////////////////////////////////////////////////////////////////
1099 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1100 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1101 {
1102 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1103
1104 //////////////////////////////////////////////////////////////////////////
1105 /// @brief Stores an 8x8 raster tile to the destination surface.
1106 /// @param pSrc - Pointer to raster tile.
1107 /// @param pDstSurface - Destination surface state
1108 /// @param x, y - Coordinates to raster tile.
1109 INLINE static void Store(
1110 uint8_t *pSrc,
1111 SWR_SURFACE_STATE* pDstSurface,
1112 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1113 {
1114 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1115
1116 // Punt non-full tiles to generic store
1117 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1118 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1119 if (x + KNOB_TILE_X_DIM > lodWidth ||
1120 y + KNOB_TILE_Y_DIM > lodHeight)
1121 {
1122 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1123 }
1124
1125 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1126 // We can compute the offsets to each column within the raster tile once and increment from these.
1127 // There will be 2 x 4-wide columns in an 8x8 raster tile.
1128 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1129 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1130
1131 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1132 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1133
1134 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1135 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1136 {
1137 uint32_t rowOffset = row * DestRowWidthBytes;
1138
1139 uint8_t* pRow = pCol0 + rowOffset;
1140 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1141
1142 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1143 pSrc += pSrcInc;
1144
1145 ppDsts[0] += DestRowWidthBytes / 2;
1146 ppDsts[1] += DestRowWidthBytes / 2;
1147
1148 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1149 pSrc += pSrcInc;
1150 }
1151 }
1152 };
1153
1154 //////////////////////////////////////////////////////////////////////////
1155 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1156 //////////////////////////////////////////////////////////////////////////
1157 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1158 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1159 {
1160 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1161
1162 //////////////////////////////////////////////////////////////////////////
1163 /// @brief Stores an 8x8 raster tile to the destination surface.
1164 /// @param pSrc - Pointer to raster tile.
1165 /// @param pDstSurface - Destination surface state
1166 /// @param x, y - Coordinates to raster tile.
1167 INLINE static void Store(
1168 uint8_t *pSrc,
1169 SWR_SURFACE_STATE* pDstSurface,
1170 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1171 {
1172 static const uint32_t DestRowWidthBytes = 512; // 512B rows
1173
1174 // Punt non-full tiles to generic store
1175 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1176 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1177 if (x + KNOB_TILE_X_DIM > lodWidth ||
1178 y + KNOB_TILE_Y_DIM > lodHeight)
1179 {
1180 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1181 }
1182
1183 // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1184 // We can compute the offsets to each column within the raster tile once and increment from these.
1185 uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1186 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1187 uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
1188
1189 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1190 {
1191 for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
1192 {
1193 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
1194
1195 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
1196 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1197
1198 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1199 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1200 }
1201
1202 pRow0 += (DestRowWidthBytes * 2);
1203 pRow1 += (DestRowWidthBytes * 2);
1204 }
1205 }
1206 };
1207
1208 //////////////////////////////////////////////////////////////////////////
1209 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1210 //////////////////////////////////////////////////////////////////////////
1211 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1212 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1213 {
1214 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1215
1216 //////////////////////////////////////////////////////////////////////////
1217 /// @brief Stores an 8x8 raster tile to the destination surface.
1218 /// @param pSrc - Pointer to raster tile.
1219 /// @param pDstSurface - Destination surface state
1220 /// @param x, y - Coordinates to raster tile.
1221 INLINE static void Store(
1222 uint8_t *pSrc,
1223 SWR_SURFACE_STATE* pDstSurface,
1224 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1225 {
1226 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1227 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1228
1229 // Punt non-full tiles to generic store
1230 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1231 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1232 if (x + KNOB_TILE_X_DIM > lodWidth ||
1233 y + KNOB_TILE_Y_DIM > lodHeight)
1234 {
1235 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1236 }
1237
1238 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1239 // We can compute the offsets to each column within the raster tile once and increment from these.
1240 // There will be 2 x 4-wide columns in an 8x8 raster tile.
1241 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1242 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1243
1244 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1245 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1246
1247 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1248 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1249 {
1250 uint32_t rowOffset = row * DestRowWidthBytes;
1251
1252 uint8_t* pRow = pCol0 + rowOffset;
1253 uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1254
1255 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1256 pSrc += pSrcInc;
1257
1258 ppDsts[0] += DestColumnBytes;
1259 ppDsts[1] += DestColumnBytes;
1260
1261 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1262 pSrc += pSrcInc;
1263 }
1264 }
1265 };
1266
1267 //////////////////////////////////////////////////////////////////////////
1268 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
1269 //////////////////////////////////////////////////////////////////////////
1270 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1271 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
1272 {
1273 typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
1274
1275 //////////////////////////////////////////////////////////////////////////
1276 /// @brief Stores an 8x8 raster tile to the destination surface.
1277 /// @param pSrc - Pointer to raster tile.
1278 /// @param pDstSurface - Destination surface state
1279 /// @param x, y - Coordinates to raster tile.
1280 INLINE static void Store(
1281 uint8_t *pSrc,
1282 SWR_SURFACE_STATE* pDstSurface,
1283 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1284 {
1285 static const uint32_t DestRowWidthBytes = 16; // 16B rows
1286 static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
1287
1288 // Punt non-full tiles to generic store
1289 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1290 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1291 if (x + KNOB_TILE_X_DIM > lodWidth ||
1292 y + KNOB_TILE_Y_DIM > lodHeight)
1293 {
1294 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1295 }
1296
1297 // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1298 // We can compute the offsets to each column within the raster tile once and increment from these.
1299 // There will be 2 x 4-wide columns in an 8x8 raster tile.
1300 uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1301 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1302 uint8_t* pCol1 = pCol0 + DestColumnBytes;
1303
1304 // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
1305 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1306 uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1307
1308 // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1309 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1310 {
1311 uint32_t rowOffset = row * DestRowWidthBytes;
1312 uint8_t* ppDsts[] =
1313 {
1314 pCol0 + rowOffset,
1315 pCol0 + rowOffset + DestRowWidthBytes,
1316 pCol1 + rowOffset,
1317 pCol1 + rowOffset + DestRowWidthBytes,
1318 };
1319
1320 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1321 pSrc += pSrcInc;
1322
1323 ppDsts[0] += DestColumnBytes * 2;
1324 ppDsts[1] += DestColumnBytes * 2;
1325 ppDsts[2] += DestColumnBytes * 2;
1326 ppDsts[3] += DestColumnBytes * 2;
1327
1328 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1329 pSrc += pSrcInc;
1330 }
1331 }
1332 };
1333
1334 //////////////////////////////////////////////////////////////////////////
1335 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
1336 //////////////////////////////////////////////////////////////////////////
1337 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1338 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
1339 {
1340 typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1341
1342 static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
1343 static const size_t TILE_Y_ROWS = 32;
1344 static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
1345
1346 static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1347 static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1348 static const size_t MAX_DST_COLUMN_BYTES = 16;
1349
1350 static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1351 static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
1352
1353 //////////////////////////////////////////////////////////////////////////
1354 /// @brief Stores an 8x8 raster tile to the destination surface.
1355 /// @param pSrc - Pointer to raster tile.
1356 /// @param pDstSurface - Destination surface state
1357 /// @param x, y - Coordinates to raster tile.
1358 INLINE static void Store(
1359 uint8_t *pSrc,
1360 SWR_SURFACE_STATE* pDstSurface,
1361 uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1362 {
1363 // Punt non-full tiles to generic store
1364 uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1365 uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1366 if (x + KNOB_TILE_X_DIM > lodWidth ||
1367 y + KNOB_TILE_Y_DIM > lodHeight)
1368 {
1369 return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1370 }
1371
1372 uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1373 pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1374 struct DstPtrs
1375 {
1376 uint8_t* ppDsts[8];
1377 } ptrs;
1378
1379 // Need 8 pointers, 4 columns of 2 rows each
1380 for (uint32_t y = 0; y < 2; ++y)
1381 {
1382 for (uint32_t x = 0; x < 4; ++x)
1383 {
1384 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
1385 }
1386 }
1387
1388 for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1389 {
1390 DstPtrs startPtrs = ptrs;
1391
1392 for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1393 {
1394 // Format conversion and convert from SOA to AOS, and store the rows.
1395 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1396
1397 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1398 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1399 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1400 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1401 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1402 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1403 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1404 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1405 pSrc += SRC_COLUMN_BYTES;
1406 }
1407
1408 ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
1409 ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
1410 ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
1411 ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
1412 ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
1413 ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
1414 ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
1415 ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
1416 }
1417 }
1418 };
1419
1420 //////////////////////////////////////////////////////////////////////////
1421 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
1422 //////////////////////////////////////////////////////////////////////////
1423 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1424 struct StoreMacroTile
1425 {
1426 //////////////////////////////////////////////////////////////////////////
1427 /// @brief Stores a macrotile to the destination surface using safe implementation.
1428 /// @param pSrc - Pointer to macro tile.
1429 /// @param pDstSurface - Destination surface state
1430 /// @param x, y - Coordinates to macro tile
1431 static void StoreGeneric(
1432 uint8_t *pSrcHotTile,
1433 SWR_SURFACE_STATE* pDstSurface,
1434 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1435 {
1436 PFN_STORE_TILES_INTERNAL pfnStore;
1437 pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
1438
1439 // Store each raster tile from the hot tile to the destination surface.
1440 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1441 {
1442 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1443 {
1444 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1445 {
1446 pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
1447 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1448 }
1449 }
1450 }
1451
1452 }
1453
1454 typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
1455 //////////////////////////////////////////////////////////////////////////
1456 /// @brief Stores a macrotile to the destination surface.
1457 /// @param pSrc - Pointer to macro tile.
1458 /// @param pDstSurface - Destination surface state
1459 /// @param x, y - Coordinates to macro tile
1460 static void Store(
1461 uint8_t *pSrcHotTile,
1462 SWR_SURFACE_STATE* pDstSurface,
1463 uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1464 {
1465 PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
1466
1467 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1468 {
1469 size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
1470 0,
1471 0,
1472 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
1473 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
1474 sampleNum,
1475 pDstSurface->lod,
1476 pDstSurface);
1477
1478 // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
1479 bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
1480 (pDstSurface->bInterleavedSamples);
1481
1482 pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
1483 }
1484
1485 // Store each raster tile from the hot tile to the destination surface.
1486 for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1487 {
1488 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1489 {
1490 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1491 {
1492 pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
1493 pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1494 }
1495 }
1496 }
1497 }
1498 };
1499
1500 //////////////////////////////////////////////////////////////////////////
1501 /// InitStoreTilesTable - Helper for setting up the tables.
1502 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
1503 void InitStoreTilesTableColor_Half1(
1504 PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
1505 {
1506 table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
1507 table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
1508 table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
1509 table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
1510 table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
1511 table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
1512 table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
1513 table[TTileMode][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
1514 table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
1515 table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
1516 table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
1517 table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
1518 table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
1519 table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
1520 table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
1521 table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
1522 table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
1523 table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
1524 table[TTileMode][R32G32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
1525 table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
1526 table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
1527 table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
1528 table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
1529 table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
1530 table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
1531 table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
1532 table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
1533 table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
1534 table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
1535 table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
1536 table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
1537 table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
1538 table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
1539 table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
1540 table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
1541 table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
1542 table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
1543 table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
1544 table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
1545 table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
1546 table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
1547 table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
1548 table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
1549 table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
1550 table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
1551 table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
1552 table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
1553 table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
1554 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
1555 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
1556 table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
1557 table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
1558 table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
1559 table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
1560 table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
1561 table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
1562 }
1563
1564 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
1565 void InitStoreTilesTableColor_Half2(
1566 PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
1567 {
1568 table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
1569 table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
1570 table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
1571 table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
1572 table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
1573 table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
1574 table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
1575 table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
1576 table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
1577 table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
1578 table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
1579 table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
1580 table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
1581 table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
1582 table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
1583 table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
1584 table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
1585 table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
1586 table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
1587 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
1588 table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
1589 table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
1590 table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
1591 table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
1592 table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
1593 table[TTileMode][A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
1594 table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
1595 table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
1596 table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
1597 table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
1598 table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
1599 table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
1600 table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
1601 table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
1602 table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
1603 table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
1604 table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
1605 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
1606 table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
1607 table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
1608 table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
1609 table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
1610 table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
1611 table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
1612 table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
1613 table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
1614 table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
1615 table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
1616 table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
1617 table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
1618 table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
1619 table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
1620 table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
1621 table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
1622 table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
1623 table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
1624 table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
1625 table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
1626 table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
1627 table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
1628 table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
1629 table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
1630 table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
1631 table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
1632 }
1633
1634 //////////////////////////////////////////////////////////////////////////
1635 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
1636 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
1637 void InitStoreTilesTableDepth(
1638 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
1639 {
1640 table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
1641 table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
1642 table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
1643 }
1644
1645 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
1646 void InitStoreTilesTableStencil(
1647 PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
1648 {
1649 table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
1650 }