#if KNOB_SIMD_WIDTH == 8
INLINE
-void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
+void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
{
__m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
__m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
}
INLINE
-void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
+void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
{
__m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
__m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
- __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg
+ simdscalari r = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg
- __m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
+ simdscalari g = _simd_permute2f128_si(r, r, 1); // ggggggggggggggggxxxxxxxxxxxxxxxx
- __m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx
+ r = _simd_insertf128_si(r, _mm_srli_si128(_simd_extractf128_si(r, 0), 8), 1); // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
- __m256i dst = _mm256_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
+ g = _simd_insertf128_si(g, _mm_srli_si128(_simd_extractf128_si(g, 0), 8), 1); // ggggggggxxxxxxxxggggggggxxxxxxxx
- _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
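+ // the 256-bit unpacklo interleaves only the low 8 bytes of each 128-bit lane, which is why r8..rF and g8..gF were shifted into the low halves of the upper lanes above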
+ simdscalari dst = _simd_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
+
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
}
#endif
};
vTranspose4x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0), _simd16_extract_ps(src3, 0));
-#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
-#else
- _mm_store_ps(reinterpret_cast<float *>(pDst), vDst[0]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 4, vDst[1]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 8, vDst[2]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 12, vDst[3]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 16, vDst[4]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 20, vDst[5]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 24, vDst[6]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 28, vDst[7]);
-#endif
vTranspose4x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1), _simd16_extract_ps(src3, 1));
-#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[0]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[1]);
-#else
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 32, vDst[0]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 36, vDst[1]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 40, vDst[2]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 44, vDst[3]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 48, vDst[4]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 52, vDst[5]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 56, vDst[6]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 60, vDst[7]);
-#endif
}
#endif
};
vTranspose3x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0));
-#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
-#else
- _mm_store_ps(reinterpret_cast<float *>(pDst), vDst[0]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 4, vDst[1]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 8, vDst[2]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 12, vDst[3]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 16, vDst[4]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 20, vDst[5]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 24, vDst[6]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 28, vDst[7]);
-#endif
vTranspose3x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1));
-#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[0]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[1]);
-#else
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 32, vDst[0]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 36, vDst[1]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 40, vDst[2]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 44, vDst[3]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 48, vDst[4]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 52, vDst[5]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 56, vDst[6]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 60, vDst[7]);
-#endif
}
#endif
};
INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
- const float *pfSrc = reinterpret_cast<const float *>(pSrc);
-
- __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
- __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
- __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
- __m256 src_g1 = _mm256_load_ps(pfSrc + 24);
-
- __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
- __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
- __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
- __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);
-
- float *pfDst = reinterpret_cast<float *>(pDst);
-
- _mm256_store_ps(pfDst + 0, dst0);
- _mm256_store_ps(pfDst + 8, dst1);
- _mm256_store_ps(pfDst + 16, dst2);
- _mm256_store_ps(pfDst + 24, dst3);
+ simdscalar src_r0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
+ simdscalar src_r1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 8);
+ simdscalar src_g0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
+ simdscalar src_g1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 24);
+
+ simdscalar dst0 = _simd_unpacklo_ps(src_r0, src_g0); // r0 g0 r1 g1 r4 g4 r5 g5
+ simdscalar dst1 = _simd_unpackhi_ps(src_r0, src_g0); // r2 g2 r3 g3 r6 g6 r7 g7
+ simdscalar dst2 = _simd_unpacklo_ps(src_r1, src_g1); // r8 g8 r9 g9 rC gC rD gD
+ simdscalar dst3 = _simd_unpackhi_ps(src_r1, src_g1); // rA gA rB gB rE gE rF gF
+
+ _simd_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0);
+ _simd_store_ps(reinterpret_cast<float *>(pDst) + 8, dst1);
+ _simd_store_ps(reinterpret_cast<float *>(pDst) + 16, dst2);
+ _simd_store_ps(reinterpret_cast<float *>(pDst) + 24, dst3);
}
#endif
};
simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
simd16scalari src_ba = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc + sizeof(simd16scalari)));
- __m256i src_r = _simd16_extract_si(src_rg, 0);
- __m256i src_g = _simd16_extract_si(src_rg, 1);
- __m256i src_b = _simd16_extract_si(src_ba, 0);
- __m256i src_a = _simd16_extract_si(src_ba, 1);
-
- __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
- __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
- __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
- __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
-
- __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
- __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
- __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
- __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
-
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
+ simdscalari src_r = _simd16_extract_si(src_rg, 0);
+ simdscalari src_g = _simd16_extract_si(src_rg, 1);
+ simdscalari src_b = _simd16_extract_si(src_ba, 0);
+ simdscalari src_a = _simd16_extract_si(src_ba, 1);
+
+ simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
+ simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
+ simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
+ simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);
+
+ simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
+ simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
+ simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
+ simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);
+
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
}
#endif
};
{
simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
- __m256i src_r = _simd16_extract_si(src_rg, 0);
- __m256i src_g = _simd16_extract_si(src_rg, 1);
- __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc + sizeof(simd16scalari)));
- __m256i src_a = _mm256_undefined_si256();
-
- __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
- __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
- __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
- __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
-
- __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
- __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
- __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
- __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
-
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
+ simdscalari src_r = _simd16_extract_si(src_rg, 0);
+ simdscalari src_g = _simd16_extract_si(src_rg, 1);
+ simdscalari src_b = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc + sizeof(simd16scalari)));
+ simdscalari src_a = _mm256_undefined_si256();
+
+ simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
+ simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
+ simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
+ simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);
+
+ simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
+ simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
+ simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
+ simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);
+
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
}
#endif
};
{
simd16scalari result = _simd16_setzero_si();
- simd16scalari src = _simd16_castps_si(_simd16_load_ps(reinterpret_cast<const float *>(pSrc)));
+ simd16scalari src = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
simdscalari srclo = _simd16_extract_si(src, 0);
simdscalari srchi = _simd16_extract_si(src, 1);
- result = _simd16_insert_si(result, _mm256_unpacklo_epi16(srclo, srchi), 0);
- result = _simd16_insert_si(result, _mm256_unpackhi_epi16(srclo, srchi), 1);
+ result = _simd16_insert_si(result, _simd_unpacklo_epi16(srclo, srchi), 0);
+ result = _simd16_insert_si(result, _simd_unpackhi_epi16(srclo, srchi), 1);
_simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), result);
}
#include "core/format_conversion.h"
#include "memory/TilingFunctions.h"
-#include "memory/tilingtraits.h"
#include "memory/Convert.h"
#include "core/multisample.h"
}
};
+#if USE_8x2_TILE_BACKEND
+template <>
+struct StorePixels<8, 4>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
+ {
+ // 8 x 2 bytes = 16 bytes, 16 pixels
+ const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
+
+ uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
+
+ // Unswizzle from SWR-Z order
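+ // the hot tile stores this 8x2 block quad by quad ({0 1 / 2 3}, {4 5 / 6 7}, ...), so even uint16 pairs belong to row 0 and odd pairs to row 1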
+ ppDsts16[0][0] = pSrc16[0]; // 0 1
+ ppDsts16[0][1] = pSrc16[2]; // 4 5
+
+ ppDsts16[1][0] = pSrc16[1]; // 2 3
+ ppDsts16[1][1] = pSrc16[3]; // 6 7
+
+ ppDsts16[2][0] = pSrc16[4]; // 8 9
+ ppDsts16[2][1] = pSrc16[6]; // C D
+
+ ppDsts16[3][0] = pSrc16[5]; // A B
+ ppDsts16[3][1] = pSrc16[7]; // E F
+ }
+};
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
}
};
+#if USE_8x2_TILE_BACKEND
+template <>
+struct StorePixels<16, 4>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
+ {
+ // 8 x 4 bytes = 32 bytes, 16 pixels
+ const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
+
+ uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
+
+ // Unswizzle from SWR-Z order
+ ppDsts32[0][0] = pSrc32[0]; // 0 1
+ ppDsts32[0][1] = pSrc32[2]; // 4 5
+
+ ppDsts32[1][0] = pSrc32[1]; // 2 3
+ ppDsts32[1][1] = pSrc32[3]; // 6 7
+
+ ppDsts32[2][0] = pSrc32[4]; // 8 9
+ ppDsts32[2][1] = pSrc32[6]; // C D
+
+ ppDsts32[3][0] = pSrc32[5]; // A B
+ ppDsts32[3][1] = pSrc32[7]; // E F
+ }
+};
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
{
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
{
- __m128i quad0 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[0]);
- __m128i quad1 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[1]);
- __m128i quad2 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[2]);
- __m128i quad3 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[3]);
-
- _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[0]), _mm_unpacklo_epi64(quad0, quad1));
- _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[1]), _mm_unpackhi_epi64(quad0, quad1));
- _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[2]), _mm_unpacklo_epi64(quad2, quad3));
- _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[3]), _mm_unpackhi_epi64(quad2, quad3));
+ // 4 x 16 bytes = 64 bytes, 16 pixels
+ const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+
+ __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+
+ // Unswizzle from SWR-Z order
+ __m128i quad0 = _mm_load_si128(&pSrc128[0]); // 0 1 2 3
+ __m128i quad1 = _mm_load_si128(&pSrc128[1]); // 4 5 6 7
+ __m128i quad2 = _mm_load_si128(&pSrc128[2]); // 8 9 A B
+ __m128i quad3 = _mm_load_si128(&pSrc128[3]); // C D E F
+
+ _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1)); // 0 1 4 5
+ _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1)); // 2 3 6 7
+ _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3)); // 8 9 C D
+ _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3)); // A B E F
}
};
}
};
+#if USE_8x2_TILE_BACKEND
+template <>
+struct StorePixels<64, 8>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
+ {
+ // 8 x 16 bytes = 128 bytes, 16 pixels
+ const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+
+ __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+
+ // order of pointers matches SWR-Z layout
+ *ppDsts128[0] = pSrc128[0]; // 0 1
+ *ppDsts128[1] = pSrc128[1]; // 2 3
+ *ppDsts128[2] = pSrc128[2]; // 4 5
+ *ppDsts128[3] = pSrc128[3]; // 6 7
+ *ppDsts128[4] = pSrc128[4]; // 8 9
+ *ppDsts128[5] = pSrc128[5]; // A B
+ *ppDsts128[6] = pSrc128[6]; // C D
+ *ppDsts128[7] = pSrc128[7]; // E F
+ }
+};
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
}
};
+#if USE_8x2_TILE_BACKEND
+template <>
+struct StorePixels<128, 16>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
+ {
+ // 16 x 16 bytes = 256 bytes, 16 pixels
+ const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+
+ __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+
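+ // each __m128i is one whole pixel here; the source lays out each quad as top row then bottom row, while the destination pointers pair row 0 / row 1 per column, hence the middle two of every four swap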
+ for (uint32_t i = 0; i < 16; i += 4)
+ {
+ *ppDsts128[i + 0] = pSrc128[i + 0];
+ *ppDsts128[i + 1] = pSrc128[i + 2];
+ *ppDsts128[i + 2] = pSrc128[i + 1];
+ *ppDsts128[i + 3] = pSrc128[i + 3];
+ }
+ }
+};
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
//////////////////////////////////////////////////////////////////////////
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
+ static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
+
+ static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
+
+ OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
+
+ // Load hot-tile
+ simd16vector src, dst;
+ LoadSOA<SrcFormat>(pSrc, src);
+
+ // deswizzle
+ dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
+ dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
+ dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
+
+ // clamp
+ dst.x = Clamp<DstFormat>(dst.x, 0);
+ dst.y = Clamp<DstFormat>(dst.y, 1);
+ dst.z = Clamp<DstFormat>(dst.z, 2);
+
+ // normalize
+ dst.x = Normalize<DstFormat>(dst.x, 0);
+ dst.y = Normalize<DstFormat>(dst.y, 1);
+ dst.z = Normalize<DstFormat>(dst.z, 2);
+
+ // pack
+ simd16scalari packed = _simd16_castps_si(dst.x);
+
+ SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
+ SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
+
+ packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
+ packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
+
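+ // each 32-bit lane now holds one packed 5-6-5 pixel in its low 16 bits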
+ // copy the low 16 bits of each 32-bit lane into the 16-bit AOS tile
+ uint32_t *pPacked = (uint32_t*)&packed;
+ uint16_t *pAosTile = (uint16_t*)&aosTile[0];
+ for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
+ {
+ *pAosTile++ = *pPacked++;
+ }
+
+#else
static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
*pAosTile++ = *pPacked++;
}
+#endif
// Store data into destination
StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
}
// Store data into destination but don't overwrite the X8 bits
// Each 4-pixel row is 16-bytes
-#if 1
+
simdscalari loadlo = _simd_load_si(reinterpret_cast<simdscalari *>(aosTile));
simdscalari loadhi = _simd_load_si(reinterpret_cast<simdscalari *>(aosTile + sizeof(simdscalari)));
simdscalari templo = _simd_unpacklo_epi64(loadlo, loadhi);
simdscalari temphi = _simd_unpackhi_epi64(loadlo, loadhi);
- simdscalari destlo = _mm256_loadu2_m128i(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
- simdscalari desthi = _mm256_loadu2_m128i(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
+ simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
+ simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
- simdscalari mask = _simd_set1_epi32(0xFFFFFF);
+ simdscalari mask = _simd_set1_epi32(0x00FFFFFF);
destlo = _simd_or_si(_simd_andnot_si(mask, destlo), _simd_and_si(mask, templo));
desthi = _simd_or_si(_simd_andnot_si(mask, desthi), _simd_and_si(mask, temphi));
- _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), destlo);
- _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), desthi);
-#else
- __m128i *pZRow01 = (__m128i*)aosTile;
- __m128i vQuad00 = _mm_load_si128(pZRow01);
- __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
- __m128i vQuad02 = _mm_load_si128(pZRow01 + 2);
- __m128i vQuad03 = _mm_load_si128(pZRow01 + 3);
-
- __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
- __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
- __m128i vRow20 = _mm_unpacklo_epi64(vQuad02, vQuad03);
- __m128i vRow30 = _mm_unpackhi_epi64(vQuad02, vQuad03);
-
- __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
- __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
- __m128i vDst2 = _mm_loadu_si128((const __m128i*)ppDsts[2]);
- __m128i vDst3 = _mm_loadu_si128((const __m128i*)ppDsts[3]);
-
- __m128i vMask = _mm_set1_epi32(0xFFFFFF);
-
- vDst0 = _mm_andnot_si128(vMask, vDst0);
- vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
- vDst1 = _mm_andnot_si128(vMask, vDst1);
- vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
- vDst2 = _mm_andnot_si128(vMask, vDst2);
- vDst2 = _mm_or_si128(vDst2, _mm_and_si128(vRow20, vMask));
- vDst3 = _mm_andnot_si128(vMask, vDst3);
- vDst3 = _mm_or_si128(vDst3, _mm_and_si128(vRow10, vMask));
-
- _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
- _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
- _mm_storeu_si128((__m128i*)ppDsts[2], vDst2);
- _mm_storeu_si128((__m128i*)ppDsts[3], vDst3);
-#endif
+ _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), destlo);
+ _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), desthi);
#else
static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
- // clamp
+ // clamp
const simd16scalar zero = _simd16_setzero_ps();
const simd16scalar ones = _simd16_set1_ps(1.0f);
comp3 = _simd16_max_ps(comp3, zero);
comp3 = _simd16_min_ps(comp3, ones);
+ // gamma-correct only rgb
if (FormatTraits<DstFormat>::isSRGB)
{
- // Gamma-correct only rgb
comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
}
- // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
+ // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
-#if 1
- // SOA to AOS conversion
+ // SOA to AOS conversion
src1 = _simd16_slli_epi32(src1, 8);
src2 = _simd16_slli_epi32(src2, 16);
src3 = _simd16_slli_epi32(src3, 24);
simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
- // de-swizzle conversion
+ // de-swizzle conversion
#if 1
simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F

final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F

#else
final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
#endif
-#endif
-#if KNOB_ARCH == KNOB_ARCH_AVX
-
- // splitting into two sets of 4 wide integer vector types
- // because AVX doesn't have instructions to support this operation at 8 wide
-#if 0
- __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
- __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
- __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
- __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
-
- __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
- __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
- __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
- __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
-
- srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
- srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
- srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
- srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
- srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
- srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
-
- srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
- srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
-
- srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
- srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
-
- srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
- srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
-
- // unpack into rows that get the tiling order correct
- __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
- __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
-
- __m256i final = _mm256_castsi128_si256(vRow00);
- final = _mm256_insertf128_si256(final, vRow10, 1);
-
-#else
-#if 0
- simd16scalari final = _simd16_setzero_si();
-
-#endif
-#endif
-#elif KNOB_ARCH >= KNOB_ARCH_AVX2
- // logic is as above, only wider
-#if 0
- src1 = _simd16_slli_epi32(src1, 8);
- src2 = _simd16_slli_epi32(src2, 16);
- src3 = _simd16_slli_epi32(src3, 24);
-
- simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3));
-
- final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
-
-#endif
-#endif
- _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
- _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
+ // store 8x2 memory order:
+ // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
+ // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
+ _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
+ _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
}
#endif
#endif
#endif
- _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
+ _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
+}
+
+#if USE_8x2_TILE_BACKEND
+template<SWR_FORMAT DstFormat>
+INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
+{
+ // swizzle rgba -> bgra while we load
+ simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
+ simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
+ simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
+
+ // clamp
+ const simd16scalar zero = _simd16_setzero_ps();
+ const simd16scalar ones = _simd16_set1_ps(1.0f);
+
+ comp0 = _simd16_max_ps(comp0, zero);
+ comp0 = _simd16_min_ps(comp0, ones);
+
+ comp1 = _simd16_max_ps(comp1, zero);
+ comp1 = _simd16_min_ps(comp1, ones);
+
+ comp2 = _simd16_max_ps(comp2, zero);
+ comp2 = _simd16_min_ps(comp2, ones);
+
+ // gamma-correct only rgb
+ if (FormatTraits<DstFormat>::isSRGB)
+ {
+ comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
+ comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
+ comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
+ }
+
+ // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
+ comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
+ comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
+ comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
+
+ // moving to 16 wide integer vector types
+ simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
+ simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
+ simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
+
+ // SOA to AOS conversion
+ src1 = _simd16_slli_epi32(src1, 8);
+ src2 = _simd16_slli_epi32(src2, 16);
+
+ simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
+
+ // de-swizzle conversion
+#if 1
+ simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
+ simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
+
+ final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
+
+#else
+ final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
+
+#endif
+ // store 8x2 memory order:
+ // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
+ // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
+ _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
+ _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
}
+#endif
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
{
#endif
- _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
+ _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
}
template<>
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
#if USE_8x2_TILE_BACKEND
typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
- SimdT* pSrcSimdTiles = (SimdT*)pSrc;
+ SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
// Compute which simd tile we're accessing within 8x8 tile.
// i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
- SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
+ SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
- uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
+#if USE_8x2_TILE_BACKEND
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
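+ // dx steps one simd16 tile to the right; dy drops to the next pair of rows and rewinds the raster-tile width the inner loop advanced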
+
+ uint8_t* ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + dx / 2, // row 0, col 1
+ pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
+ };
+
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dx;
+ ppDsts[1] += dx;
+ ppDsts[2] += dx;
+ ppDsts[3] += dx;
+ }
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
+ }
+#else
+ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
+
+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
{
uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
}
+#endif
}
};
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+#if USE_8x2_TILE_BACKEND
+
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+
+ uint8_t* ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + dx / 2, // row 0, col 1
+ pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
+ };
+
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dx;
+ ppDsts[1] += dx;
+ ppDsts[2] += dx;
+ ppDsts[3] += dx;
+ }
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
+ }
+#else
uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
}
+#endif
}
};
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
#if USE_8x2_TILE_BACKEND
- uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch, pDst + (SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL) / 2, pDst + pDstSurface->pitch + (SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL) / 2 };
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM; ++row)
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+
+ uint8_t* ppDsts[] =
{
- uint8_t* ppStartRows[] = { ppRows[0], ppRows[1], ppRows[2], ppRows[3] };
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + dx / 2, // row 0, col 1
+ pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
+ };
- for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM; ++col)
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
{
- // Format conversion and convert from SOA to AOS, and store the rows.
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
-
- ppRows[0] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
- ppRows[1] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
- ppRows[2] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
- ppRows[3] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dx;
+ ppDsts[1] += dx;
+ ppDsts[2] += dx;
+ ppDsts[3] += dx;
}
- ppRows[0] = ppStartRows[0] + SIMD16_TILE_Y_DIM * pDstSurface->pitch;
- ppRows[1] = ppStartRows[1] + SIMD16_TILE_Y_DIM * pDstSurface->pitch;
- ppRows[2] = ppStartRows[2] + SIMD16_TILE_Y_DIM * pDstSurface->pitch;
- ppRows[3] = ppStartRows[3] + SIMD16_TILE_Y_DIM * pDstSurface->pitch;
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
}
#else
uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
//////////////////////////////////////////////////////////////////////////
/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
//////////////////////////////////////////////////////////////////////////
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
- static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
static const size_t MAX_DST_COLUMN_BYTES = 16;
+#if !USE_8x2_TILE_BACKEND
static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+#if USE_8x2_TILE_BACKEND
+
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
+
+ // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
+ static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
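+ // a simd16 tile row is 8 pixels x 8 bytes = 64B here, i.e. four 16B spans per row and eight per row pair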
+
+#if 1
+ uint8_t *ppDsts[8];
+
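+ // ppDsts[2 * col + row]: two rows per 16B destination column, columns left to right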
+ {
+ for (uint32_t y = 0; y < 2; y += 1)
+ {
+ for (uint32_t x = 0; x < 4; x += 1)
+ {
+ ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
+ }
+ }
+ }
+
+#else
+ uint8_t *ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
+ pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
+ pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
+ };
+
+#endif
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+ }
+#else
uint8_t* ppDsts[] =
{
pDst, // row 0, col 0
ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
}
+#endif
}
};
struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
- static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
static const size_t MAX_DST_COLUMN_BYTES = 16;
+#if !USE_8x2_TILE_BACKEND
static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+#if USE_8x2_TILE_BACKEND
+
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * pDstSurface->pitch; // double up on tile y dim, one simd16 tile will do twice the rows
+
+ // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
+ static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
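+ // a simd16 tile row is 8 pixels x 16 bytes = 128B here, i.e. eight 16B spans per row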
+
+#if 1
+ uint8_t *ppDsts[16];
+
+ {
+ for (uint32_t y = 0; y < 2; y += 1)
+ {
+ for (uint32_t x = 0; x < 4; x += 1)
+ {
+ ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
+ ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
+ }
+ }
+ }
+
+#else
+ uint8_t* ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
+ pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
+ pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
+
+ pDst + pDstSurface->pitch * 2, // row 2, col 0
+ pDst + pDstSurface->pitch * 3, // row 3, col 0
+ pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES, // row 2, col 1
+ pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES, // row 3, col 1
+ pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 2, // row 2, col 2
+ pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 2, // row 3, col 2
+ pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 3, // row 2, col 3
+ pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 3 // row 3, col 3
+ };
+
+#endif
+#if 1
+ // Raster tile height is quadruple simd16 tile height
+ static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim");
+
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
+ // tile rows 0 thru 3
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+
+ // tile rows 4 thru 7
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+#else
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2)
+ {
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
+ // Format conversion, convert from SOA to AOS, and store
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+ }
+
+#endif
+#else
struct DstPtrs
{
uint8_t* ppDsts[8];
ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
}
+#endif
}
};
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
// TileY is a column-major tiling mode where each 4KB tile consists of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
// There will be 2 x 4-wide columns in an 8x8 raster tile.
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
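+ // one simd16 tile spans the full 8-pixel raster-tile width (8 bytes at 8bpp), so we only step down two TileY rows per iteration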
+
+ uint8_t *ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + DestRowWidthBytes, // row 1, col 0
+ pDst + DestRowWidthBytes / 4, // row 0, col 1
+ pDst + DestRowWidthBytes + DestRowWidthBytes / 4 // row 1, col 1
+ };
+
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
+ }
+#else
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += pSrcInc;
}
+#endif
}
};
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
// TileY is a column-major tiling mode where each 4KB tile consists of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
// There will be 2 x 4-wide columns in an 8x8 raster tile.
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
+
+ uint8_t *ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + DestRowWidthBytes, // row 1, col 0
+ pDst + DestRowWidthBytes / 2, // row 0, col 1
+ pDst + DestRowWidthBytes + DestRowWidthBytes / 2 // row 1, col 1
+ };
+
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
+ }
+#else
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += pSrcInc;
}
+#endif
}
};
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
// TileX is a row-major tiling mode where each 4KB tile consists of 8 x 512B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
- uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+
+ uint8_t* ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + DestRowWidthBytes, // row 1, col 0
+ pDst + dx / 2, // row 0, col 1
+ pDst + DestRowWidthBytes + dx / 2 // row 1, col 1
+ };
+
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dx;
+ ppDsts[1] += dx;
+ ppDsts[2] += dx;
+ ppDsts[3] += dx;
+ }
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
+ }
+#else
+ uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
pRow0 += (DestRowWidthBytes * 2);
pRow1 += (DestRowWidthBytes * 2);
}
+#endif
}
};
// TileY is a column-major tiling mode where each 4KB tile consists of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
// There will be 2 x 4-wide columns in an 8x8 raster tile.
- uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-#if USE_8x2_TILE_BACKEND
- // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD16_TILE_Y_DIM)
- {
- uint8_t *pRow = pCol0 + row * DestRowWidthBytes;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
- uint8_t *ppDsts[] = { pRow, pRow + DestRowWidthBytes, pRow + DestColumnBytes, pRow + DestColumnBytes + DestRowWidthBytes };
+ uint8_t *ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + DestRowWidthBytes, // row 1, col 0
+ pDst + DestColumnBytes, // row 0, col 1
+ pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1
+ };
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
}
#else
+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
// Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
// TileY is a column-major tiling mode where each 4KB tile consists of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
// There will be 2 x 4-wide columns in an 8x8 raster tile.
- uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
+
+#if 1
+ // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
+ uint8_t *ppDsts[8];
+
+ {
+ for (uint32_t y = 0; y < 2; y += 1)
+ {
+ for (uint32_t x = 0; x < 4; x += 1)
+ {
+ ppDsts[x * 2 + y] = pDst + y * DestRowWidthBytes + x * DestColumnBytes;
+ }
+ }
+ }
+
+#else
+ uint8_t *ppDsts[] =
+ {
+ pDst,
+ pDst + DestRowWidthBytes,
+ pDst + DestColumnBytes,
+ pDst + DestRowWidthBytes + DestColumnBytes,
+ pDst + DestColumnBytes * 2,
+ pDst + DestRowWidthBytes + DestColumnBytes * 2,
+ pDst + DestColumnBytes * 3,
+ pDst + DestRowWidthBytes + DestColumnBytes * 3
+ };
+
+#endif
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+ }
+#else
+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
uint8_t* pCol1 = pCol0 + DestColumnBytes;
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += pSrcInc;
}
+#endif
}
};
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
-
- static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
- static const size_t TILE_Y_ROWS = 32;
- static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
-
- static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
- static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
- static const size_t MAX_DST_COLUMN_BYTES = 16;
-
- static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
+#if USE_8x2_TILE_BACKEND
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+
+#else
+ static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
+ static const size_t TILE_Y_ROWS = 32;
+ static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
+
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t MAX_DST_COLUMN_BYTES = 16;
+
+ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
/// @param pSrc - Pointer to raster tile.
SWR_SURFACE_STATE* pDstSurface,
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
{
+#if USE_8x2_TILE_BACKEND
+ static const uint32_t DestRowWidthBytes = 16; // 16B rows
+ static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
+#endif
+
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
+ const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * DestRowWidthBytes; // double up on tile y dim, one simd16 tile will do twice the rows
+
+#if 1
+ // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
+ uint8_t *ppDsts[16];
+
+ {
+ for (uint32_t y = 0; y < 2; y += 1)
+ {
+ for (uint32_t x = 0; x < 4; x += 1)
+ {
+ ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * DestRowWidthBytes + x * DestColumnBytes;
+ ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * DestRowWidthBytes + x * DestColumnBytes;
+ }
+ }
+ }
+
+#else
+ uint8_t *ppDsts[] =
+ {
+ pDst,
+ pDst + DestRowWidthBytes,
+ pDst + DestColumnBytes,
+ pDst + DestRowWidthBytes + DestColumnBytes,
+ pDst + DestColumnBytes * 2,
+ pDst + DestRowWidthBytes + DestColumnBytes * 2,
+ pDst + DestColumnBytes * 3,
+ pDst + DestRowWidthBytes + DestColumnBytes * 3,
+
+ pDst + DestRowWidthBytes * 2,
+ pDst + DestRowWidthBytes * 3,
+ pDst + DestRowWidthBytes * 2 + DestColumnBytes,
+ pDst + DestRowWidthBytes * 3 + DestColumnBytes,
+ pDst + DestRowWidthBytes * 2 + DestColumnBytes * 2,
+ pDst + DestRowWidthBytes * 3 + DestColumnBytes * 2,
+ pDst + DestRowWidthBytes * 2 + DestColumnBytes * 3,
+ pDst + DestRowWidthBytes * 3 + DestColumnBytes * 3
+ };
+
+#endif
+#if 1
+ // Raster tile height is quadruple simd16 tile height
+ static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim");
+
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
+ // tile rows 0 thru 3
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+
+ // tile rows 4 thru 7
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+#else
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+ }
+#endif
+#else
+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
struct DstPtrs
{
ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
}
+#endif
}
};
uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
{
PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
-
for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
{
size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(