}
};
-#if USE_8x2_TILE_BACKEND
template <>
struct StorePixels<8, 4>
{
}
};
-#endif
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
}
};
-#if USE_8x2_TILE_BACKEND
template <>
struct StorePixels<16, 4>
{
}
};
-#endif
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
}
};
-#if USE_8x2_TILE_BACKEND
template <>
struct StorePixels<32, 4>
{
}
};
-#endif
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
}
};
-#if USE_8x2_TILE_BACKEND
template <>
struct StorePixels<64, 8>
{
}
};
-#endif
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
}
};
-#if USE_8x2_TILE_BACKEND
template <>
struct StorePixels<128, 16>
{
}
};
-#endif
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
//////////////////////////////////////////////////////////////////////////
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
// Convert from SOA --> AOS
FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
-#else
- static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
-
- OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
- OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
-
- // Convert from SrcFormat --> DstFormat
- simdvector src;
- LoadSOA<SrcFormat>(pSrc, src);
- StoreSOA<DstFormat>(src, soaTile);
-
- // Convert from SOA --> AOS
- FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
-
-#endif
// Store data into destination
StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
}
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
// Convert from SOA --> AOS
FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
-#else
- static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
-
- OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
-
- // Convert from SOA --> AOS
- FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
-
-#endif
// Store data into destination
StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
}
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
*pAosTile++ = *pPacked++;
}
-#else
- static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
- static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
- static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
-
- OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
-
- // Load hot-tile
- simdvector src, dst;
- LoadSOA<SrcFormat>(pSrc, src);
-
- // deswizzle
- dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
- dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
- dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
-
- // clamp
- dst.x = Clamp<DstFormat>(dst.x, 0);
- dst.y = Clamp<DstFormat>(dst.y, 1);
- dst.z = Clamp<DstFormat>(dst.z, 2);
-
- // normalize
- dst.x = Normalize<DstFormat>(dst.x, 0);
- dst.y = Normalize<DstFormat>(dst.y, 1);
- dst.z = Normalize<DstFormat>(dst.z, 2);
-
- // pack
- simdscalari packed = _simd_castps_si(dst.x);
- packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetConstBPC(0)));
- packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetConstBPC(0) +
- FormatTraits<DstFormat>::GetConstBPC(1)));
-
- // pack low 16 bits of each 32 bit lane to low 128 bits of dst
- uint32_t *pPacked = (uint32_t*)&packed;
- uint16_t *pAosTile = (uint16_t*)&aosTile[0];
- for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
- {
- *pAosTile++ = *pPacked++;
- }
-
-#endif
// Store data into destination
StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
}
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
// clamp
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
-#else
- static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
-
- OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
- OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
-
- // Convert from SrcFormat --> DstFormat
- simdvector src;
- LoadSOA<SrcFormat>(pSrc, src);
- StoreSOA<DstFormat>(src, soaTile);
-
- // Convert from SOA --> AOS
- FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
-
- // Store data into destination but don't overwrite the X8 bits
- // Each 4-pixel row is 16-bytes
- simd4scalari *pZRow01 = (simd4scalari*)aosTile;
- simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
- simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
-
- simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
- simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
-
- simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
- simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);
-
- simd4scalari vMask = _mm_set1_epi32(0xFFFFFF);
-
- vDst0 = SIMD128::andnot_si(vMask, vDst0);
- vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
- vDst1 = SIMD128::andnot_si(vMask, vDst1);
- vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));
-
- SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
- SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
-#endif
}
};
-#if USE_8x2_TILE_BACKEND
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
{
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
}
-#endif
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
{
_simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
}
-#if USE_8x2_TILE_BACKEND
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
{
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
}
-#endif
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
{
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-#else
- FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
-#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-#else
- FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
-#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-#else
- FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
-#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-#else
- FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
-#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-#else
- FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
-#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-#else
- FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
-#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-#else
- FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
-#endif
}
};
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
-#if USE_8x2_TILE_BACKEND
FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
-#else
- FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
-#endif
}
};
uint32_t x, uint32_t y,
float outputColor[4])
{
-#if USE_8x2_TILE_BACKEND
typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
pSimdTile->GetSwizzledColor(simdOffset, outputColor);
-#else
- typedef SimdTile<SrcFormat, DstFormat> SimdT;
-
- SimdT* pSrcSimdTiles = (SimdT*)pSrc;
-
- // Compute which simd tile we're accessing within 8x8 tile.
- // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
- uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
-
- SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
-
- uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
-
- pSimdTile->GetSwizzledColor(simdOffset, outputColor);
-#endif
}
//////////////////////////////////////////////////////////////////////////
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-#if USE_8x2_TILE_BACKEND
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
ppDsts[2] += dy;
ppDsts[3] += dy;
}
-#else
- uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
-
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
- {
- uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
-
- for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
- {
- // Format conversion and convert from SOA to AOS, and store the rows.
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
-
- ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
- ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
- pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
- }
-
- ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
- ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
- }
-#endif
}
};
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-#if USE_8x2_TILE_BACKEND
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
ppDsts[2] += dy;
ppDsts[3] += dy;
}
-#else
- uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
-
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
- {
- uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
-
- for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
- {
- // Format conversion and convert from SOA to AOS, and store the rows.
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
-
- ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
- ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
- pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
- }
-
- ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
- ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
- }
-#endif
}
};
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-#if USE_8x2_TILE_BACKEND
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
ppDsts[2] += dy;
ppDsts[3] += dy;
}
-#else
- uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
-
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
- {
- uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
-
- for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
- {
- // Format conversion and convert from SOA to AOS, and store the rows.
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
-
- ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
- ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
- pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
- }
-
- ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
- ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
- }
-#endif
}
};
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
static const size_t MAX_DST_COLUMN_BYTES = 16;
-#if !USE_8x2_TILE_BACKEND
- static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
- static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
-#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-#if USE_8x2_TILE_BACKEND
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
ppDsts[i] += dy;
}
}
-#else
- uint8_t* ppDsts[] =
- {
- pDst, // row 0, col 0
- pDst + pDstSurface->pitch, // row 1, col 0
- pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
- pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
- };
-
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
- {
- uint8_t* ppStartRows[] =
- {
- ppDsts[0],
- ppDsts[1],
- ppDsts[2],
- ppDsts[3],
- };
-
- for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
- {
- // Format conversion and convert from SOA to AOS, and store the rows.
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
- ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
- ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
- ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
- ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
- pSrc += SRC_COLUMN_BYTES;
- }
-
- ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
- ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
- ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
- ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
- }
-#endif
}
};
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
static const size_t MAX_DST_COLUMN_BYTES = 16;
-#if !USE_8x2_TILE_BACKEND
- static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
- static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
-#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-#if USE_8x2_TILE_BACKEND
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
ppDsts[i] += dy;
}
}
-#else
- struct DstPtrs
- {
- uint8_t* ppDsts[8];
- } ptrs;
-
- // Need 8 pointers, 4 columns of 2 rows each
- for (uint32_t y = 0; y < 2; ++y)
- {
- for (uint32_t x = 0; x < 4; ++x)
- {
- ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
- }
- }
-
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
- {
- DstPtrs startPtrs = ptrs;
-
- for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
- {
- // Format conversion and convert from SOA to AOS, and store the rows.
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
-
- ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
- pSrc += SRC_COLUMN_BYTES;
- }
-
- ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
- ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
- ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
- ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
- ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
- ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
- ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
- ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
- }
-#endif
}
};
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
-#if USE_8x2_TILE_BACKEND
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
ppDsts[2] += dy;
ppDsts[3] += dy;
}
-#else
- // There will be 8 4x2 simd tiles in an 8x8 raster tile.
- uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
- pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
- // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
- uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
-
- // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
- {
- uint32_t rowOffset = row * DestRowWidthBytes;
-
- uint8_t* pRow = pCol0 + rowOffset;
- uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
-
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
- pSrc += pSrcInc;
-
- ppDsts[0] += DestRowWidthBytes / 4;
- ppDsts[1] += DestRowWidthBytes / 4;
-
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
- pSrc += pSrcInc;
- }
-#endif
}
};
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
-#if USE_8x2_TILE_BACKEND
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
ppDsts[2] += dy;
ppDsts[3] += dy;
}
-#else
- // There will be 8 4x2 simd tiles in an 8x8 raster tile.
- uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
- pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
- // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
- uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
-
- // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
- {
- uint32_t rowOffset = row * DestRowWidthBytes;
-
- uint8_t* pRow = pCol0 + rowOffset;
- uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
-
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
- pSrc += pSrcInc;
-
- ppDsts[0] += DestRowWidthBytes / 2;
- ppDsts[1] += DestRowWidthBytes / 2;
-
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
- pSrc += pSrcInc;
- }
-#endif
}
};
// TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
-#if USE_8x2_TILE_BACKEND
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
ppDsts[2] += dy;
ppDsts[3] += dy;
}
-#else
- uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
- pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
- uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
-
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
- {
- for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
- {
- uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
-
- uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
- // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
- pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
- }
-
- pRow0 += (DestRowWidthBytes * 2);
- pRow1 += (DestRowWidthBytes * 2);
- }
-#endif
}
};
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
-#if USE_8x2_TILE_BACKEND
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
ppDsts[2] += dy;
ppDsts[3] += dy;
}
-#else
- // There will be 8 4x2 simd tiles in an 8x8 raster tile.
- uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
- pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-
- // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
- uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
-
- // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
- {
- uint32_t rowOffset = row * DestRowWidthBytes;
-
- uint8_t* pRow = pCol0 + rowOffset;
- uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
-
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
- pSrc += pSrcInc;
-
- ppDsts[0] += DestColumnBytes;
- ppDsts[1] += DestColumnBytes;
-
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
- pSrc += pSrcInc;
- }
-#endif
}
};
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
-#if USE_8x2_TILE_BACKEND
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
ppDsts[i] += dy;
}
}
-#else
- // There will be 8 4x2 simd tiles in an 8x8 raster tile.
- uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
- pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
- uint8_t* pCol1 = pCol0 + DestColumnBytes;
-
- // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
- // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
- uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
-
- // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
- {
- uint32_t rowOffset = row * DestRowWidthBytes;
- uint8_t* ppDsts[] =
- {
- pCol0 + rowOffset,
- pCol0 + rowOffset + DestRowWidthBytes,
- pCol1 + rowOffset,
- pCol1 + rowOffset + DestRowWidthBytes,
- };
-
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
- pSrc += pSrcInc;
-
- ppDsts[0] += DestColumnBytes * 2;
- ppDsts[1] += DestColumnBytes * 2;
- ppDsts[2] += DestColumnBytes * 2;
- ppDsts[3] += DestColumnBytes * 2;
-
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
- pSrc += pSrcInc;
- }
-#endif
}
};
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
-#if USE_8x2_TILE_BACKEND
- static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
-
-#else
- static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
- static const size_t TILE_Y_ROWS = 32;
- static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
-
- static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
- static const size_t MAX_DST_COLUMN_BYTES = 16;
-
- static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
- static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
-#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
/// @param pSrc - Pointer to raster tile.
SWR_SURFACE_STATE* pDstSurface,
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
{
-#if USE_8x2_TILE_BACKEND
static const uint32_t DestRowWidthBytes = 16; // 16B rows
static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
-#endif
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
-#if USE_8x2_TILE_BACKEND
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
ppDsts[i] += dy;
}
}
-#else
- // There will be 8 4x2 simd tiles in an 8x8 raster tile.
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
- pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
- struct DstPtrs
- {
- uint8_t* ppDsts[8];
- } ptrs;
-
- // Need 8 pointers, 4 columns of 2 rows each
- for (uint32_t y = 0; y < 2; ++y)
- {
- for (uint32_t x = 0; x < 4; ++x)
- {
- ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
- }
- }
-
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
- {
- DstPtrs startPtrs = ptrs;
-
- for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
- {
- // Format conversion and convert from SOA to AOS, and store the rows.
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
-
- ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
- ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
- pSrc += SRC_COLUMN_BYTES;
- }
-
- ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
- ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
- ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
- ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
- ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
- ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
- ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
- ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
- }
-#endif
}
};