#if KNOB_ARCH == KNOB_ARCH_AVX
- // splitting into two sets of 4 wide integer vector types
- // because AVX doesn't have instructions to support this operation at 8 wide
+ // splitting into two sets of 4 wide integer vector types
+ // because AVX doesn't have instructions to support this operation at 8 wide
__m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
__m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
__m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
- // unpack into rows that get the tiling order correct
+ // unpack into rows that get the tiling order correct
__m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
__m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
- if (x + KNOB_TILE_X_DIM > lodWidth ||
- y + KNOB_TILE_Y_DIM > lodHeight)
+
+ if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
{
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
- if (x + KNOB_TILE_X_DIM > lodWidth ||
- y + KNOB_TILE_Y_DIM > lodHeight)
+
+ if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
{
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
- if (x + KNOB_TILE_X_DIM > lodWidth ||
- y + KNOB_TILE_Y_DIM > lodHeight)
+
+ if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
{
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
//////////////////////////////////////////////////////////////////////////
/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat >
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
- if (x + KNOB_TILE_X_DIM > lodWidth ||
- y + KNOB_TILE_Y_DIM > lodHeight)
+
+ if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
{
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
// we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
-#if 1
- uint8_t *ppDsts[8];
-
- {
- for (uint32_t y = 0; y < 2; y += 1)
- {
- for (uint32_t x = 0; x < 4; x += 1)
- {
- ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
- }
- }
- }
-
-#else
uint8_t *ppDsts[] =
{
- pDst, // row 0, col 0
- pDst + pDstSurface->pitch, // row 1, col 0
- pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
- pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
- pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
- pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
- pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
- pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
+ pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
+ pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
};
-#endif
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
{
// Raster tile width is same as simd16 tile width
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
- if (x + KNOB_TILE_X_DIM > lodWidth ||
- y + KNOB_TILE_Y_DIM > lodHeight)
+
+ if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
{
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
#if USE_8x2_TILE_BACKEND
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
- const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * pDstSurface->pitch; // double up on tile y dim, one simd16 tile will do twice the rows
+ const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
// we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
-#if 1
- uint8_t *ppDsts[16];
-
- {
- for (uint32_t y = 0; y < 2; y += 1)
- {
- for (uint32_t x = 0; x < 4; x += 1)
- {
- ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
- ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
- }
- }
- }
-
-#else
uint8_t* ppDsts[] =
{
- pDst, // row 0, col 0
- pDst + pDstSurface->pitch, // row 1, col 0
- pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
- pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
- pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
- pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
- pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
- pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
-
- pDst + pDstSurface->pitch * 2, // row 2, col 0
- pDst + pDstSurface->pitch * 3, // row 3, col 0
- pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES, // row 2, col 1
- pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES, // row 3, col 1
- pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 2, // row 2, col 2
- pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 2, // row 3, col 2
- pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 3, // row 2, col 3
- pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 3 // row 3, col 3
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
+ pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
+ pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
+ pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4
+ pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5
+ pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6
+ pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7
};
-#endif
-#if 1
- // Raster tile height is quadruple simd16 tile height
- static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim");
-
- // Raster tile width is same as simd16 tile width
- static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
-
- // tile rows 0 thru 3
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
- pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
-
- for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
- {
- ppDsts[i] += dy;
- }
-
- // tile rows 4 thru 7
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
-#else
- for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2)
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
{
// Raster tile width is same as simd16 tile width
- static_assert(KNOB_TILE_X_DIM * 2 == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
- // Format conversion, convert from SOA to AOS, and store
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
ppDsts[i] += dy;
}
}
-
-#endif
#else
struct DstPtrs
{
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
- if (x + KNOB_TILE_X_DIM > lodWidth ||
- y + KNOB_TILE_Y_DIM > lodHeight)
+
+ if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
{
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
- // There will be 2 x 4-wide columns in an 8x8 raster tile.
#if USE_8x2_TILE_BACKEND
+ // There will be 4 8x2 simd tiles in an 8x8 raster tile.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
uint8_t *ppDsts[] =
{
pDst,
pDst + DestRowWidthBytes + DestRowWidthBytes / 4
};
- // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
{
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
ppDsts[3] += dy;
}
#else
+ // There will be 8 4x2 simd tiles in an 8x8 raster tile.
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
- if (x + KNOB_TILE_X_DIM > lodWidth ||
- y + KNOB_TILE_Y_DIM > lodHeight)
+
+ if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
{
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
- // There will be 2 x 4-wide columns in an 8x8 raster tile.
#if USE_8x2_TILE_BACKEND
+ // There will be 4 8x2 simd tiles in an 8x8 raster tile.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
uint8_t *ppDsts[] =
{
pDst,
pDst + DestRowWidthBytes + DestRowWidthBytes / 2
};
- // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
{
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
ppDsts[3] += dy;
}
#else
+ // There will be 8 4x2 simd tiles in an 8x8 raster tile.
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
{
static const uint32_t DestRowWidthBytes = 512; // 512B rows
- // Punt non-full tiles to generic store
+ // Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
- if (x + KNOB_TILE_X_DIM > lodWidth ||
- y + KNOB_TILE_Y_DIM > lodHeight)
+
+ if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
{
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
- if (x + KNOB_TILE_X_DIM > lodWidth ||
- y + KNOB_TILE_Y_DIM > lodHeight)
+
+ if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
{
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
- // There will be 2 x 4-wide columns in an 8x8 raster tile.
#if USE_8x2_TILE_BACKEND
+ // There will be 4 8x2 simd tiles in an 8x8 raster tile.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+ // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
- uint8_t *ppDsts[] =
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+ uint8_t *ppDsts[] =
{
- pDst,
- pDst + DestRowWidthBytes,
- pDst + DestColumnBytes,
- pDst + DestRowWidthBytes + DestColumnBytes
+ pDst, // row 0, col 0
+ pDst + DestRowWidthBytes, // row 1, col 0
+ pDst + DestColumnBytes, // row 0, col 1
+ pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1
};
- // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
{
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
ppDsts[3] += dy;
}
#else
+ // There will be 8 4x2 simd tiles in an 8x8 raster tile.
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
- if (x + KNOB_TILE_X_DIM > lodWidth ||
- y + KNOB_TILE_Y_DIM > lodHeight)
+
+ if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
{
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
- // There will be 2 x 4-wide columns in an 8x8 raster tile.
#if USE_8x2_TILE_BACKEND
+ // There will be 4 8x2 simd tiles in an 8x8 raster tile.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
- const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
-
-#if 1
// we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
- uint8_t *ppDsts[8];
-
- {
- for (uint32_t y = 0; y < 2; y += 1)
- {
- for (uint32_t x = 0; x < 4; x += 1)
- {
- ppDsts[x * 2 + y] = pDst + y * DestRowWidthBytes + x * DestColumnBytes;
- }
- }
- }
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
-#else
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
uint8_t *ppDsts[] =
{
- pDst,
- pDst + DestRowWidthBytes,
- pDst + DestColumnBytes,
- pDst + DestRowWidthBytes + DestColumnBytes,
- pDst + DestColumnBytes * 2,
- pDst + DestRowWidthBytes + DestColumnBytes * 2,
- pDst + DestColumnBytes * 3,
- pDst + DestRowWidthBytes + DestColumnBytes * 3
+ pDst, // row 0, col 0
+ pDst + DestRowWidthBytes, // row 1, col 0
+ pDst + DestColumnBytes, // row 0, col 1
+ pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
+ pDst + DestColumnBytes * 2, // row 0, col 2
+ pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
+ pDst + DestColumnBytes * 3, // row 0, col 3
+ pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3
};
-#endif
- // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
{
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
}
}
#else
+ // There will be 8 4x2 simd tiles in an 8x8 raster tile.
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
uint8_t* pCol1 = pCol0 + DestColumnBytes;
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
- if (x + KNOB_TILE_X_DIM > lodWidth ||
- y + KNOB_TILE_Y_DIM > lodHeight)
+
+ if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
{
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
+ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
+ // We can compute the offsets to each column within the raster tile once and increment from these.
#if USE_8x2_TILE_BACKEND
+ // There will be 4 8x2 simd tiles in an 8x8 raster tile.
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
- const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * DestRowWidthBytes; // double up on tile y dim, one simd16 tile will do twice the rows
-
-#if 1
// we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
- uint8_t *ppDsts[16];
-
- {
- for (uint32_t y = 0; y < 2; y += 1)
- {
- for (uint32_t x = 0; x < 4; x += 1)
- {
- ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * DestRowWidthBytes + x * DestColumnBytes;
- ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * DestRowWidthBytes + x * DestColumnBytes;
- }
- }
- }
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
-#else
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
uint8_t *ppDsts[] =
{
- pDst,
- pDst + DestRowWidthBytes,
- pDst + DestColumnBytes,
- pDst + DestRowWidthBytes + DestColumnBytes,
- pDst + DestColumnBytes * 2,
- pDst + DestRowWidthBytes + DestColumnBytes * 2,
- pDst + DestColumnBytes * 3,
- pDst + DestRowWidthBytes + DestColumnBytes * 3,
-
- pDst + DestRowWidthBytes * 2,
- pDst + DestRowWidthBytes * 3,
- pDst + DestRowWidthBytes * 2 + DestColumnBytes,
- pDst + DestRowWidthBytes * 3 + DestColumnBytes,
- pDst + DestRowWidthBytes * 2 + DestColumnBytes * 2,
- pDst + DestRowWidthBytes * 3 + DestColumnBytes * 2,
- pDst + DestRowWidthBytes * 2 + DestColumnBytes * 3,
- pDst + DestRowWidthBytes * 3 + DestColumnBytes * 3
- };
-
-#endif
-#if 1
- // Raster tile height is quadruple simd16 tile height
- static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim");
-
- // Raster tile width is same as simd16 tile width
- static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
-
- // tile rows 0 thru 3
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-
- pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+ pDst, // row 0, col 0
+ pDst + DestRowWidthBytes, // row 1, col 0
+ pDst + DestColumnBytes, // row 0, col 1
+ pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1
+ pDst + DestColumnBytes * 2, // row 0, col 2
+ pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
+ pDst + DestColumnBytes * 3, // row 0, col 3
+ pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
+ pDst + DestColumnBytes * 4, // row 0, col 4
+ pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
+ pDst + DestColumnBytes * 5, // row 0, col 5
+ pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
+ pDst + DestColumnBytes * 6, // row 0, col 6
+ pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
+ pDst + DestColumnBytes * 7, // row 0, col 7
+ pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7
+ };
- for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
{
- ppDsts[i] += dy;
- }
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
- // tile rows 4 thru 7
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
-#else
- // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
- for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2)
- {
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
ppDsts[i] += dy;
}
}
-#endif
#else
+ // There will be 8 4x2 simd tiles in an 8x8 raster tile.
uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
struct DstPtrs
uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
{
PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
+
for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
{
- size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
- 0,
- 0,
- pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
- pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
- sampleNum,
- pDstSurface->lod,
- pDstSurface);
-
- // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
- bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
- (pDstSurface->bInterleavedSamples);
-
- pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
+ size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
+ 0,
+ 0,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
+ pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
+ sampleNum,
+ pDstSurface->lod,
+ pDstSurface);
+
+ // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
+ bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
+ (pDstSurface->bInterleavedSamples);
+
+ pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
}
// Store each raster tile from the hot tile to the destination surface.