From: Tim Rowley Date: Mon, 12 Dec 2016 18:50:18 +0000 (-0600) Subject: swr: [rasterizer core/memory] StoreTile: AVX512 progress X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=75149088bea168a10f47df08fc62bcfeed744ce9;p=mesa.git swr: [rasterizer core/memory] StoreTile: AVX512 progress Fixes to 128-bit formats. Reviwed-by: Bruce Cherniak --- diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index 8f968649a57..a2365757a7f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -421,8 +421,8 @@ struct Transpose32_32_32_32 vTranspose4x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1), _simd16_extract_ps(src3, 1)); - _simd16_store_ps(reinterpret_cast(pDst) + 32, reinterpret_cast(vDst)[2]); - _simd16_store_ps(reinterpret_cast(pDst) + 48, reinterpret_cast(vDst)[3]); + _simd16_store_ps(reinterpret_cast(pDst) + 32, reinterpret_cast(vDst)[0]); + _simd16_store_ps(reinterpret_cast(pDst) + 48, reinterpret_cast(vDst)[1]); } #endif }; @@ -474,8 +474,8 @@ struct Transpose32_32_32 vTranspose3x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1)); - _simd16_store_ps(reinterpret_cast(pDst) + 32, reinterpret_cast(vDst)[2]); - _simd16_store_ps(reinterpret_cast(pDst) + 48, reinterpret_cast(vDst)[3]); + _simd16_store_ps(reinterpret_cast(pDst) + 32, reinterpret_cast(vDst)[0]); + _simd16_store_ps(reinterpret_cast(pDst) + 48, reinterpret_cast(vDst)[1]); } #endif }; diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h index 4fa6683e42b..473ebaef751 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h @@ -899,8 +899,8 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_ #if KNOB_ARCH == KNOB_ARCH_AVX - // splitting into two sets of 4 wide integer vector types - // because AVX doesn't have instructions to support this operation at 8 wide + // splitting into two sets of 4 wide integer vector types + // because AVX doesn't have instructions to support this operation at 8 wide __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b @@ -921,7 +921,7 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_ srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr - // unpack into rows that get the tiling order correct + // unpack into rows that get the tiling order correct __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); @@ -1169,8 +1169,8 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) + + if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) { return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } @@ -1256,8 +1256,8 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) + + if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) { return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } @@ -1343,8 +1343,8 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) + + if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) { return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } @@ -1410,7 +1410,7 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat ////////////////////////////////////////////////////////////////////////// /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp ////////////////////////////////////////////////////////////////////////// -template +template struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> { typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; @@ -1435,8 +1435,8 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) + + if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) { return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } @@ -1451,33 +1451,18 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets"); -#if 1 - uint8_t *ppDsts[8]; - - { - for (uint32_t y = 0; y < 2; y += 1) - { - for (uint32_t x = 0; x < 4; x += 1) - { - ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; - } - } - } - -#else uint8_t *ppDsts[] = { - pDst, // row 0, col 0 - pDst + pDstSurface->pitch, // row 1, col 0 - pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 - pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 - pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3 + pDst, // row 0, col 0 + pDst + pDstSurface->pitch, // row 1, col 0 + pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 + pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 + pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3 }; -#endif for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) { // Raster tile width is same as simd16 tile width @@ -1560,8 +1545,8 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstForma // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) + + if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) { return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } @@ -1571,75 +1556,36 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstForma #if USE_8x2_TILE_BACKEND const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; - const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * pDstSurface->pitch; // double up on tile y dim, one simd16 tile will do twice the rows + const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets"); -#if 1 - uint8_t *ppDsts[16]; - - { - for (uint32_t y = 0; y < 2; y += 1) - { - for (uint32_t x = 0; x < 4; x += 1) - { - ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; - ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; - } - } - } - -#else uint8_t* ppDsts[] = { - pDst, // row 0, col 0 - pDst + pDstSurface->pitch, // row 1, col 0 - pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 - pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 - pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3 - - pDst + pDstSurface->pitch * 2, // row 2, col 0 - pDst + pDstSurface->pitch * 3, // row 3, col 0 - pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES, // row 2, col 1 - pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES, // row 3, col 1 - pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 2, // row 2, col 2 - pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 2, // row 3, col 2 - pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 3, // row 2, col 3 - pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 3 // row 3, col 3 + pDst, // row 0, col 0 + pDst + pDstSurface->pitch, // row 1, col 0 + pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 + pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 + pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3 + pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4 + pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5, // row 1, col 5 + pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6 + pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7 }; -#endif -#if 1 - // Raster tile height is quadruple simd16 tile height - static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim"); - - // Raster tile width is same as simd16 tile width - static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); - - // tile rows 0 thru 3 - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; - - for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) - { - ppDsts[i] += dy; - } - - // tile rows 4 thru 7 - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - -#else - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2) + for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) { // Raster tile width is same as simd16 tile width - static_assert(KNOB_TILE_X_DIM * 2 == SIMD16_TILE_X_DIM, "Invalid tile x dim"); + static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); - // Format conversion, convert from SOA to AOS, and store ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; @@ -1649,8 +1595,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstForma ppDsts[i] += dy; } } - -#endif #else struct DstPtrs { @@ -1723,21 +1667,22 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Dst // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) + + if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) { return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 2 x 4-wide columns in an 8x8 raster tile. #if USE_8x2_TILE_BACKEND + // There will be 4 8x2 simd tiles in an 8x8 raster tile. uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. uint8_t *ppDsts[] = { pDst, @@ -1746,9 +1691,11 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Dst pDst + DestRowWidthBytes + DestRowWidthBytes / 4 }; - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) { + // Raster tile width is same as simd16 tile width + static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; @@ -1759,6 +1706,7 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Dst ppDsts[3] += dy; } #else + // There will be 8 4x2 simd tiles in an 8x8 raster tile. uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); @@ -1810,21 +1758,22 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) + + if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) { return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 2 x 4-wide columns in an 8x8 raster tile. #if USE_8x2_TILE_BACKEND + // There will be 4 8x2 simd tiles in an 8x8 raster tile. uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. uint8_t *ppDsts[] = { pDst, @@ -1833,9 +1782,11 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds pDst + DestRowWidthBytes + DestRowWidthBytes / 2 }; - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) { + // Raster tile width is same as simd16 tile width + static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; @@ -1846,6 +1797,7 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds ppDsts[3] += dy; } #else + // There will be 8 4x2 simd tiles in an 8x8 raster tile. uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); @@ -1895,11 +1847,11 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds { static const uint32_t DestRowWidthBytes = 512; // 512B rows - // Punt non-full tiles to generic store + // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) + + if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) { return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } @@ -1990,32 +1942,36 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) + + if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) { return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 2 x 4-wide columns in an 8x8 raster tile. #if USE_8x2_TILE_BACKEND + // There will be 4 8x2 simd tiles in an 8x8 raster tile. uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; - uint8_t *ppDsts[] = + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + uint8_t *ppDsts[] = { - pDst, - pDst + DestRowWidthBytes, - pDst + DestColumnBytes, - pDst + DestRowWidthBytes + DestColumnBytes + pDst, // row 0, col 0 + pDst + DestRowWidthBytes, // row 1, col 0 + pDst + DestColumnBytes, // row 0, col 1 + pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1 }; - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) { + // Raster tile width is same as simd16 tile width + static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; @@ -2026,6 +1982,7 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds ppDsts[3] += dy; } #else + // There will be 8 4x2 simd tiles in an 8x8 raster tile. uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); @@ -2078,52 +2035,40 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) + + if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) { return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 2 x 4-wide columns in an 8x8 raster tile. #if USE_8x2_TILE_BACKEND + // There will be 4 8x2 simd tiles in an 8x8 raster tile. uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; - -#if 1 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) - uint8_t *ppDsts[8]; - - { - for (uint32_t y = 0; y < 2; y += 1) - { - for (uint32_t x = 0; x < 4; x += 1) - { - ppDsts[x * 2 + y] = pDst + y * DestRowWidthBytes + x * DestColumnBytes; - } - } - } + const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; -#else + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. uint8_t *ppDsts[] = { - pDst, - pDst + DestRowWidthBytes, - pDst + DestColumnBytes, - pDst + DestRowWidthBytes + DestColumnBytes, - pDst + DestColumnBytes * 2, - pDst + DestRowWidthBytes + DestColumnBytes * 2, - pDst + DestColumnBytes * 3, - pDst + DestRowWidthBytes + DestColumnBytes * 3 + pDst, // row 0, col 0 + pDst + DestRowWidthBytes, // row 1, col 0 + pDst + DestColumnBytes, // row 0, col 1 + pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 + pDst + DestColumnBytes * 2, // row 0, col 2 + pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 + pDst + DestColumnBytes * 3, // row 0, col 3 + pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3 }; -#endif - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) { + // Raster tile width is same as simd16 tile width + static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; @@ -2134,6 +2079,7 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds } } #else + // There will be 8 4x2 simd tiles in an 8x8 raster tile. uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); uint8_t* pCol1 = pCol0 + DestColumnBytes; @@ -2210,79 +2156,48 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, D // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) + + if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) { return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. #if USE_8x2_TILE_BACKEND + // There will be 4 8x2 simd tiles in an 8x8 raster tile. uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * DestRowWidthBytes; // double up on tile y dim, one simd16 tile will do twice the rows - -#if 1 // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) - uint8_t *ppDsts[16]; - - { - for (uint32_t y = 0; y < 2; y += 1) - { - for (uint32_t x = 0; x < 4; x += 1) - { - ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * DestRowWidthBytes + x * DestColumnBytes; - ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * DestRowWidthBytes + x * DestColumnBytes; - } - } - } + const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; -#else + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. uint8_t *ppDsts[] = { - pDst, - pDst + DestRowWidthBytes, - pDst + DestColumnBytes, - pDst + DestRowWidthBytes + DestColumnBytes, - pDst + DestColumnBytes * 2, - pDst + DestRowWidthBytes + DestColumnBytes * 2, - pDst + DestColumnBytes * 3, - pDst + DestRowWidthBytes + DestColumnBytes * 3, - - pDst + DestRowWidthBytes * 2, - pDst + DestRowWidthBytes * 3, - pDst + DestRowWidthBytes * 2 + DestColumnBytes, - pDst + DestRowWidthBytes * 3 + DestColumnBytes, - pDst + DestRowWidthBytes * 2 + DestColumnBytes * 2, - pDst + DestRowWidthBytes * 3 + DestColumnBytes * 2, - pDst + DestRowWidthBytes * 2 + DestColumnBytes * 3, - pDst + DestRowWidthBytes * 3 + DestColumnBytes * 3 - }; - -#endif -#if 1 - // Raster tile height is quadruple simd16 tile height - static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim"); - - // Raster tile width is same as simd16 tile width - static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); - - // tile rows 0 thru 3 - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - - pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + pDst, // row 0, col 0 + pDst + DestRowWidthBytes, // row 1, col 0 + pDst + DestColumnBytes, // row 0, col 1 + pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 + pDst + DestColumnBytes * 2, // row 0, col 2 + pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 + pDst + DestColumnBytes * 3, // row 0, col 3 + pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3 + pDst + DestColumnBytes * 4, // row 0, col 4 + pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4 + pDst + DestColumnBytes * 5, // row 0, col 5 + pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5 + pDst + DestColumnBytes * 6, // row 0, col 6 + pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6 + pDst + DestColumnBytes * 7, // row 0, col 7 + pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7 + }; - for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) + for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) { - ppDsts[i] += dy; - } + // Raster tile width is same as simd16 tile width + static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); - // tile rows 4 thru 7 - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -#else - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. - for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2) - { ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; @@ -2292,8 +2207,8 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, D ppDsts[i] += dy; } } -#endif #else + // There will be 8 4x2 simd tiles in an 8x8 raster tile. uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); struct DstPtrs @@ -2389,22 +2304,23 @@ struct StoreMacroTile uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) { PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; + for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) { - size_t dstSurfAddress = (size_t)ComputeSurfaceAddress( - 0, - 0, - pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces - pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays - sampleNum, - pDstSurface->lod, - pDstSurface); - - // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear - bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || - (pDstSurface->bInterleavedSamples); - - pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile::Store : OptStoreRasterTile::Store; + size_t dstSurfAddress = (size_t)ComputeSurfaceAddress( + 0, + 0, + pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces + pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays + sampleNum, + pDstSurface->lod, + pDstSurface); + + // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear + bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || + (pDstSurface->bInterleavedSamples); + + pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile::Store : OptStoreRasterTile::Store; } // Store each raster tile from the hot tile to the destination surface.