From: Tim Rowley Date: Fri, 7 Oct 2016 17:07:07 +0000 (-0500) Subject: swr: [rasterizer memory] split load/store for compile speed X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=2550b04179614da4c71dbef195d06a7f53273438;p=mesa.git swr: [rasterizer memory] split load/store for compile speed Signed-off-by: Tim Rowley --- diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources index e0ea161ee5d..0ade8467178 100644 --- a/src/gallium/drivers/swr/Makefile.sources +++ b/src/gallium/drivers/swr/Makefile.sources @@ -121,6 +121,18 @@ MEMORY_CXX_SOURCES := \ rasterizer/memory/ClearTile.cpp \ rasterizer/memory/Convert.h \ rasterizer/memory/LoadTile.cpp \ + rasterizer/memory/LoadTile.h \ + rasterizer/memory/LoadTile_Linear.cpp \ + rasterizer/memory/LoadTile_TileX.cpp \ + rasterizer/memory/LoadTile_TileY.cpp \ rasterizer/memory/StoreTile.cpp \ + rasterizer/memory/StoreTile.h \ + rasterizer/memory/StoreTile_Linear2.cpp \ + rasterizer/memory/StoreTile_Linear.cpp \ + rasterizer/memory/StoreTile_TileW.cpp \ + rasterizer/memory/StoreTile_TileX2.cpp \ + rasterizer/memory/StoreTile_TileX.cpp \ + rasterizer/memory/StoreTile_TileY2.cpp \ + rasterizer/memory/StoreTile_TileY.cpp \ rasterizer/memory/TilingFunctions.h \ rasterizer/memory/tilingtraits.h diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp index 24cfaefec93..1bc6ac22dfd 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp +++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,125 +25,7 @@ * @brief Functionality for Load * ******************************************************************************/ -#include "common/os.h" -#include "common/formats.h" -#include "core/context.h" -#include "core/rdtsc_core.h" -#include "memory/TilingFunctions.h" -#include "memory/tilingtraits.h" -#include "memory/Convert.h" - -typedef void(*PFN_LOAD_TILES)(const SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t); - -////////////////////////////////////////////////////////////////////////// -/// Load Raster Tile Function Tables. -////////////////////////////////////////////////////////////////////////// -static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; -static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; - -static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; -static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS]; - -static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; - -////////////////////////////////////////////////////////////////////////// -/// LoadRasterTile -////////////////////////////////////////////////////////////////////////// -template -struct LoadRasterTile -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from hot tile source which is always float. - /// @param pSrc - Pointer to raster tile. - /// @param x, y - Coordinates to raster tile. - /// @param output - output color - INLINE static void SetSwizzledDstColor( - const float srcColor[4], - uint32_t x, uint32_t y, - uint8_t* pDst) - { - typedef SimdTile SimdT; - - SimdT* pDstSimdTiles = (SimdT*)pDst; - - // Compute which simd tile we're accessing within 8x8 tile. - // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. - uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); - - SimdT* pSimdTile = &pDstSimdTiles[simdIndex]; - - uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); - - pSimdTile->SetSwizzledColor(simdOffset, srcColor); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Loads an 8x8 raster tile from the src surface. - /// @param pSrcSurface - Src surface state - /// @param pDst - Destination hot tile pointer - /// @param x, y - Coordinates to raster tile. - INLINE static void Load( - const SWR_SURFACE_STATE* pSrcSurface, - uint8_t* pDst, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. - { - uint32_t lodWidth = (pSrcSurface->width == 1) ? 1 : pSrcSurface->width >> pSrcSurface->lod; - uint32_t lodHeight = (pSrcSurface->height == 1) ? 1 : pSrcSurface->height >> pSrcSurface->lod; - - // For each raster tile pixel (rx, ry) - for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) - { - for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) - { - if (((x + rx) < lodWidth) && - ((y + ry) < lodHeight)) - { - uint8_t* pSrc = (uint8_t*)ComputeSurfaceAddress(x + rx, y + ry, pSrcSurface->arrayIndex + renderTargetArrayIndex, - pSrcSurface->arrayIndex + renderTargetArrayIndex, sampleNum, - pSrcSurface->lod, pSrcSurface); - - float srcColor[4]; - ConvertPixelToFloat(srcColor, pSrc); - - // store pixel to hottile - SetSwizzledDstColor(srcColor, rx, ry, pDst); - } - } - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// LoadMacroTile - Loads a macro tile which consists of raster tiles. -////////////////////////////////////////////////////////////////////////// -template -struct LoadMacroTile -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Load a macrotile to the destination surface. - /// @param pSrc - Pointer to macro tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to macro tile - static void Load( - const SWR_SURFACE_STATE* pSrcSurface, - uint8_t *pDstHotTile, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) - { - // Load each raster tile from the hot tile to the destination surface. - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for (uint32_t sampleNum = 0; sampleNum < pSrcSurface->numSamples; sampleNum++) - { - LoadRasterTile::Load(pSrcSurface, pDstHotTile, - (x + col), (y + row), sampleNum, renderTargetArrayIndex); - pDstHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); - } - } - } - } -}; +#include "LoadTile.h" static void BUCKETS_START(UINT id) @@ -276,123 +158,9 @@ void LoadHotTile( } -////////////////////////////////////////////////////////////////////////// -/// INIT_LOAD_TILES_TABLE - Helper macro for setting up the tables. -#define INIT_LOAD_TILES_COLOR_TABLE(tilemode) \ - memset(sLoadTilesColorTable_##tilemode, 0, sizeof(sLoadTilesColorTable_##tilemode)); \ - \ - sLoadTilesColorTable_##tilemode[R32G32B32A32_FLOAT] = LoadMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32G32B32A32_SINT] = LoadMacroTile, R32G32B32A32_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32G32B32A32_UINT] = LoadMacroTile, R32G32B32A32_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32G32B32X32_FLOAT] = LoadMacroTile, R32G32B32X32_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32G32B32_FLOAT] = LoadMacroTile, R32G32B32_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32G32B32_SINT] = LoadMacroTile, R32G32B32_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32G32B32_UINT] = LoadMacroTile, R32G32B32_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16A16_UNORM] = LoadMacroTile, R16G16B16A16_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16A16_SNORM] = LoadMacroTile, R16G16B16A16_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16A16_SINT] = LoadMacroTile, R16G16B16A16_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16A16_UINT] = LoadMacroTile, R16G16B16A16_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16A16_FLOAT] = LoadMacroTile, R16G16B16A16_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32G32_FLOAT] = LoadMacroTile, R32G32_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32G32_SINT] = LoadMacroTile, R32G32_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32G32_UINT] = LoadMacroTile, R32G32_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16X16_UNORM] = LoadMacroTile, R16G16B16X16_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16X16_FLOAT] = LoadMacroTile, R16G16B16X16_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM] = LoadMacroTile, B8G8R8A8_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM_SRGB] = LoadMacroTile, B8G8R8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM] = LoadMacroTile, R10G10B10A2_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM_SRGB] = LoadMacroTile, R10G10B10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R10G10B10A2_UINT] = LoadMacroTile, R10G10B10A2_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM] = LoadMacroTile, R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM_SRGB] = LoadMacroTile, R8G8B8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8A8_SNORM] = LoadMacroTile, R8G8B8A8_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8A8_SINT] = LoadMacroTile, R8G8B8A8_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8A8_UINT] = LoadMacroTile, R8G8B8A8_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16_UNORM] = LoadMacroTile, R16G16_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16_SNORM] = LoadMacroTile, R16G16_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16_SINT] = LoadMacroTile, R16G16_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16_UINT] = LoadMacroTile, R16G16_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16_FLOAT] = LoadMacroTile, R16G16_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM] = LoadMacroTile, B10G10R10A2_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM_SRGB] = LoadMacroTile, B10G10R10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R11G11B10_FLOAT] = LoadMacroTile, R11G11B10_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32_SINT] = LoadMacroTile, R32_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32_UINT] = LoadMacroTile, R32_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R32_FLOAT] = LoadMacroTile, R32_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[A32_FLOAT] = LoadMacroTile, A32_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM] = LoadMacroTile, B8G8R8X8_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM_SRGB] = LoadMacroTile, B8G8R8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM] = LoadMacroTile, R8G8B8X8_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM_SRGB] = LoadMacroTile, R8G8B8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B10G10R10X2_UNORM] = LoadMacroTile, B10G10R10X2_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B5G6R5_UNORM] = LoadMacroTile, B5G6R5_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B5G6R5_UNORM_SRGB] = LoadMacroTile, B5G6R5_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM] = LoadMacroTile, B5G5R5A1_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM_SRGB] = LoadMacroTile, B5G5R5A1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM] = LoadMacroTile, B4G4R4A4_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM_SRGB] = LoadMacroTile, B4G4R4A4_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8_UNORM] = LoadMacroTile, R8G8_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8_SNORM] = LoadMacroTile, R8G8_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8_SINT] = LoadMacroTile, R8G8_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8_UINT] = LoadMacroTile, R8G8_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16_UNORM] = LoadMacroTile, R16_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16_SNORM] = LoadMacroTile, R16_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16_SINT] = LoadMacroTile, R16_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16_UINT] = LoadMacroTile, R16_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16_FLOAT] = LoadMacroTile, R16_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[A16_UNORM] = LoadMacroTile, A16_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[A16_FLOAT] = LoadMacroTile, A16_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM] = LoadMacroTile, B5G5R5X1_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM_SRGB] = LoadMacroTile, B5G5R5X1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8_UNORM] = LoadMacroTile, R8_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8_SNORM] = LoadMacroTile, R8_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8_SINT] = LoadMacroTile, R8_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8_UINT] = LoadMacroTile, R8_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[A8_UNORM] = LoadMacroTile, A8_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[BC1_UNORM] = LoadMacroTile, BC1_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[BC2_UNORM] = LoadMacroTile, BC2_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[BC3_UNORM] = LoadMacroTile, BC3_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[BC4_UNORM] = LoadMacroTile, BC4_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[BC5_UNORM] = LoadMacroTile, BC5_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[BC1_UNORM_SRGB] = LoadMacroTile, BC1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[BC2_UNORM_SRGB] = LoadMacroTile, BC2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[BC3_UNORM_SRGB] = LoadMacroTile, BC3_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8_UNORM] = LoadMacroTile, R8G8B8_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8_SNORM] = LoadMacroTile, R8G8B8_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[BC4_SNORM] = LoadMacroTile, BC4_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[BC5_SNORM] = LoadMacroTile, BC5_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16_FLOAT] = LoadMacroTile, R16G16B16_FLOAT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16_UNORM] = LoadMacroTile, R16G16B16_UNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16_SNORM] = LoadMacroTile, R16G16B16_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8_UNORM_SRGB] = LoadMacroTile, R8G8B8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16_UINT] = LoadMacroTile, R16G16B16_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R16G16B16_SINT] = LoadMacroTile, R16G16B16_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R10G10B10A2_SNORM] = LoadMacroTile, R10G10B10A2_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R10G10B10A2_SINT] = LoadMacroTile, R10G10B10A2_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B10G10R10A2_SNORM] = LoadMacroTile, B10G10R10A2_SNORM, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B10G10R10A2_UINT] = LoadMacroTile, B10G10R10A2_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[B10G10R10A2_SINT] = LoadMacroTile, B10G10R10A2_SINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8_UINT] = LoadMacroTile, R8G8B8_UINT, R32G32B32A32_FLOAT>::Load; \ - sLoadTilesColorTable_##tilemode[R8G8B8_SINT] = LoadMacroTile, R8G8B8_SINT, R32G32B32A32_FLOAT>::Load; \ - -////////////////////////////////////////////////////////////////////////// -/// INIT_LOAD_TILES_TABLE - Helper macro for setting up the tables. -#define INIT_LOAD_TILES_DEPTH_TABLE(tilemode) \ - memset(sLoadTilesDepthTable_##tilemode, 0, sizeof(sLoadTilesDepthTable_##tilemode)); \ - \ - sLoadTilesDepthTable_##tilemode[R16_UNORM] = LoadMacroTile, R16_UNORM, R32_FLOAT>::Load; \ - sLoadTilesDepthTable_##tilemode[R32_FLOAT] = LoadMacroTile, R32_FLOAT, R32_FLOAT>::Load; \ - sLoadTilesDepthTable_##tilemode[R24_UNORM_X8_TYPELESS] = LoadMacroTile, R24_UNORM_X8_TYPELESS, R32_FLOAT>::Load; \ - -////////////////////////////////////////////////////////////////////////// -/// @brief Sets up tables for LoadTile void InitSimLoadTilesTable() { - INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_NONE); - INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_NONE); - - INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_YMAJOR); - INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_XMAJOR); - - INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_MODE_YMAJOR); + InitLoadTilesTable_Linear(); + InitLoadTilesTable_XMajor(); + InitLoadTilesTable_YMajor(); } diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h new file mode 100644 index 00000000000..3807f4dedeb --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h @@ -0,0 +1,267 @@ +/**************************************************************************** +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file LoadTile.h +* +* @brief Functionality for Load +* +******************************************************************************/ +#include "common/os.h" +#include "common/formats.h" +#include "core/context.h" +#include "core/rdtsc_core.h" +#include "memory/TilingFunctions.h" +#include "memory/tilingtraits.h" +#include "memory/Convert.h" + +typedef void(*PFN_LOAD_TILES)(const SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t); +typedef void(*PFN_LOAD_RASTER_TILES)(const SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t, uint32_t); + +////////////////////////////////////////////////////////////////////////// +/// Load Raster Tile Function Tables. +////////////////////////////////////////////////////////////////////////// +extern PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; +extern PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; + +extern PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; +extern PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS]; + +extern PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; + +void InitLoadTilesTable_Linear(); +void InitLoadTilesTable_XMajor(); +void InitLoadTilesTable_YMajor(); + +////////////////////////////////////////////////////////////////////////// +/// LoadRasterTile +////////////////////////////////////////////////////////////////////////// +template +struct LoadRasterTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Retrieve color from hot tile source which is always float. + /// @param pSrc - Pointer to raster tile. + /// @param x, y - Coordinates to raster tile. + /// @param output - output color + INLINE static void SetSwizzledDstColor( + const float srcColor[4], + uint32_t x, uint32_t y, + uint8_t* pDst) + { + typedef SimdTile SimdT; + + SimdT* pDstSimdTiles = (SimdT*)pDst; + + // Compute which simd tile we're accessing within 8x8 tile. + // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. + uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); + + SimdT* pSimdTile = &pDstSimdTiles[simdIndex]; + + uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); + + pSimdTile->SetSwizzledColor(simdOffset, srcColor); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Loads an 8x8 raster tile from the src surface. + /// @param pSrcSurface - Src surface state + /// @param pDst - Destination hot tile pointer + /// @param x, y - Coordinates to raster tile. + INLINE static void Load( + const SWR_SURFACE_STATE* pSrcSurface, + uint8_t* pDst, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. + { + uint32_t lodWidth = (pSrcSurface->width == 1) ? 1 : pSrcSurface->width >> pSrcSurface->lod; + uint32_t lodHeight = (pSrcSurface->height == 1) ? 1 : pSrcSurface->height >> pSrcSurface->lod; + + // For each raster tile pixel (rx, ry) + for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) + { + for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) + { + if (((x + rx) < lodWidth) && + ((y + ry) < lodHeight)) + { + uint8_t* pSrc = (uint8_t*)ComputeSurfaceAddress(x + rx, y + ry, pSrcSurface->arrayIndex + renderTargetArrayIndex, + pSrcSurface->arrayIndex + renderTargetArrayIndex, sampleNum, + pSrcSurface->lod, pSrcSurface); + + float srcColor[4]; + ConvertPixelToFloat(srcColor, pSrc); + + // store pixel to hottile + SetSwizzledDstColor(srcColor, rx, ry, pDst); + } + } + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// LoadMacroTile - Loads a macro tile which consists of raster tiles. +////////////////////////////////////////////////////////////////////////// +template +struct LoadMacroTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Load a macrotile to the destination surface. + /// @param pSrc - Pointer to macro tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to macro tile + static void Load( + const SWR_SURFACE_STATE* pSrcSurface, + uint8_t *pDstHotTile, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) + { + PFN_LOAD_RASTER_TILES loadRasterTileFn; + loadRasterTileFn = LoadRasterTile::Load; + + // Load each raster tile from the hot tile to the destination surface. + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t sampleNum = 0; sampleNum < pSrcSurface->numSamples; sampleNum++) + { + loadRasterTileFn(pSrcSurface, pDstHotTile, (x + col), (y + row), sampleNum, renderTargetArrayIndex); + pDstHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); + } + } + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// InitLoadTileColorTable - Helper function for setting up the tables. +template +static INLINE void InitLoadTileColorTable(PFN_LOAD_TILES (&table)[NUM_SWR_FORMATS]) +{ + memset(table, 0, sizeof(table)); + + table[R32G32B32A32_FLOAT] = LoadMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Load; + table[R32G32B32A32_SINT] = LoadMacroTile, R32G32B32A32_SINT, R32G32B32A32_FLOAT>::Load; + table[R32G32B32A32_UINT] = LoadMacroTile, R32G32B32A32_UINT, R32G32B32A32_FLOAT>::Load; + table[R32G32B32X32_FLOAT] = LoadMacroTile, R32G32B32X32_FLOAT, R32G32B32A32_FLOAT>::Load; + table[R32G32B32_FLOAT] = LoadMacroTile, R32G32B32_FLOAT, R32G32B32A32_FLOAT>::Load; + table[R32G32B32_SINT] = LoadMacroTile, R32G32B32_SINT, R32G32B32A32_FLOAT>::Load; + table[R32G32B32_UINT] = LoadMacroTile, R32G32B32_UINT, R32G32B32A32_FLOAT>::Load; + table[R16G16B16A16_UNORM] = LoadMacroTile, R16G16B16A16_UNORM, R32G32B32A32_FLOAT>::Load; + table[R16G16B16A16_SNORM] = LoadMacroTile, R16G16B16A16_SNORM, R32G32B32A32_FLOAT>::Load; + table[R16G16B16A16_SINT] = LoadMacroTile, R16G16B16A16_SINT, R32G32B32A32_FLOAT>::Load; + table[R16G16B16A16_UINT] = LoadMacroTile, R16G16B16A16_UINT, R32G32B32A32_FLOAT>::Load; + table[R16G16B16A16_FLOAT] = LoadMacroTile, R16G16B16A16_FLOAT, R32G32B32A32_FLOAT>::Load; + table[R32G32_FLOAT] = LoadMacroTile, R32G32_FLOAT, R32G32B32A32_FLOAT>::Load; + table[R32G32_SINT] = LoadMacroTile, R32G32_SINT, R32G32B32A32_FLOAT>::Load; + table[R32G32_UINT] = LoadMacroTile, R32G32_UINT, R32G32B32A32_FLOAT>::Load; + table[R16G16B16X16_UNORM] = LoadMacroTile, R16G16B16X16_UNORM, R32G32B32A32_FLOAT>::Load; + table[R16G16B16X16_FLOAT] = LoadMacroTile, R16G16B16X16_FLOAT, R32G32B32A32_FLOAT>::Load; + table[B8G8R8A8_UNORM] = LoadMacroTile, B8G8R8A8_UNORM, R32G32B32A32_FLOAT>::Load; + table[B8G8R8A8_UNORM_SRGB] = LoadMacroTile, B8G8R8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[R10G10B10A2_UNORM] = LoadMacroTile, R10G10B10A2_UNORM, R32G32B32A32_FLOAT>::Load; + table[R10G10B10A2_UNORM_SRGB] = LoadMacroTile, R10G10B10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[R10G10B10A2_UINT] = LoadMacroTile, R10G10B10A2_UINT, R32G32B32A32_FLOAT>::Load; + table[R8G8B8A8_UNORM] = LoadMacroTile, R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load; + table[R8G8B8A8_UNORM_SRGB] = LoadMacroTile, R8G8B8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[R8G8B8A8_SNORM] = LoadMacroTile, R8G8B8A8_SNORM, R32G32B32A32_FLOAT>::Load; + table[R8G8B8A8_SINT] = LoadMacroTile, R8G8B8A8_SINT, R32G32B32A32_FLOAT>::Load; + table[R8G8B8A8_UINT] = LoadMacroTile, R8G8B8A8_UINT, R32G32B32A32_FLOAT>::Load; + table[R16G16_UNORM] = LoadMacroTile, R16G16_UNORM, R32G32B32A32_FLOAT>::Load; + table[R16G16_SNORM] = LoadMacroTile, R16G16_SNORM, R32G32B32A32_FLOAT>::Load; + table[R16G16_SINT] = LoadMacroTile, R16G16_SINT, R32G32B32A32_FLOAT>::Load; + table[R16G16_UINT] = LoadMacroTile, R16G16_UINT, R32G32B32A32_FLOAT>::Load; + table[R16G16_FLOAT] = LoadMacroTile, R16G16_FLOAT, R32G32B32A32_FLOAT>::Load; + table[B10G10R10A2_UNORM] = LoadMacroTile, B10G10R10A2_UNORM, R32G32B32A32_FLOAT>::Load; + table[B10G10R10A2_UNORM_SRGB] = LoadMacroTile, B10G10R10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[R11G11B10_FLOAT] = LoadMacroTile, R11G11B10_FLOAT, R32G32B32A32_FLOAT>::Load; + table[R32_SINT] = LoadMacroTile, R32_SINT, R32G32B32A32_FLOAT>::Load; + table[R32_UINT] = LoadMacroTile, R32_UINT, R32G32B32A32_FLOAT>::Load; + table[R32_FLOAT] = LoadMacroTile, R32_FLOAT, R32G32B32A32_FLOAT>::Load; + table[A32_FLOAT] = LoadMacroTile, A32_FLOAT, R32G32B32A32_FLOAT>::Load; + table[B8G8R8X8_UNORM] = LoadMacroTile, B8G8R8X8_UNORM, R32G32B32A32_FLOAT>::Load; + table[B8G8R8X8_UNORM_SRGB] = LoadMacroTile, B8G8R8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[R8G8B8X8_UNORM] = LoadMacroTile, R8G8B8X8_UNORM, R32G32B32A32_FLOAT>::Load; + table[R8G8B8X8_UNORM_SRGB] = LoadMacroTile, R8G8B8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[B10G10R10X2_UNORM] = LoadMacroTile, B10G10R10X2_UNORM, R32G32B32A32_FLOAT>::Load; + table[B5G6R5_UNORM] = LoadMacroTile, B5G6R5_UNORM, R32G32B32A32_FLOAT>::Load; + table[B5G6R5_UNORM_SRGB] = LoadMacroTile, B5G6R5_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[B5G5R5A1_UNORM] = LoadMacroTile, B5G5R5A1_UNORM, R32G32B32A32_FLOAT>::Load; + table[B5G5R5A1_UNORM_SRGB] = LoadMacroTile, B5G5R5A1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[B4G4R4A4_UNORM] = LoadMacroTile, B4G4R4A4_UNORM, R32G32B32A32_FLOAT>::Load; + table[B4G4R4A4_UNORM_SRGB] = LoadMacroTile, B4G4R4A4_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[R8G8_UNORM] = LoadMacroTile, R8G8_UNORM, R32G32B32A32_FLOAT>::Load; + table[R8G8_SNORM] = LoadMacroTile, R8G8_SNORM, R32G32B32A32_FLOAT>::Load; + table[R8G8_SINT] = LoadMacroTile, R8G8_SINT, R32G32B32A32_FLOAT>::Load; + table[R8G8_UINT] = LoadMacroTile, R8G8_UINT, R32G32B32A32_FLOAT>::Load; + table[R16_UNORM] = LoadMacroTile, R16_UNORM, R32G32B32A32_FLOAT>::Load; + table[R16_SNORM] = LoadMacroTile, R16_SNORM, R32G32B32A32_FLOAT>::Load; + table[R16_SINT] = LoadMacroTile, R16_SINT, R32G32B32A32_FLOAT>::Load; + table[R16_UINT] = LoadMacroTile, R16_UINT, R32G32B32A32_FLOAT>::Load; + table[R16_FLOAT] = LoadMacroTile, R16_FLOAT, R32G32B32A32_FLOAT>::Load; + table[A16_UNORM] = LoadMacroTile, A16_UNORM, R32G32B32A32_FLOAT>::Load; + table[A16_FLOAT] = LoadMacroTile, A16_FLOAT, R32G32B32A32_FLOAT>::Load; + table[B5G5R5X1_UNORM] = LoadMacroTile, B5G5R5X1_UNORM, R32G32B32A32_FLOAT>::Load; + table[B5G5R5X1_UNORM_SRGB] = LoadMacroTile, B5G5R5X1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[R8_UNORM] = LoadMacroTile, R8_UNORM, R32G32B32A32_FLOAT>::Load; + table[R8_SNORM] = LoadMacroTile, R8_SNORM, R32G32B32A32_FLOAT>::Load; + table[R8_SINT] = LoadMacroTile, R8_SINT, R32G32B32A32_FLOAT>::Load; + table[R8_UINT] = LoadMacroTile, R8_UINT, R32G32B32A32_FLOAT>::Load; + table[A8_UNORM] = LoadMacroTile, A8_UNORM, R32G32B32A32_FLOAT>::Load; + table[BC1_UNORM] = LoadMacroTile, BC1_UNORM, R32G32B32A32_FLOAT>::Load; + table[BC2_UNORM] = LoadMacroTile, BC2_UNORM, R32G32B32A32_FLOAT>::Load; + table[BC3_UNORM] = LoadMacroTile, BC3_UNORM, R32G32B32A32_FLOAT>::Load; + table[BC4_UNORM] = LoadMacroTile, BC4_UNORM, R32G32B32A32_FLOAT>::Load; + table[BC5_UNORM] = LoadMacroTile, BC5_UNORM, R32G32B32A32_FLOAT>::Load; + table[BC1_UNORM_SRGB] = LoadMacroTile, BC1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[BC2_UNORM_SRGB] = LoadMacroTile, BC2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[BC3_UNORM_SRGB] = LoadMacroTile, BC3_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[R8G8B8_UNORM] = LoadMacroTile, R8G8B8_UNORM, R32G32B32A32_FLOAT>::Load; + table[R8G8B8_SNORM] = LoadMacroTile, R8G8B8_SNORM, R32G32B32A32_FLOAT>::Load; + table[BC4_SNORM] = LoadMacroTile, BC4_SNORM, R32G32B32A32_FLOAT>::Load; + table[BC5_SNORM] = LoadMacroTile, BC5_SNORM, R32G32B32A32_FLOAT>::Load; + table[R16G16B16_FLOAT] = LoadMacroTile, R16G16B16_FLOAT, R32G32B32A32_FLOAT>::Load; + table[R16G16B16_UNORM] = LoadMacroTile, R16G16B16_UNORM, R32G32B32A32_FLOAT>::Load; + table[R16G16B16_SNORM] = LoadMacroTile, R16G16B16_SNORM, R32G32B32A32_FLOAT>::Load; + table[R8G8B8_UNORM_SRGB] = LoadMacroTile, R8G8B8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; + table[R16G16B16_UINT] = LoadMacroTile, R16G16B16_UINT, R32G32B32A32_FLOAT>::Load; + table[R16G16B16_SINT] = LoadMacroTile, R16G16B16_SINT, R32G32B32A32_FLOAT>::Load; + table[R10G10B10A2_SNORM] = LoadMacroTile, R10G10B10A2_SNORM, R32G32B32A32_FLOAT>::Load; + table[R10G10B10A2_SINT] = LoadMacroTile, R10G10B10A2_SINT, R32G32B32A32_FLOAT>::Load; + table[B10G10R10A2_SNORM] = LoadMacroTile, B10G10R10A2_SNORM, R32G32B32A32_FLOAT>::Load; + table[B10G10R10A2_UINT] = LoadMacroTile, B10G10R10A2_UINT, R32G32B32A32_FLOAT>::Load; + table[B10G10R10A2_SINT] = LoadMacroTile, B10G10R10A2_SINT, R32G32B32A32_FLOAT>::Load; + table[R8G8B8_UINT] = LoadMacroTile, R8G8B8_UINT, R32G32B32A32_FLOAT>::Load; + table[R8G8B8_SINT] = LoadMacroTile, R8G8B8_SINT, R32G32B32A32_FLOAT>::Load; +} + +////////////////////////////////////////////////////////////////////////// +/// InitLoadTileColorTable - Helper function for setting up the tables. +template +static INLINE void InitLoadTileDepthTable(PFN_LOAD_TILES(&table)[NUM_SWR_FORMATS]) +{ + memset(table, 0, sizeof(table)); + + table[R16_UNORM] = LoadMacroTile, R16_UNORM, R32_FLOAT>::Load; \ + table[R32_FLOAT] = LoadMacroTile, R32_FLOAT, R32_FLOAT>::Load; \ + table[R24_UNORM_X8_TYPELESS] = LoadMacroTile, R24_UNORM_X8_TYPELESS, R32_FLOAT>::Load; +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_Linear.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_Linear.cpp new file mode 100644 index 00000000000..5f53b5b6b56 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_Linear.cpp @@ -0,0 +1,39 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file LoadTile.cpp +* +* @brief Functionality for Load +* +******************************************************************************/ +#include "LoadTile.h" + +PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; +PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; + +////////////////////////////////////////////////////////////////////////// +/// @brief Sets up tables for LoadTile +void InitLoadTilesTable_Linear() +{ + InitLoadTileColorTable(sLoadTilesColorTable_SWR_TILE_NONE); + InitLoadTileDepthTable(sLoadTilesDepthTable_SWR_TILE_NONE); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileX.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileX.cpp new file mode 100644 index 00000000000..8e76655ff11 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileX.cpp @@ -0,0 +1,37 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file LoadTile.cpp +* +* @brief Functionality for Load +* +******************************************************************************/ +#include "LoadTile.h" + +PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS]; + +////////////////////////////////////////////////////////////////////////// +/// @brief Sets up tables for LoadTile +void InitLoadTilesTable_XMajor() +{ + InitLoadTileColorTable(sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileY.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileY.cpp new file mode 100644 index 00000000000..c136392eb78 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile_TileY.cpp @@ -0,0 +1,39 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file LoadTile.cpp +* +* @brief Functionality for Load +* +******************************************************************************/ +#include "LoadTile.h" + +PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; +PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; + +////////////////////////////////////////////////////////////////////////// +/// @brief Sets up tables for LoadTile +void InitLoadTilesTable_YMajor() +{ + InitLoadTileColorTable(sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR); + InitLoadTileDepthTable(sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp index 8a26ff63595..c46dc8876e9 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,1458 +25,13 @@ * @brief Functionality for Store. * ******************************************************************************/ -#include "common/os.h" -#include "common/formats.h" -#include "core/context.h" -#include "core/rdtsc_core.h" -#include "core/format_conversion.h" - -#include "memory/TilingFunctions.h" -#include "memory/tilingtraits.h" -#include "memory/Convert.h" -#include "core/multisample.h" - -#include -#include - -typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); - +#include "StoreTile.h" ////////////////////////////////////////////////////////////////////////// /// Store Raster Tile Function Tables. ////////////////////////////////////////////////////////////////////////// -static PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; -static PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; -static PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; - -////////////////////////////////////////////////////////////////////////// -/// StorePixels -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. Each pair of -/// pointers is for a 16-byte column of two rows. -////////////////////////////////////////////////////////////////////////// -template -struct StorePixels -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// StorePixels (32-bit pixel specialization) -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. Each pair of -/// pointers is for a 16-byte column of two rows. -////////////////////////////////////////////////////////////////////////// -template <> -struct StorePixels<8, 2> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) - { - // Each 4-pixel row is 4 bytes. - const uint16_t* pPixSrc = (const uint16_t*)pSrc; - - // Unswizzle from SWR-Z order - uint16_t* pRow = (uint16_t*)ppDsts[0]; - pRow[0] = pPixSrc[0]; - pRow[1] = pPixSrc[2]; - - pRow = (uint16_t*)ppDsts[1]; - pRow[0] = pPixSrc[1]; - pRow[1] = pPixSrc[3]; - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StorePixels (32-bit pixel specialization) -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. Each pair of -/// pointers is for a 16-byte column of two rows. -////////////////////////////////////////////////////////////////////////// -template <> -struct StorePixels<16, 2> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) - { - // Each 4-pixel row is 8 bytes. - const uint32_t* pPixSrc = (const uint32_t*)pSrc; - - // Unswizzle from SWR-Z order - uint32_t* pRow = (uint32_t*)ppDsts[0]; - pRow[0] = pPixSrc[0]; - pRow[1] = pPixSrc[2]; - - pRow = (uint32_t*)ppDsts[1]; - pRow[0] = pPixSrc[1]; - pRow[1] = pPixSrc[3]; - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StorePixels (32-bit pixel specialization) -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. Each pair of -/// pointers is for a 16-byte column of two rows. -////////////////////////////////////////////////////////////////////////// -template <> -struct StorePixels<32, 2> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) - { - // Each 4-pixel row is 16-bytes - __m128i *pZRow01 = (__m128i*)pSrc; - __m128i vQuad00 = _mm_load_si128(pZRow01); - __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); - - __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); - __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); - - _mm_storeu_si128((__m128i*)ppDsts[0], vRow00); - _mm_storeu_si128((__m128i*)ppDsts[1], vRow10); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StorePixels (32-bit pixel specialization) -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. Each pair of -/// pointers is for a 16-byte column of two rows. -////////////////////////////////////////////////////////////////////////// -template <> -struct StorePixels<64, 4> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) - { - // Each 4-pixel row is 32 bytes. - const __m128i* pPixSrc = (const __m128i*)pSrc; - - // order of pointers match SWR-Z layout - __m128i** pvDsts = (__m128i**)&ppDsts[0]; - *pvDsts[0] = pPixSrc[0]; - *pvDsts[1] = pPixSrc[1]; - *pvDsts[2] = pPixSrc[2]; - *pvDsts[3] = pPixSrc[3]; - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StorePixels (32-bit pixel specialization) -/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -/// @param ppDsts - Array of destination pointers. Each pointer is -/// to a single row of at most 16B. -/// @tparam NumDests - Number of destination pointers. Each pair of -/// pointers is for a 16-byte column of two rows. -////////////////////////////////////////////////////////////////////////// -template <> -struct StorePixels<128, 8> -{ - static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) - { - // Each 4-pixel row is 64 bytes. - const __m128i* pPixSrc = (const __m128i*)pSrc; - - // Unswizzle from SWR-Z order - __m128i** pvDsts = (__m128i**)&ppDsts[0]; - *pvDsts[0] = pPixSrc[0]; - *pvDsts[1] = pPixSrc[2]; - *pvDsts[2] = pPixSrc[1]; - *pvDsts[3] = pPixSrc[3]; - *pvDsts[4] = pPixSrc[4]; - *pvDsts[5] = pPixSrc[6]; - *pvDsts[6] = pPixSrc[5]; - *pvDsts[7] = pPixSrc[7]; - } -}; - -////////////////////////////////////////////////////////////////////////// -/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -////////////////////////////////////////////////////////////////////////// -template -struct ConvertPixelsSOAtoAOS -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Converts a SIMD from the Hot Tile to the destination format - /// and converts from SOA to AOS. - /// @param pSrc - Pointer to raster tile. - /// @param pDst - Pointer to destination surface or deswizzling buffer. - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel - - OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; - OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; - - // Convert from SrcFormat --> DstFormat - simdvector src; - LoadSOA(pSrc, src); - StoreSOA(src, soaTile); - - // Convert from SOA --> AOS - FormatTraits::TransposeT::Transpose(soaTile, aosTile); - - // Store data into destination - StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -/// Specialization for no format conversion -////////////////////////////////////////////////////////////////////////// -template -struct ConvertPixelsSOAtoAOS -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Converts a SIMD from the Hot Tile to the destination format - /// and converts from SOA to AOS. - /// @param pSrc - Pointer to raster tile. - /// @param pDst - Pointer to destination surface or deswizzling buffer. - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel - - OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; - - // Convert from SOA --> AOS - FormatTraits::TransposeT::Transpose(pSrc, aosTile); - - // Store data into destination - StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM -////////////////////////////////////////////////////////////////////////// -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Converts a SIMD from the Hot Tile to the destination format - /// and converts from SOA to AOS. - /// @param pSrc - Pointer to raster tile. - /// @param pDst - Pointer to destination surface or deswizzling buffer. - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; - static const SWR_FORMAT DstFormat = B5G6R5_UNORM; - static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel - - OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; - - // Load hot-tile - simdvector src, dst; - LoadSOA(pSrc, src); - - // deswizzle - dst.x = src[FormatTraits::swizzle(0)]; - dst.y = src[FormatTraits::swizzle(1)]; - dst.z = src[FormatTraits::swizzle(2)]; - - // clamp - dst.x = Clamp(dst.x, 0); - dst.y = Clamp(dst.y, 1); - dst.z = Clamp(dst.z, 2); - - // normalize - dst.x = Normalize(dst.x, 0); - dst.y = Normalize(dst.y, 1); - dst.z = Normalize(dst.z, 2); - - // pack - simdscalari packed = _simd_castps_si(dst.x); - packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits::GetBPC(0))); - packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits::GetBPC(0) + - FormatTraits::GetBPC(1))); - - // pack low 16 bits of each 32 bit lane to low 128 bits of dst - uint32_t *pPacked = (uint32_t*)&packed; - uint16_t *pAosTile = (uint16_t*)&aosTile[0]; - for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t) - { - *pAosTile++ = *pPacked++; - } - - // Store data into destination - StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -////////////////////////////////////////////////////////////////////////// -template<> -struct ConvertPixelsSOAtoAOS -{ - static const SWR_FORMAT SrcFormat = R32_FLOAT; - static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Converts a SIMD from the Hot Tile to the destination format - /// and converts from SOA to AOS. - /// @param pSrc - Pointer to raster tile. - /// @param pDst - Pointer to destination surface or deswizzling buffer. - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel - - OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; - OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; - - // Convert from SrcFormat --> DstFormat - simdvector src; - LoadSOA(pSrc, src); - StoreSOA(src, soaTile); - - // Convert from SOA --> AOS - FormatTraits::TransposeT::Transpose(soaTile, aosTile); - - // Store data into destination but don't overwrite the X8 bits - // Each 4-pixel row is 16-bytes - __m128i *pZRow01 = (__m128i*)aosTile; - __m128i vQuad00 = _mm_load_si128(pZRow01); - __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); - - __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); - __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); - - __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]); - __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]); - - __m128i vMask = _mm_set1_epi32(0xFFFFFF); - - vDst0 = _mm_andnot_si128(vMask, vDst0); - vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask)); - vDst1 = _mm_andnot_si128(vMask, vDst1); - vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask)); - - _mm_storeu_si128((__m128i*)ppDsts[0], vDst0); - _mm_storeu_si128((__m128i*)ppDsts[1], vDst1); - } -}; - -template -INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) -{ - static const uint32_t offset = sizeof(simdscalar); - - // swizzle rgba -> bgra while we load - simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(0))*offset)); // float32 rrrrrrrr - simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(1))*offset)); // float32 gggggggg - simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(2))*offset)); // float32 bbbbbbbb - simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(3))*offset)); // float32 aaaaaaaa - - // clamp - vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); - vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); - - vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); - vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); - - vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); - vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); - - vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps()); - vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f)); - - if (FormatTraits::isSRGB) - { - // Gamma-correct only rgb - vComp0 = FormatTraits::convertSrgb(0, vComp0); - vComp1 = FormatTraits::convertSrgb(1, vComp1); - vComp2 = FormatTraits::convertSrgb(2, vComp2); - } - - // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format - vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits::fromFloat(0))); - vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits::fromFloat(1))); - vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits::fromFloat(2))); - vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits::fromFloat(3))); - - // moving to 8 wide integer vector types - __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr - __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg - __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb - __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa - -#if KNOB_ARCH == KNOB_ARCH_AVX - - // splitting into two sets of 4 wide integer vector types - // because AVX doesn't have instructions to support this operation at 8 wide - __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r - __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g - __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b - __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a - - __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r - __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g - __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b - __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a - - srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 - srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 - srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 - srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 - srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000 - srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000 - - srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr - srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00 - - srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr - srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00 - - srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr - srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr - - // unpack into rows that get the tiling order correct - __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr - __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); - - __m256i final = _mm256_castsi128_si256(vRow00); - final = _mm256_insertf128_si256(final, vRow10, 1); - -#elif KNOB_ARCH >= KNOB_ARCH_AVX2 - - // logic is as above, only wider - src1 = _mm256_slli_si256(src1, 1); - src2 = _mm256_slli_si256(src2, 2); - src3 = _mm256_slli_si256(src3, 3); - - src0 = _mm256_or_si256(src0, src1); - src2 = _mm256_or_si256(src2, src3); - - __m256i final = _mm256_or_si256(src0, src2); - - // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 - final = _mm256_permute4x64_epi64(final, 0xD8); - -#endif - - _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); -} - -template -INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) -{ - static const uint32_t offset = sizeof(simdscalar); - - // swizzle rgba -> bgra while we load - simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(0))*offset)); // float32 rrrrrrrr - simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(1))*offset)); // float32 gggggggg - simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(2))*offset)); // float32 bbbbbbbb - // clamp - vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); - vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); - - vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); - vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); - - vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); - vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); - - if (FormatTraits::isSRGB) - { - // Gamma-correct only rgb - vComp0 = FormatTraits::convertSrgb(0, vComp0); - vComp1 = FormatTraits::convertSrgb(1, vComp1); - vComp2 = FormatTraits::convertSrgb(2, vComp2); - } - - // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format - vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits::fromFloat(0))); - vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits::fromFloat(1))); - vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits::fromFloat(2))); - - // moving to 8 wide integer vector types - __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr - __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg - __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb - -#if KNOB_ARCH == KNOB_ARCH_AVX - - // splitting into two sets of 4 wide integer vector types - // because AVX doesn't have instructions to support this operation at 8 wide - __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r - __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g - __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b - - __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r - __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g - __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b - - srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 - srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 - srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 - srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 - - srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr - - srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr - - srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr - srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr - - // unpack into rows that get the tiling order correct - __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr - __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); - - __m256i final = _mm256_castsi128_si256(vRow00); - final = _mm256_insertf128_si256(final, vRow10, 1); - -#elif KNOB_ARCH >= KNOB_ARCH_AVX2 - - // logic is as above, only wider - src1 = _mm256_slli_si256(src1, 1); - src2 = _mm256_slli_si256(src2, 2); - - src0 = _mm256_or_si256(src0, src1); - - __m256i final = _mm256_or_si256(src0, src2); - - // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 - final = _mm256_permute4x64_epi64(final, 0xD8); - -#endif - - _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); -} - -template<> -struct ConvertPixelsSOAtoAOS -{ - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvert(pSrc, ppDsts[0], ppDsts[1]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS -{ - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > -{ - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvert(pSrc, ppDsts[0], ppDsts[1]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > -{ - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > -{ - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvert(pSrc, ppDsts[0], ppDsts[1]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > -{ - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > -{ - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvert(pSrc, ppDsts[0], ppDsts[1]); - } -}; - -template<> -struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > -{ - template - INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) - { - FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StoreRasterTile -////////////////////////////////////////////////////////////////////////// -template -struct StoreRasterTile -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve color from hot tile source which is always float. - /// @param pSrc - Pointer to raster tile. - /// @param x, y - Coordinates to raster tile. - /// @param output - output color - INLINE static void GetSwizzledSrcColor( - uint8_t* pSrc, - uint32_t x, uint32_t y, - float outputColor[4]) - { - typedef SimdTile SimdT; - - SimdT* pSrcSimdTiles = (SimdT*)pSrc; - - // Compute which simd tile we're accessing within 8x8 tile. - // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. - uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); - - SimdT* pSimdTile = &pSrcSimdTiles[simdIndex]; - - uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); - - pSimdTile->GetSwizzledColor(simdOffset, outputColor); - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. - { - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - - // For each raster tile pixel (rx, ry) - for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) - { - for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) - { - // Perform bounds checking. - if (((x + rx) < lodWidth) && - ((y + ry) < lodHeight)) - { - float srcColor[4]; - GetSwizzledSrcColor(pSrc, rx, ry, srcColor); - - uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress((x + rx), (y + ry), - pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, - sampleNum, pDstSurface->lod, pDstSurface); - ConvertPixelFromFloat(pDst, srcColor); - } - } - } - } -}; - -template -struct OptStoreRasterTile : StoreRasterTile -{}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp -////////////////////////////////////////////////////////////////////////// -template -struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -{ - typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); - - ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; - } - - ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; - ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp -////////////////////////////////////////////////////////////////////////// -template -struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -{ - typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); - - ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; - } - - ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; - ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp -////////////////////////////////////////////////////////////////////////// -template -struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -{ - typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); - - ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; - } - - ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; - ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp -////////////////////////////////////////////////////////////////////////// -template -struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -{ - typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - static const size_t MAX_DST_COLUMN_BYTES = 16; - static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; - static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - uint8_t* ppDsts[] = - { - pDst, // row 0, col 0 - pDst + pDstSurface->pitch, // row 1, col 0 - pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 - }; - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - uint8_t* ppStartRows[] = - { - ppDsts[0], - ppDsts[1], - ppDsts[2], - ppDsts[3], - }; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - - ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; - ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; - ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; - ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; - pSrc += SRC_COLUMN_BYTES; - } - - ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch; - ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch; - ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; - ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp -////////////////////////////////////////////////////////////////////////// -template -struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -{ - typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - static const size_t MAX_DST_COLUMN_BYTES = 16; - static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; - static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - struct DstPtrs - { - uint8_t* ppDsts[8]; - } ptrs; - - // Need 8 pointers, 4 columns of 2 rows each - for (uint32_t y = 0; y < 2; ++y) - { - for (uint32_t x = 0; x < 4; ++x) - { - ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; - } - } - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - DstPtrs startPtrs = ptrs; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); - - ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; - pSrc += SRC_COLUMN_BYTES; - } - - ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch; - ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch; - ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch; - ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch; - ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch; - ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch; - ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; - ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp -////////////////////////////////////////////////////////////////////////// -template -struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -{ - typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - static const uint32_t DestRowWidthBytes = 16; // 16B rows - - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. - // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 2 x 4-wide columns in an 8x8 raster tile. - uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. - uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) - { - uint32_t rowOffset = row * DestRowWidthBytes; - - uint8_t* pRow = pCol0 + rowOffset; - uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - - ppDsts[0] += DestRowWidthBytes / 4; - ppDsts[1] += DestRowWidthBytes / 4; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp -////////////////////////////////////////////////////////////////////////// -template -struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -{ - typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - static const uint32_t DestRowWidthBytes = 16; // 16B rows - - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. - // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 2 x 4-wide columns in an 8x8 raster tile. - uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. - uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) - { - uint32_t rowOffset = row * DestRowWidthBytes; - - uint8_t* pRow = pCol0 + rowOffset; - uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - - ppDsts[0] += DestRowWidthBytes / 2; - ppDsts[1] += DestRowWidthBytes / 2; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp -////////////////////////////////////////////////////////////////////////// -template -struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -{ - typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - static const uint32_t DestRowWidthBytes = 512; // 512B rows - - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. - // We can compute the offsets to each column within the raster tile once and increment from these. - uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - uint8_t* pRow1 = pRow0 + DestRowWidthBytes; - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM) - { - uint32_t xRowOffset = col * (FormatTraits::bpp / 8); - - uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset }; - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - - // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. - pSrc += (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; - } - - pRow0 += (DestRowWidthBytes * 2); - pRow1 += (DestRowWidthBytes * 2); - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp -////////////////////////////////////////////////////////////////////////// -template -struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -{ - typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - static const uint32_t DestRowWidthBytes = 16; // 16B rows - static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. - - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. - // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 2 x 4-wide columns in an 8x8 raster tile. - uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. - uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) - { - uint32_t rowOffset = row * DestRowWidthBytes; - - uint8_t* pRow = pCol0 + rowOffset; - uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - - ppDsts[0] += DestColumnBytes; - ppDsts[1] += DestColumnBytes; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp -////////////////////////////////////////////////////////////////////////// -template -struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -{ - typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - static const uint32_t DestRowWidthBytes = 16; // 16B rows - static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. - - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. - // We can compute the offsets to each column within the raster tile once and increment from these. - // There will be 2 x 4-wide columns in an 8x8 raster tile. - uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - uint8_t* pCol1 = pCol0 + DestColumnBytes; - - // There are 4 columns, each 2 pixels wide when we have 64bpp pixels. - // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. - uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) - { - uint32_t rowOffset = row * DestRowWidthBytes; - uint8_t* ppDsts[] = - { - pCol0 + rowOffset, - pCol0 + rowOffset + DestRowWidthBytes, - pCol1 + rowOffset, - pCol1 + rowOffset + DestRowWidthBytes, - }; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - - ppDsts[0] += DestColumnBytes * 2; - ppDsts[1] += DestColumnBytes * 2; - ppDsts[2] += DestColumnBytes * 2; - ppDsts[3] += DestColumnBytes * 2; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp -////////////////////////////////////////////////////////////////////////// -template -struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -{ - typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; - - static const size_t TILE_Y_COL_WIDTH_BYTES = 16; - static const size_t TILE_Y_ROWS = 32; - static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; - - static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - static const size_t MAX_DST_COLUMN_BYTES = 16; - - static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; - static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; - - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores an 8x8 raster tile to the destination surface. - /// @param pSrc - Pointer to raster tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to raster tile. - INLINE static void Store( - uint8_t *pSrc, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) - { - // Punt non-full tiles to generic store - uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); - uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); - if (x + KNOB_TILE_X_DIM > lodWidth || - y + KNOB_TILE_Y_DIM > lodHeight) - { - return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); - } - - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - struct DstPtrs - { - uint8_t* ppDsts[8]; - } ptrs; - - // Need 8 pointers, 4 columns of 2 rows each - for (uint32_t y = 0; y < 2; ++y) - { - for (uint32_t x = 0; x < 4; ++x) - { - ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES; - } - } - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - DstPtrs startPtrs = ptrs; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); - - ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; - pSrc += SRC_COLUMN_BYTES; - } - - ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; - } - } -}; - -////////////////////////////////////////////////////////////////////////// -/// StoreMacroTile - Stores a macro tile which consists of raster tiles. -////////////////////////////////////////////////////////////////////////// -template -struct StoreMacroTile -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores a macrotile to the destination surface using safe implementation. - /// @param pSrc - Pointer to macro tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to macro tile - static void StoreGeneric( - uint8_t *pSrcHotTile, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) - { - // Store each raster tile from the hot tile to the destination surface. - for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) - { - StoreRasterTile::Store (pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, - renderTargetArrayIndex); - pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); - } - } - } - } - - typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t); - ////////////////////////////////////////////////////////////////////////// - /// @brief Stores a macrotile to the destination surface. - /// @param pSrc - Pointer to macro tile. - /// @param pDstSurface - Destination surface state - /// @param x, y - Coordinates to macro tile - static void Store( - uint8_t *pSrcHotTile, - SWR_SURFACE_STATE* pDstSurface, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) - { - PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; - for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) - { - size_t dstSurfAddress = (size_t)ComputeSurfaceAddress( - 0, - 0, - pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces - pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays - sampleNum, - pDstSurface->lod, - pDstSurface); - - // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear - bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || (pDstSurface->bInterleavedSamples); - - pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile::Store : OptStoreRasterTile::Store; - } - - // Store each raster tile from the hot tile to the destination surface. - for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) - { - pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); - pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); - } - } - } - } -}; +PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; +PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; +PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; static void BUCKETS_START(UINT id) { @@ -1503,7 +58,7 @@ static std::vector sBuckets(NUM_SWR_FORMATS, -1); /// @param renderTargetIndex - Index to destination render target /// @param x, y - Coordinates to raster tile. /// @param pSrcHotTile - Pointer to Hot Tile -void StoreHotTile( +void StoreHotTileToSurface( SWR_SURFACE_STATE *pDstSurface, SWR_FORMAT srcFormat, SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, @@ -1563,139 +118,6 @@ void StoreHotTile( BUCKETS_STOP(sBuckets[pDstSurface->format]); } -////////////////////////////////////////////////////////////////////////// -/// InitStoreTilesTable - Helper for setting up the tables. -template -void InitStoreTilesTableColor( - PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) -{ - table[TileModeT][R32G32B32A32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; - table[TileModeT][R32G32B32A32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; - table[TileModeT][R32G32B32A32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; - table[TileModeT][R32G32B32X32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; - table[TileModeT][R32G32B32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; - table[TileModeT][R32G32B32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; - table[TileModeT][R32G32B32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; - table[TileModeT][R16G16B16A16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; - table[TileModeT][R16G16B16A16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; - table[TileModeT][R16G16B16A16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; - table[TileModeT][R16G16B16A16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; - table[TileModeT][R16G16B16A16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; - table[TileModeT][R32G32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; - table[TileModeT][R32G32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_SINT>::Store; - table[TileModeT][R32G32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_UINT>::Store; - table[TileModeT][R16G16B16X16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; - table[TileModeT][R16G16B16X16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; - table[TileModeT][B8G8R8A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; - table[TileModeT][B8G8R8A8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; - - // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now - table[TileModeT][R10G10B10A2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; - table[TileModeT][R10G10B10A2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; - table[TileModeT][R10G10B10A2_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; - - table[TileModeT][R8G8B8A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; - table[TileModeT][R8G8B8A8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; - table[TileModeT][R8G8B8A8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; - table[TileModeT][R8G8B8A8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; - table[TileModeT][R8G8B8A8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; - table[TileModeT][R16G16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; - table[TileModeT][R16G16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; - table[TileModeT][R16G16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_SINT>::Store; - table[TileModeT][R16G16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_UINT>::Store; - table[TileModeT][R16G16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; - - // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now - table[TileModeT][B10G10R10A2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; - table[TileModeT][B10G10R10A2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; - table[TileModeT][R11G11B10_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; - - table[TileModeT][R32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_SINT>::Store; - table[TileModeT][R32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_UINT>::Store; - table[TileModeT][R32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_FLOAT>::Store; - table[TileModeT][A32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, A32_FLOAT>::Store; - table[TileModeT][B8G8R8X8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; - table[TileModeT][B8G8R8X8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; - table[TileModeT][R8G8B8X8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; - table[TileModeT][R8G8B8X8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; - - // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now - table[TileModeT][B10G10R10X2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; - table[TileModeT][B5G6R5_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store; - table[TileModeT][B5G6R5_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; - table[TileModeT][B5G5R5A1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; - table[TileModeT][B5G5R5A1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; - table[TileModeT][B4G4R4A4_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric; - table[TileModeT][B4G4R4A4_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; - - table[TileModeT][R8G8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; - table[TileModeT][R8G8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; - table[TileModeT][R8G8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_SINT>::Store; - table[TileModeT][R8G8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_UINT>::Store; - table[TileModeT][R16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16_UNORM>::Store; - table[TileModeT][R16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16_SNORM>::Store; - table[TileModeT][R16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_SINT>::Store; - table[TileModeT][R16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_UINT>::Store; - table[TileModeT][R16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_FLOAT>::Store; - table[TileModeT][A16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, A16_UNORM>::Store; - table[TileModeT][A16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, A16_FLOAT>::Store; - - // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now - table[TileModeT][B5G5R5X1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; - table[TileModeT][B5G5R5X1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; - - table[TileModeT][R8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UNORM>::Store; - table[TileModeT][R8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8_SNORM>::Store; - table[TileModeT][R8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_SINT>::Store; - table[TileModeT][R8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UINT>::Store; - table[TileModeT][A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, A8_UNORM>::Store; - table[TileModeT][BC1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC1_UNORM>::Store; - table[TileModeT][BC2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC2_UNORM>::Store; - table[TileModeT][BC3_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC3_UNORM>::Store; - table[TileModeT][BC4_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC4_UNORM>::Store; - table[TileModeT][BC5_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC5_UNORM>::Store; - table[TileModeT][BC1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::Store; - table[TileModeT][BC2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::Store; - table[TileModeT][BC3_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::Store; - table[TileModeT][R8G8B8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; - table[TileModeT][R8G8B8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; - table[TileModeT][BC4_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC4_SNORM>::Store; - table[TileModeT][BC5_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC5_SNORM>::Store; - table[TileModeT][R16G16B16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; - table[TileModeT][R16G16B16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store; - table[TileModeT][R16G16B16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store; - table[TileModeT][R8G8B8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store; - table[TileModeT][R16G16B16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store; - table[TileModeT][R16G16B16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store; - - // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now - table[TileModeT][R10G10B10A2_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric; - table[TileModeT][R10G10B10A2_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric; - table[TileModeT][B10G10R10A2_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric; - table[TileModeT][B10G10R10A2_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric; - table[TileModeT][B10G10R10A2_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric; - - table[TileModeT][R8G8B8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store; - table[TileModeT][R8G8B8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store; -} - -////////////////////////////////////////////////////////////////////////// -/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. -template -void InitStoreTilesTableDepth( - PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) -{ - table[TileModeT][R32_FLOAT] = StoreMacroTile, R32_FLOAT, R32_FLOAT>::Store; - table[TileModeT][R24_UNORM_X8_TYPELESS] = StoreMacroTile, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store; - table[TileModeT][R16_UNORM] = StoreMacroTile, R32_FLOAT, R16_UNORM>::Store; -} - -template -void InitStoreTilesTableStencil( - PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) -{ - table[TileModeT][R8_UINT] = StoreMacroTile, R8_UINT, R8_UINT>::Store; -} ////////////////////////////////////////////////////////////////////////// /// @brief Sets up tables for StoreTile @@ -1704,16 +126,11 @@ void InitSimStoreTilesTable() memset(sStoreTilesTableColor, 0, sizeof(sStoreTilesTableColor)); memset(sStoreTilesTableDepth, 0, sizeof(sStoreTilesTableDepth)); - InitStoreTilesTableColor(sStoreTilesTableColor); - InitStoreTilesTableDepth(sStoreTilesTableDepth); - InitStoreTilesTableStencil(sStoreTilesTableStencil); - - InitStoreTilesTableColor(sStoreTilesTableColor); - InitStoreTilesTableColor(sStoreTilesTableColor); - - InitStoreTilesTableDepth(sStoreTilesTableDepth); - InitStoreTilesTableStencil(sStoreTilesTableStencil); - - // special color hot tile -> 8-bit WMAJOR - sStoreTilesTableColor[SWR_TILE_MODE_WMAJOR][R8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UINT>::Store; + InitStoreTilesTable_Linear_1(); + InitStoreTilesTable_Linear_2(); + InitStoreTilesTable_TileX_1(); + InitStoreTilesTable_TileX_2(); + InitStoreTilesTable_TileY_1(); + InitStoreTilesTable_TileY_2(); + InitStoreTilesTable_TileW(); } diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h new file mode 100644 index 00000000000..af3be09585f --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h @@ -0,0 +1,1637 @@ +/**************************************************************************** +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file StoreTile.h +* +* @brief Functionality for Store. +* +******************************************************************************/ +#pragma once + +#include "common/os.h" +#include "common/formats.h" +#include "core/context.h" +#include "core/rdtsc_core.h" +#include "core/format_conversion.h" + +#include "memory/TilingFunctions.h" +#include "memory/tilingtraits.h" +#include "memory/Convert.h" +#include "core/multisample.h" + +#include +#include + +// Function pointer to different storing functions for color, depth, and stencil based on incoming formats. +typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); + +////////////////////////////////////////////////////////////////////////// +/// Store Raster Tile Function Tables. +////////////////////////////////////////////////////////////////////////// +extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; +extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; +extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; + +void InitStoreTilesTable_Linear_1(); +void InitStoreTilesTable_Linear_2(); +void InitStoreTilesTable_TileX_1(); +void InitStoreTilesTable_TileX_2(); +void InitStoreTilesTable_TileY_1(); +void InitStoreTilesTable_TileY_2(); +void InitStoreTilesTable_TileW(); +void InitStoreTilesTable(); + +////////////////////////////////////////////////////////////////////////// +/// StorePixels +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template +struct StorePixels +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (32-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<8, 2> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) + { + // Each 4-pixel row is 4 bytes. + const uint16_t* pPixSrc = (const uint16_t*)pSrc; + + // Unswizzle from SWR-Z order + uint16_t* pRow = (uint16_t*)ppDsts[0]; + pRow[0] = pPixSrc[0]; + pRow[1] = pPixSrc[2]; + + pRow = (uint16_t*)ppDsts[1]; + pRow[0] = pPixSrc[1]; + pRow[1] = pPixSrc[3]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (32-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<16, 2> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) + { + // Each 4-pixel row is 8 bytes. + const uint32_t* pPixSrc = (const uint32_t*)pSrc; + + // Unswizzle from SWR-Z order + uint32_t* pRow = (uint32_t*)ppDsts[0]; + pRow[0] = pPixSrc[0]; + pRow[1] = pPixSrc[2]; + + pRow = (uint32_t*)ppDsts[1]; + pRow[0] = pPixSrc[1]; + pRow[1] = pPixSrc[3]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (32-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<32, 2> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) + { + // Each 4-pixel row is 16-bytes + __m128i *pZRow01 = (__m128i*)pSrc; + __m128i vQuad00 = _mm_load_si128(pZRow01); + __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); + + __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); + __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); + + _mm_storeu_si128((__m128i*)ppDsts[0], vRow00); + _mm_storeu_si128((__m128i*)ppDsts[1], vRow10); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (32-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<64, 4> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) + { + // Each 4-pixel row is 32 bytes. + const __m128i* pPixSrc = (const __m128i*)pSrc; + + // order of pointers match SWR-Z layout + __m128i** pvDsts = (__m128i**)&ppDsts[0]; + *pvDsts[0] = pPixSrc[0]; + *pvDsts[1] = pPixSrc[1]; + *pvDsts[2] = pPixSrc[2]; + *pvDsts[3] = pPixSrc[3]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (32-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<128, 8> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) + { + // Each 4-pixel row is 64 bytes. + const __m128i* pPixSrc = (const __m128i*)pSrc; + + // Unswizzle from SWR-Z order + __m128i** pvDsts = (__m128i**)&ppDsts[0]; + *pvDsts[0] = pPixSrc[0]; + *pvDsts[1] = pPixSrc[2]; + *pvDsts[2] = pPixSrc[1]; + *pvDsts[3] = pPixSrc[3]; + *pvDsts[4] = pPixSrc[4]; + *pvDsts[5] = pPixSrc[6]; + *pvDsts[6] = pPixSrc[5]; + *pvDsts[7] = pPixSrc[7]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) +////////////////////////////////////////////////////////////////////////// +template +struct ConvertPixelsSOAtoAOS +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Converts a SIMD from the Hot Tile to the destination format + /// and converts from SOA to AOS. + /// @param pSrc - Pointer to raster tile. + /// @param pDst - Pointer to destination surface or deswizzling buffer. + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel + + OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; + OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; + + // Convert from SrcFormat --> DstFormat + simdvector src; + LoadSOA(pSrc, src); + StoreSOA(src, soaTile); + + // Convert from SOA --> AOS + FormatTraits::TransposeT::Transpose(soaTile, aosTile); + + // Store data into destination + StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) +/// Specialization for no format conversion +////////////////////////////////////////////////////////////////////////// +template +struct ConvertPixelsSOAtoAOS +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Converts a SIMD from the Hot Tile to the destination format + /// and converts from SOA to AOS. + /// @param pSrc - Pointer to raster tile. + /// @param pDst - Pointer to destination surface or deswizzling buffer. + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel + + OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; + + // Convert from SOA --> AOS + FormatTraits::TransposeT::Transpose(pSrc, aosTile); + + // Store data into destination + StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM +////////////////////////////////////////////////////////////////////////// +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Converts a SIMD from the Hot Tile to the destination format + /// and converts from SOA to AOS. + /// @param pSrc - Pointer to raster tile. + /// @param pDst - Pointer to destination surface or deswizzling buffer. + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; + static const SWR_FORMAT DstFormat = B5G6R5_UNORM; + static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel + + OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; + + // Load hot-tile + simdvector src, dst; + LoadSOA(pSrc, src); + + // deswizzle + dst.x = src[FormatTraits::swizzle(0)]; + dst.y = src[FormatTraits::swizzle(1)]; + dst.z = src[FormatTraits::swizzle(2)]; + + // clamp + dst.x = Clamp(dst.x, 0); + dst.y = Clamp(dst.y, 1); + dst.z = Clamp(dst.z, 2); + + // normalize + dst.x = Normalize(dst.x, 0); + dst.y = Normalize(dst.y, 1); + dst.z = Normalize(dst.z, 2); + + // pack + simdscalari packed = _simd_castps_si(dst.x); + packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits::GetBPC(0))); + packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits::GetBPC(0) + + FormatTraits::GetBPC(1))); + + // pack low 16 bits of each 32 bit lane to low 128 bits of dst + uint32_t *pPacked = (uint32_t*)&packed; + uint16_t *pAosTile = (uint16_t*)&aosTile[0]; + for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t) + { + *pAosTile++ = *pPacked++; + } + + // Store data into destination + StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) +////////////////////////////////////////////////////////////////////////// +template<> +struct ConvertPixelsSOAtoAOS +{ + static const SWR_FORMAT SrcFormat = R32_FLOAT; + static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Converts a SIMD from the Hot Tile to the destination format + /// and converts from SOA to AOS. + /// @param pSrc - Pointer to raster tile. + /// @param pDst - Pointer to destination surface or deswizzling buffer. + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel + + OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; + OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; + + // Convert from SrcFormat --> DstFormat + simdvector src; + LoadSOA(pSrc, src); + StoreSOA(src, soaTile); + + // Convert from SOA --> AOS + FormatTraits::TransposeT::Transpose(soaTile, aosTile); + + // Store data into destination but don't overwrite the X8 bits + // Each 4-pixel row is 16-bytes + __m128i *pZRow01 = (__m128i*)aosTile; + __m128i vQuad00 = _mm_load_si128(pZRow01); + __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); + + __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); + __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); + + __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]); + __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]); + + __m128i vMask = _mm_set1_epi32(0xFFFFFF); + + vDst0 = _mm_andnot_si128(vMask, vDst0); + vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask)); + vDst1 = _mm_andnot_si128(vMask, vDst1); + vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask)); + + _mm_storeu_si128((__m128i*)ppDsts[0], vDst0); + _mm_storeu_si128((__m128i*)ppDsts[1], vDst1); + } +}; + +template +INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) +{ + static const uint32_t offset = sizeof(simdscalar); + + // swizzle rgba -> bgra while we load + simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(0))*offset)); // float32 rrrrrrrr + simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(1))*offset)); // float32 gggggggg + simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(2))*offset)); // float32 bbbbbbbb + simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(3))*offset)); // float32 aaaaaaaa + + // clamp + vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); + vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); + + vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); + vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); + + vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); + vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); + + vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps()); + vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f)); + + if (FormatTraits::isSRGB) + { + // Gamma-correct only rgb + vComp0 = FormatTraits::convertSrgb(0, vComp0); + vComp1 = FormatTraits::convertSrgb(1, vComp1); + vComp2 = FormatTraits::convertSrgb(2, vComp2); + } + + // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format + vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits::fromFloat(0))); + vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits::fromFloat(1))); + vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits::fromFloat(2))); + vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits::fromFloat(3))); + + // moving to 8 wide integer vector types + __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr + __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg + __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb + __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa + +#if KNOB_ARCH == KNOB_ARCH_AVX + + // splitting into two sets of 4 wide integer vector types + // because AVX doesn't have instructions to support this operation at 8 wide + __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r + __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g + __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b + __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a + + __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r + __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g + __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b + __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a + + srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 + srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 + srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 + srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 + srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000 + srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000 + + srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr + srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00 + + srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr + srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00 + + srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr + srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr + + // unpack into rows that get the tiling order correct + __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr + __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); + + __m256i final = _mm256_castsi128_si256(vRow00); + final = _mm256_insertf128_si256(final, vRow10, 1); + +#elif KNOB_ARCH >= KNOB_ARCH_AVX2 + + // logic is as above, only wider + src1 = _mm256_slli_si256(src1, 1); + src2 = _mm256_slli_si256(src2, 2); + src3 = _mm256_slli_si256(src3, 3); + + src0 = _mm256_or_si256(src0, src1); + src2 = _mm256_or_si256(src2, src3); + + __m256i final = _mm256_or_si256(src0, src2); + + // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 + final = _mm256_permute4x64_epi64(final, 0xD8); + +#endif + + _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); +} + +template +INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) +{ + static const uint32_t offset = sizeof(simdscalar); + + // swizzle rgba -> bgra while we load + simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(0))*offset)); // float32 rrrrrrrr + simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(1))*offset)); // float32 gggggggg + simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(2))*offset)); // float32 bbbbbbbb + // clamp + vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); + vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); + + vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); + vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); + + vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); + vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); + + if (FormatTraits::isSRGB) + { + // Gamma-correct only rgb + vComp0 = FormatTraits::convertSrgb(0, vComp0); + vComp1 = FormatTraits::convertSrgb(1, vComp1); + vComp2 = FormatTraits::convertSrgb(2, vComp2); + } + + // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format + vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits::fromFloat(0))); + vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits::fromFloat(1))); + vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits::fromFloat(2))); + + // moving to 8 wide integer vector types + __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr + __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg + __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb + +#if KNOB_ARCH == KNOB_ARCH_AVX + + // splitting into two sets of 4 wide integer vector types + // because AVX doesn't have instructions to support this operation at 8 wide + __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r + __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g + __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b + + __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r + __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g + __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b + + srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 + srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 + srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 + srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 + + srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr + + srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr + + srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr + srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr + + // unpack into rows that get the tiling order correct + __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr + __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); + + __m256i final = _mm256_castsi128_si256(vRow00); + final = _mm256_insertf128_si256(final, vRow10, 1); + +#elif KNOB_ARCH >= KNOB_ARCH_AVX2 + + // logic is as above, only wider + src1 = _mm256_slli_si256(src1, 1); + src2 = _mm256_slli_si256(src2, 2); + + src0 = _mm256_or_si256(src0, src1); + + __m256i final = _mm256_or_si256(src0, src2); + + // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 + final = _mm256_permute4x64_epi64(final, 0xD8); + +#endif + + _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); +} + +template<> +struct ConvertPixelsSOAtoAOS +{ + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS +{ + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > +{ + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > +{ + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > +{ + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > +{ + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > +{ + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > +{ + template + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StoreRasterTile +////////////////////////////////////////////////////////////////////////// +template +struct StoreRasterTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Retrieve color from hot tile source which is always float. + /// @param pSrc - Pointer to raster tile. + /// @param x, y - Coordinates to raster tile. + /// @param output - output color + INLINE static void GetSwizzledSrcColor( + uint8_t* pSrc, + uint32_t x, uint32_t y, + float outputColor[4]) + { + typedef SimdTile SimdT; + + SimdT* pSrcSimdTiles = (SimdT*)pSrc; + + // Compute which simd tile we're accessing within 8x8 tile. + // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. + uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); + + SimdT* pSimdTile = &pSrcSimdTiles[simdIndex]; + + uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); + + pSimdTile->GetSwizzledColor(simdOffset, outputColor); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. + { + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + + // For each raster tile pixel (rx, ry) + for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) + { + for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) + { + // Perform bounds checking. + if (((x + rx) < lodWidth) && + ((y + ry) < lodHeight)) + { + float srcColor[4]; + GetSwizzledSrcColor(pSrc, rx, ry, srcColor); + + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress((x + rx), (y + ry), + pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, + sampleNum, pDstSurface->lod, pDstSurface); + { + ConvertPixelFromFloat(pDst, srcColor); + } + } + } + } + } +}; + +template +struct OptStoreRasterTile : StoreRasterTile +{}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp +////////////////////////////////////////////////////////////////////////// +template +struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > +{ + typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); + + ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; + ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; + pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; + } + + ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; + ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp +////////////////////////////////////////////////////////////////////////// +template +struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > +{ + typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); + + ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; + ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; + pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; + } + + ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; + ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp +////////////////////////////////////////////////////////////////////////// +template +struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > +{ + typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); + + ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; + ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; + pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; + } + + ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; + ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp +////////////////////////////////////////////////////////////////////////// +template +struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> +{ + typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + static const size_t MAX_DST_COLUMN_BYTES = 16; + static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; + static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* ppDsts[] = + { + pDst, // row 0, col 0 + pDst + pDstSurface->pitch, // row 1, col 0 + pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 + }; + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + uint8_t* ppStartRows[] = + { + ppDsts[0], + ppDsts[1], + ppDsts[2], + ppDsts[3], + }; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); + + ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; + ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; + ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; + ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; + pSrc += SRC_COLUMN_BYTES; + } + + ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch; + ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch; + ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; + ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp +////////////////////////////////////////////////////////////////////////// +template +struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> +{ + typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + static const size_t MAX_DST_COLUMN_BYTES = 16; + static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; + static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + struct DstPtrs + { + uint8_t* ppDsts[8]; + } ptrs; + + // Need 8 pointers, 4 columns of 2 rows each + for (uint32_t y = 0; y < 2; ++y) + { + for (uint32_t x = 0; x < 4; ++x) + { + ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; + } + } + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + DstPtrs startPtrs = ptrs; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); + + ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; + pSrc += SRC_COLUMN_BYTES; + } + + ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch; + ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch; + ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch; + ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch; + ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch; + ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch; + ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; + ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp +////////////////////////////////////////////////////////////////////////// +template +struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> +{ + typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + + uint8_t* pRow = pCol0 + rowOffset; + uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; + + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestRowWidthBytes / 4; + ppDsts[1] += DestRowWidthBytes / 4; + + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp +////////////////////////////////////////////////////////////////////////// +template +struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> +{ + typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + + uint8_t* pRow = pCol0 + rowOffset; + uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; + + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestRowWidthBytes / 2; + ppDsts[1] += DestRowWidthBytes / 2; + + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp +////////////////////////////////////////////////////////////////////////// +template +struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> +{ + typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 512; // 512B rows + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* pRow1 = pRow0 + DestRowWidthBytes; + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM) + { + uint32_t xRowOffset = col * (FormatTraits::bpp / 8); + + uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset }; + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + pSrc += (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; + } + + pRow0 += (DestRowWidthBytes * 2); + pRow1 += (DestRowWidthBytes * 2); + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp +////////////////////////////////////////////////////////////////////////// +template +struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> +{ + typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + + uint8_t* pRow = pCol0 + rowOffset; + uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; + + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestColumnBytes; + ppDsts[1] += DestColumnBytes; + + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp +////////////////////////////////////////////////////////////////////////// +template +struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> +{ + typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* pCol1 = pCol0 + DestColumnBytes; + + // There are 4 columns, each 2 pixels wide when we have 64bpp pixels. + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + uint8_t* ppDsts[] = + { + pCol0 + rowOffset, + pCol0 + rowOffset + DestRowWidthBytes, + pCol1 + rowOffset, + pCol1 + rowOffset + DestRowWidthBytes, + }; + + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestColumnBytes * 2; + ppDsts[1] += DestColumnBytes * 2; + ppDsts[2] += DestColumnBytes * 2; + ppDsts[3] += DestColumnBytes * 2; + + ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp +////////////////////////////////////////////////////////////////////////// +template +struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> +{ + typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; + + static const size_t TILE_Y_COL_WIDTH_BYTES = 16; + static const size_t TILE_Y_ROWS = 32; + static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; + + static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; + static const size_t MAX_DST_COLUMN_BYTES = 16; + + static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; + static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + struct DstPtrs + { + uint8_t* ppDsts[8]; + } ptrs; + + // Need 8 pointers, 4 columns of 2 rows each + for (uint32_t y = 0; y < 2; ++y) + { + for (uint32_t x = 0; x < 4; ++x) + { + ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES; + } + } + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + DstPtrs startPtrs = ptrs; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); + + ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; + pSrc += SRC_COLUMN_BYTES; + } + + ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StoreMacroTile - Stores a macro tile which consists of raster tiles. +////////////////////////////////////////////////////////////////////////// +template +struct StoreMacroTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores a macrotile to the destination surface using safe implementation. + /// @param pSrc - Pointer to macro tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to macro tile + static void StoreGeneric( + uint8_t *pSrcHotTile, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) + { + PFN_STORE_TILES_INTERNAL pfnStore; + pfnStore = StoreRasterTile::Store; + + // Store each raster tile from the hot tile to the destination surface. + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) + { + pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); + pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); + } + } + } + + } + + typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t); + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores a macrotile to the destination surface. + /// @param pSrc - Pointer to macro tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to macro tile + static void Store( + uint8_t *pSrcHotTile, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) + { + PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; + + for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) + { + size_t dstSurfAddress = (size_t)ComputeSurfaceAddress( + 0, + 0, + pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces + pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays + sampleNum, + pDstSurface->lod, + pDstSurface); + + // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear + bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || + (pDstSurface->bInterleavedSamples); + + pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile::Store : OptStoreRasterTile::Store; + } + + // Store each raster tile from the hot tile to the destination surface. + for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) + { + pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); + pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); + } + } + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// InitStoreTilesTable - Helper for setting up the tables. +template +void InitStoreTilesTableColor_Half1( + PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) +{ + table[TileModeT][R32G32B32A32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; + table[TileModeT][R32G32B32A32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; + table[TileModeT][R32G32B32A32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; + table[TileModeT][R32G32B32X32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; + table[TileModeT][R32G32B32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; + table[TileModeT][R32G32B32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; + table[TileModeT][R32G32B32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; + table[TileModeT][R16G16B16A16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; + table[TileModeT][R16G16B16A16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; + table[TileModeT][R16G16B16A16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; + table[TileModeT][R16G16B16A16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; + table[TileModeT][R16G16B16A16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; + table[TileModeT][R32G32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; + table[TileModeT][R32G32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_SINT>::Store; + table[TileModeT][R32G32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_UINT>::Store; + table[TileModeT][R16G16B16X16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; + table[TileModeT][R16G16B16X16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; + table[TileModeT][B8G8R8A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; + table[TileModeT][B8G8R8A8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][R10G10B10A2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; + table[TileModeT][R10G10B10A2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; + table[TileModeT][R10G10B10A2_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; + + table[TileModeT][R8G8B8A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; + table[TileModeT][R8G8B8A8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; + table[TileModeT][R8G8B8A8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; + table[TileModeT][R8G8B8A8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; + table[TileModeT][R8G8B8A8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; + table[TileModeT][R16G16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; + table[TileModeT][R16G16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; + table[TileModeT][R16G16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_SINT>::Store; + table[TileModeT][R16G16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_UINT>::Store; + table[TileModeT][R16G16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][B10G10R10A2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; + table[TileModeT][B10G10R10A2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; + table[TileModeT][R11G11B10_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; + + table[TileModeT][R32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_SINT>::Store; + table[TileModeT][R32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_UINT>::Store; + table[TileModeT][R32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_FLOAT>::Store; + table[TileModeT][A32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, A32_FLOAT>::Store; + table[TileModeT][B8G8R8X8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; + table[TileModeT][B8G8R8X8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; + table[TileModeT][R8G8B8X8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; + table[TileModeT][R8G8B8X8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; +} + +template +void InitStoreTilesTableColor_Half2( + PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT]) +{ + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][B10G10R10X2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; + table[TileModeT][B5G6R5_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store; + table[TileModeT][B5G6R5_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; + table[TileModeT][B5G5R5A1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; + table[TileModeT][B5G5R5A1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; + table[TileModeT][B4G4R4A4_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric; + table[TileModeT][B4G4R4A4_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; + + table[TileModeT][R8G8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; + table[TileModeT][R8G8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; + table[TileModeT][R8G8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_SINT>::Store; + table[TileModeT][R8G8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_UINT>::Store; + table[TileModeT][R16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16_UNORM>::Store; + table[TileModeT][R16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16_SNORM>::Store; + table[TileModeT][R16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_SINT>::Store; + table[TileModeT][R16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_UINT>::Store; + table[TileModeT][R16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_FLOAT>::Store; + table[TileModeT][A16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, A16_UNORM>::Store; + table[TileModeT][A16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, A16_FLOAT>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][B5G5R5X1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; + table[TileModeT][B5G5R5X1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; + + table[TileModeT][R8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UNORM>::Store; + table[TileModeT][R8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8_SNORM>::Store; + table[TileModeT][R8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_SINT>::Store; + table[TileModeT][R8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UINT>::Store; + table[TileModeT][A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, A8_UNORM>::Store; + table[TileModeT][BC1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC1_UNORM>::Store; + table[TileModeT][BC2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC2_UNORM>::Store; + table[TileModeT][BC3_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC3_UNORM>::Store; + table[TileModeT][BC4_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC4_UNORM>::Store; + table[TileModeT][BC5_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC5_UNORM>::Store; + table[TileModeT][BC1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::Store; + table[TileModeT][BC2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::Store; + table[TileModeT][BC3_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::Store; + table[TileModeT][R8G8B8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; + table[TileModeT][R8G8B8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; + table[TileModeT][BC4_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC4_SNORM>::Store; + table[TileModeT][BC5_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC5_SNORM>::Store; + table[TileModeT][R16G16B16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; + table[TileModeT][R16G16B16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store; + table[TileModeT][R16G16B16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store; + table[TileModeT][R8G8B8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store; + table[TileModeT][R16G16B16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store; + table[TileModeT][R16G16B16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][R10G10B10A2_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric; + table[TileModeT][R10G10B10A2_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric; + table[TileModeT][B10G10R10A2_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric; + table[TileModeT][B10G10R10A2_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric; + table[TileModeT][B10G10R10A2_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric; + + table[TileModeT][R8G8B8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store; + table[TileModeT][R8G8B8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store; +} + +////////////////////////////////////////////////////////////////////////// +/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. +template +void InitStoreTilesTableDepth( + PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) +{ + table[TileModeT][R32_FLOAT] = StoreMacroTile, R32_FLOAT, R32_FLOAT>::Store; + table[TileModeT][R24_UNORM_X8_TYPELESS] = StoreMacroTile, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store; + table[TileModeT][R16_UNORM] = StoreMacroTile, R32_FLOAT, R16_UNORM>::Store; +} + +template +void InitStoreTilesTableStencil( + PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) +{ + table[TileModeT][R8_UINT] = StoreMacroTile, R8_UINT, R8_UINT>::Store; +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear.cpp new file mode 100644 index 00000000000..c72063f6f1d --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear.cpp @@ -0,0 +1,35 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file StoreTile_Linear.cpp +* +* @brief Functionality for Store. +* +******************************************************************************/ +#include "StoreTile.h" + +void InitStoreTilesTable_Linear_1() +{ + InitStoreTilesTableColor_Half1(sStoreTilesTableColor); + InitStoreTilesTableDepth(sStoreTilesTableDepth); + InitStoreTilesTableStencil(sStoreTilesTableStencil); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear2.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear2.cpp new file mode 100644 index 00000000000..035e685e261 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_Linear2.cpp @@ -0,0 +1,33 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file StoreTile_Linear.cpp +* +* @brief Functionality for Store. +* +******************************************************************************/ +#include "StoreTile.h" + +void InitStoreTilesTable_Linear_2() +{ + InitStoreTilesTableColor_Half2(sStoreTilesTableColor); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileW.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileW.cpp new file mode 100644 index 00000000000..ee4d99d1da0 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileW.cpp @@ -0,0 +1,35 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file StoreTile_TileW.cpp +* +* @brief Functionality for Store. +* +******************************************************************************/ +#include "StoreTile.h" + +void InitStoreTilesTable_TileW() +{ + InitStoreTilesTableStencil(sStoreTilesTableStencil); + // special color hot tile -> 8-bit WMAJOR + sStoreTilesTableColor[SWR_TILE_MODE_WMAJOR][R8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UINT>::Store; +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX.cpp new file mode 100644 index 00000000000..7f49a432e92 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX.cpp @@ -0,0 +1,33 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file StoreTile_TIleX.cpp +* +* @brief Functionality for Store. +* +******************************************************************************/ +#include "StoreTile.h" + +void InitStoreTilesTable_TileX_1() +{ + InitStoreTilesTableColor_Half1(sStoreTilesTableColor); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX2.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX2.cpp new file mode 100644 index 00000000000..7e36ebececb --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileX2.cpp @@ -0,0 +1,33 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file StoreTile_TIleX.cpp +* +* @brief Functionality for Store. +* +******************************************************************************/ +#include "StoreTile.h" + +void InitStoreTilesTable_TileX_2() +{ + InitStoreTilesTableColor_Half2(sStoreTilesTableColor); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY.cpp new file mode 100644 index 00000000000..dade03f2523 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY.cpp @@ -0,0 +1,34 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file StoreTile_TileY.cpp +* +* @brief Functionality for Store. +* +******************************************************************************/ +#include "StoreTile.h" + +void InitStoreTilesTable_TileY_1() +{ + InitStoreTilesTableColor_Half1(sStoreTilesTableColor); + InitStoreTilesTableDepth(sStoreTilesTableDepth); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY2.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY2.cpp new file mode 100644 index 00000000000..b3ac76759fd --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile_TileY2.cpp @@ -0,0 +1,33 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file StoreTile_TileY.cpp +* +* @brief Functionality for Store. +* +******************************************************************************/ +#include "StoreTile.h" + +void InitStoreTilesTable_TileY_2() +{ + InitStoreTilesTableColor_Half2(sStoreTilesTableColor); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h index a14f3bf3f7c..9b412f8b344 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h +++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h @@ -565,6 +565,8 @@ uint32_t ComputeSurfaceOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t array return (uint32_t) NULL; } +typedef void*(*PFN_COMPUTESURFADDR)(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, const SWR_SURFACE_STATE*); + ////////////////////////////////////////////////////////////////////////// /// @brief Computes surface address at the given location and lod /// @param x - x location in pixels @@ -573,7 +575,7 @@ uint32_t ComputeSurfaceOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t array /// @param array - array slice for 1D and 2D surfaces /// @param lod - level of detail /// @param pState - pointer to the surface state -template +template INLINE void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) { diff --git a/src/gallium/drivers/swr/swr_memory.h b/src/gallium/drivers/swr/swr_memory.h index e68dce0f8d1..9ef468a90d4 100644 --- a/src/gallium/drivers/swr/swr_memory.h +++ b/src/gallium/drivers/swr/swr_memory.h @@ -30,7 +30,7 @@ void LoadHotTile( UINT x, UINT y, uint32_t renderTargetArrayIndex, uint8_t *pDstHotTile); -void StoreHotTile( +void StoreHotTileToSurface( SWR_SURFACE_STATE *pDstSurface, SWR_FORMAT srcFormat, SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, @@ -69,7 +69,7 @@ swr_StoreHotTile(HANDLE hPrivateContext, swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex]; - StoreHotTile(pDstSurface, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile); + StoreHotTileToSurface(pDstSurface, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile); } INLINE void