swr: [rasterizer] Backend code adjustments
authorTim Rowley <timothy.o.rowley@intel.com>
Thu, 16 Feb 2017 18:53:01 +0000 (10:53 -0800)
committerTim Rowley <timothy.o.rowley@intel.com>
Mon, 20 Mar 2017 23:04:53 +0000 (18:04 -0500)
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/common/simdintrin.h
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/backend.cpp
src/gallium/drivers/swr/rasterizer/core/backend.h
src/gallium/drivers/swr/rasterizer/core/state.h

index ea79902a0021ca1c15bc0d350b81d83810c22a20..562408db8db973a1acffdc2c0110ab09eda1951a 100644 (file)
@@ -659,6 +659,15 @@ simdscalar vMask(int32_t mask)
     return _simd_castsi_ps(vec);
 }
 
+INLINE
+simdscalari vMaski(int32_t mask)
+{
+    __m256i vec = _mm256_set1_epi32(mask);
+    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+    vec = _simd_and_si(vec, bit);
+    return _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
+}
+
 INLINE
 void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
 {
index 90503ba6de46cb37919a9b3f2d821af43e2b020d..f622e94d91f568926b92e75bb4672f94af05d911 100644 (file)
@@ -806,7 +806,6 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
         const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0;
         const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
         const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0;
-
         SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
         
         // select backend function
@@ -817,7 +816,9 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
             {
                 // always need to generate I & J per sample for Z interpolation
                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ];
+                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage]
+                                                                [centroid][forcedSampleCount][canEarlyZ]
+                    ;
             }
             else
             {
index b1bcdb0b39376d0d9684490f7467aea6dbf3a0c1..b915e327426d84d801ced78867f4ca5b534f2021 100644 (file)
@@ -39,6 +39,7 @@
 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
 
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Process compute work.
 /// @param pDC - pointer to draw context (dispatch).
index c3585cc930c26deabb66214937b81e5f4030b011..f022990bf2685e18a13e7060dc16e54927e5c864 100644 (file)
@@ -605,8 +605,10 @@ struct PixelRateZTestLoop
 
             if(psState.writesODepth)
             {
-                // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
-                vZ[sample] = psContext.vZ;
+                {
+                    // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
+                    vZ[sample] = psContext.vZ;
+                }
             }
             else
             {
@@ -713,23 +715,26 @@ INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW
         uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
 
         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
-        // pfnBlendFunc may not update all channels.  Initialize with PS output.
-        /// TODO: move this into the blend JIT.
-        blendOut = psContext.shaded[rt];
 
-        // Blend outputs and update coverage mask for alpha test
-        if(pfnBlendFunc[rt] != nullptr)
         {
-            pfnBlendFunc[rt](
-                pBlendState,
-                psContext.shaded[rt],
-                psContext.shaded[1],
-                psContext.shaded[0].w,
-                sample,
-                pColorSample,
-                blendOut,
-                &psContext.oMask,
-                (simdscalari*)&coverageMask);
+            // pfnBlendFunc may not update all channels.  Initialize with PS output.
+            /// TODO: move this into the blend JIT.
+            blendOut = psContext.shaded[rt];
+
+            // Blend outputs and update coverage mask for alpha test
+            if(pfnBlendFunc[rt] != nullptr)
+            {
+                pfnBlendFunc[rt](
+                    pBlendState,
+                    psContext.shaded[rt],
+                    psContext.shaded[1],
+                    psContext.shaded[0].w,
+                    sample,
+                    pColorSample,
+                    blendOut,
+                    &psContext.oMask,
+                    (simdscalari*)&coverageMask);
+            }
         }
 
         // final write mask 
@@ -782,9 +787,6 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW
         simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
 
         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
-        // pfnBlendFunc may not update all channels.  Initialize with PS output.
-        /// TODO: move this into the blend JIT.
-        blendOut = psContext.shaded[rt];
 
         if (colorBufferBit & colorBufferEnableMask)
         {
@@ -794,19 +796,25 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW
             blendSrc[3] = pColorSample[6];
         }
 
-        // Blend outputs and update coverage mask for alpha test
-        if (pfnBlendFunc[rt] != nullptr)
         {
-            pfnBlendFunc[rt](
-                pBlendState,
-                psContext.shaded[rt],
-                psContext.shaded[1],
-                psContext.shaded[0].w,
-                sample,
-                reinterpret_cast<uint8_t *>(&blendSrc),
-                blendOut,
-                &psContext.oMask,
-                reinterpret_cast<simdscalari *>(&coverageMask));
+            // pfnBlendFunc may not update all channels.  Initialize with PS output.
+            /// TODO: move this into the blend JIT.
+            blendOut = psContext.shaded[rt];
+
+            // Blend outputs and update coverage mask for alpha test
+            if(pfnBlendFunc[rt] != nullptr)
+            {
+                pfnBlendFunc[rt](
+                    pBlendState,
+                    psContext.shaded[rt],
+                    psContext.shaded[1],
+                    psContext.shaded[0].w,
+                    sample,
+                    reinterpret_cast<uint8_t *>(&blendSrc),
+                    blendOut,
+                    &psContext.oMask,
+                    reinterpret_cast<simdscalari *>(&coverageMask));
+            }
         }
 
         // final write mask 
@@ -840,6 +848,9 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW
 template<typename T>
 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
+    ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
+
+
     SWR_CONTEXT *pContext = pDC->pContext;
 
     AR_BEGIN(BEPixelRateBackend, pDC->drawId);
@@ -850,12 +861,12 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     BarycentricCoeffs coeffs;
     SetupBarycentricCoeffs(&coeffs, work);
 
-    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
     SWR_PS_CONTEXT psContext;
     SetupPixelShaderContext<T>(&psContext, work);
 
+    uint8_t *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+
     AR_END(BESetup, 0);
 
     PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
@@ -975,10 +986,10 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 
                 // broadcast the results of the PS to all passing pixels
 #if USE_8x2_TILE_BACKEND
-                OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else
-                OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
-#endif
+                OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+#else // USE_8x2_TILE_BACKEND
+                OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
+#endif // USE_8x2_TILE_BACKEND
 
                 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
                 {
@@ -1009,13 +1020,13 @@ Endtile:
             {
                 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
                 {
-                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                 }
             }
 #else
             for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
             {
-                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
@@ -1035,7 +1046,8 @@ Endtile:
 }
 
 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
-         uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0>
+         uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
+    >
 struct SwrBackendTraits
 {
     static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN);
index 05347dce98670f43d05f327b2207ebb1cf7b8b51..bb1336c429fd0c2078c713bd31138b02bd2b4c1e 100644 (file)
@@ -330,6 +330,8 @@ struct SWR_PS_CONTEXT
 
     uint32_t rasterizerSampleCount; // IN: sample count used by the rasterizer
 
+    uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS];
+                                       // IN: Pointers to render target hottiles
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -511,6 +513,7 @@ struct SWR_SURFACE_STATE
     uint8_t *pAuxBaseAddress;   // Used for compression, append/consume counter, etc.
     SWR_AUX_MODE auxMode;      // @llvm_enum
 
+
     bool bInterleavedSamples;   // are MSAA samples stored interleaved or planar
 };
 
@@ -1087,7 +1090,6 @@ struct SWR_PS_STATE
     uint32_t barycentricsMask   : 3;    // which type(s) of barycentric coords does the PS interpolate attributes with
     uint32_t usesUAV            : 1;    // pixel shader accesses UAV 
     uint32_t forceEarlyZ        : 1;    // force execution of early depth/stencil test
-
 };
 
 // depth bounds state