swr: [rasterizer core] split FE and BE stats
author Tim Rowley <timothy.o.rowley@intel.com>
Sun, 7 Aug 2016 02:10:14 +0000 (20:10 -0600)
committer Tim Rowley <timothy.o.rowley@intel.com>
Wed, 10 Aug 2016 16:08:51 +0000 (11:08 -0500)
Separated the front-end (FE) stats out into their own structure.  There are
17 FE vs 3 back-end (BE) stat fields.  Since there is only one FE thread per
draw context (DC), we don't have to loop over all the worker threads and sum
up the FE stats.  This also reduces the size of the DC since we only need to
store one copy of the FE stats and not one per worker.  Finally, we can use
the new FE callback mechanism to update these.

Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/api.h
src/gallium/drivers/swr/rasterizer/core/clip.h
src/gallium/drivers/swr/rasterizer/core/context.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/state.h
src/gallium/drivers/swr/rasterizer/core/threads.cpp
src/gallium/drivers/swr/swr_context.cpp
src/gallium/drivers/swr/swr_context.h
src/gallium/drivers/swr/swr_query.cpp
src/gallium/drivers/swr/swr_query.h

index 0797c8ac86b70932eec66ae74db048682a7a9395..d6aa80d678f615acbe157247920300ecbd8aa7af 100644 (file)
@@ -144,6 +144,7 @@ HANDLE SwrCreateContext(
     pContext->pfnClearTile = pCreateInfo->pfnClearTile;
     pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
     pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
+    pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
 
     // pass pointer to bucket manager back to caller
 #ifdef KNOB_ENABLE_RDTSC
index 4ee04dc1368f70ab6d6f744e8b670b05747980ee..ed18fe01010b59bbba9b66c09eff5e594d4fa196 100644 (file)
@@ -95,6 +95,16 @@ typedef void(SWR_API *PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext,
 typedef void(SWR_API *PFN_UPDATE_STATS)(HANDLE hPrivateContext,
     const SWR_STATS* pStats);
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Callback to allow driver to update their copy of FE stats.
+/// @note It's optimal to have a separate callback for FE stats since
+///       there is only one FE thread per DC. This means we do not have
+///       to sum up the stats across all of the workers.
+/// @param hPrivateContext - handle to private data
+/// @param pStats - pointer to draw stats
+typedef void(SWR_API *PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext,
+    const SWR_STATS_FE* pStats);
+
 class BucketManager;
 
 //////////////////////////////////////////////////////////////////////////
@@ -121,11 +131,12 @@ struct SWR_CREATECONTEXT_INFO
     uint32_t privateStateSize;
 
     // Callback functions
-    PFN_LOAD_TILE pfnLoadTile;
-    PFN_STORE_TILE pfnStoreTile;
-    PFN_CLEAR_TILE pfnClearTile;
-    PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
-    PFN_UPDATE_STATS pfnUpdateStats;
+    PFN_LOAD_TILE               pfnLoadTile;
+    PFN_STORE_TILE              pfnStoreTile;
+    PFN_CLEAR_TILE              pfnClearTile;
+    PFN_UPDATE_SO_WRITE_OFFSET  pfnUpdateSoWriteOffset;
+    PFN_UPDATE_STATS            pfnUpdateStats;
+    PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;
 
     // Pointer to rdtsc buckets mgr returned to the caller.
     // Only populated when KNOB_ENABLE_RDTSC is set
index b2b3bb4e6fd406357ff621869ce9f5135a4f7994..a2ba76967fe7b335f6f81cf20d361690d6d9dc0b 100644 (file)
@@ -495,7 +495,7 @@ public:
 
         // update global pipeline stat
         SWR_CONTEXT* pContext = this->pDC->pContext;
-        UPDATE_STAT(CPrimitives, numClippedPrims);
+        UPDATE_STAT_FE(CPrimitives, numClippedPrims);
     }
     
     // execute the clipper stage
@@ -523,7 +523,7 @@ public:
         // update clipper invocations pipeline stat
         SWR_CONTEXT* pContext = this->pDC->pContext;
         uint32_t numInvoc = _mm_popcnt_u32(primMask);
-        UPDATE_STAT(CInvocations, numInvoc);
+        UPDATE_STAT_FE(CInvocations, numInvoc);
 
         ComputeClipCodes(prim);
 
@@ -559,7 +559,7 @@ public:
         {
             // update CPrimitives pipeline state
             SWR_CONTEXT* pContext = this->pDC->pContext;
-            UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask));
+            UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 
             // forward valid prims directly to binner
             pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
index c478ee9c261646d65afd52309ac0b97df5d85da6..144fcefb208555bd1b43b0b394fd09248a9fed1f 100644 (file)
@@ -365,7 +365,8 @@ struct DRAW_DYNAMIC_STATE
     uint32_t SoWriteOffset[4];
     bool     SoWriteOffsetDirty[4];
 
-    SWR_STATS stats[KNOB_MAX_NUM_THREADS];
+    SWR_STATS_FE statsFE;   // Only one FE thread per DC.
+    SWR_STATS    stats[KNOB_MAX_NUM_THREADS];
 };
 
 // Draw Context
@@ -470,11 +471,12 @@ struct SWR_CONTEXT
     HotTileMgr *pHotTileMgr;
 
     // Callback functions, passed in at create context time
-    PFN_LOAD_TILE pfnLoadTile;
-    PFN_STORE_TILE pfnStoreTile;
-    PFN_CLEAR_TILE pfnClearTile;
-    PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
-    PFN_UPDATE_STATS pfnUpdateStats;
+    PFN_LOAD_TILE               pfnLoadTile;
+    PFN_STORE_TILE              pfnStoreTile;
+    PFN_CLEAR_TILE              pfnClearTile;
+    PFN_UPDATE_SO_WRITE_OFFSET  pfnUpdateSoWriteOffset;
+    PFN_UPDATE_STATS            pfnUpdateStats;
+    PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;
 
     // Global Stats
     SWR_STATS stats[KNOB_MAX_NUM_THREADS];
@@ -492,3 +494,4 @@ void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
 void WakeAllThreads(SWR_CONTEXT *pContext);
 
 #define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.stats[workerId].name += count; }
+#define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.statsFE.name += count; }
index e32f743de7e20f336ab1baa617349deb4fa3d1a7..3014c7defc8ce1ef840f8544cb5ac21fc95ce749 100644 (file)
@@ -580,8 +580,8 @@ static void StreamOut(
         }
     }
 
-    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
-    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
+    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
+    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
 
     RDTSC_STOP(FEStreamout, 1, 0);
 }
@@ -843,8 +843,8 @@ static void GeometryShaderStage(
     }
 
     // update GS pipeline stats
-    UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
-    UPDATE_STAT(GsPrimitives, totalPrimsGenerated);
+    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
+    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
 
     RDTSC_STOP(FEGeometryShader, 1, 0);
 }
@@ -1009,7 +1009,7 @@ static void TessellationStages(
     state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
     RDTSC_STOP(FEHullShader, 0, 0);
 
-    UPDATE_STAT(HsInvocations, numPrims);
+    UPDATE_STAT_FE(HsInvocations, numPrims);
 
     const uint32_t* pPrimId = (const uint32_t*)&primID;
 
@@ -1065,7 +1065,7 @@ static void TessellationStages(
 
             dsInvocations += KNOB_SIMD_WIDTH;
         }
-        UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);
+        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
 
         PA_TESS tessPa(
             pDC,
@@ -1302,7 +1302,7 @@ void ProcessDraw(
                     *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                 }
 
-                UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));
+                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
 
 #if KNOB_ENABLE_TOSS_POINTS
                 if (!KNOB_TOSS_FETCH)
@@ -1312,7 +1312,7 @@ void ProcessDraw(
                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                     RDTSC_STOP(FEVertexShader, 0, 0);
 
-                    UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
+                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                 }
             }
 
@@ -1335,7 +1335,7 @@ void ProcessDraw(
                     {
                         if (assemble)
                         {
-                            UPDATE_STAT(IaPrimitives, pa.NumPrims());
+                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
 
                             if (HasTessellationT::value)
                             {
index fdf5d7ef45cf83f3c3dd360f1f859c4aa25e80e4..988de75f4d519dc7d86f32285b39fb73f159d837 100644 (file)
@@ -564,17 +564,27 @@ struct SWR_STATS
     uint64_t DepthPassCount; // Number of passing depth tests. Not exact.
 
     // Pipeline Stats
+    uint64_t PsInvocations;  // Number of Pixel Shader invocations
+    uint64_t CsInvocations;  // Number of Compute Shader invocations
+
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_STATS_FE
+///
+/// @brief All statistics generated by FE.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_STATS_FE
+{
     uint64_t IaVertices;    // Number of Fetch Shader vertices
     uint64_t IaPrimitives;  // Number of PA primitives.
     uint64_t VsInvocations; // Number of Vertex Shader invocations
     uint64_t HsInvocations; // Number of Hull Shader invocations
     uint64_t DsInvocations; // Number of Domain Shader invocations
     uint64_t GsInvocations; // Number of Geometry Shader invocations
-    uint64_t PsInvocations; // Number of Pixel Shader invocations
-    uint64_t CsInvocations; // Number of Compute Shader invocations
+    uint64_t GsPrimitives;  // Number of prims GS outputs.
     uint64_t CInvocations;  // Number of clipper invocations
     uint64_t CPrimitives;   // Number of clipper primitives.
-    uint64_t GsPrimitives;  // Number of prims GS outputs.
 
     // Streamout Stats
     uint64_t SoPrimStorageNeeded[4];
index fb17af1520318fb5b7c83ddfd7ca0842ad0f685a..dce23b2486e3ebf7e61c0eb0635148936a9a32ba 100644 (file)
@@ -322,23 +322,9 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
         stats.DepthPassCount += dynState.stats[i].DepthPassCount;
-        stats.IaVertices     += dynState.stats[i].IaVertices;
-        stats.IaPrimitives   += dynState.stats[i].IaPrimitives;
-        stats.VsInvocations  += dynState.stats[i].VsInvocations;
-        stats.HsInvocations  += dynState.stats[i].HsInvocations;
-        stats.DsInvocations  += dynState.stats[i].DsInvocations;
-        stats.GsInvocations  += dynState.stats[i].GsInvocations;
+
         stats.PsInvocations  += dynState.stats[i].PsInvocations;
-        stats.CInvocations   += dynState.stats[i].CInvocations;
         stats.CsInvocations  += dynState.stats[i].CsInvocations;
-        stats.CPrimitives    += dynState.stats[i].CPrimitives;
-        stats.GsPrimitives   += dynState.stats[i].GsPrimitives;
-
-        for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
-        {
-            stats.SoPrimStorageNeeded[stream] += dynState.stats[i].SoPrimStorageNeeded[stream];
-            stats.SoNumPrimsWritten[stream]   += dynState.stats[i].SoNumPrimsWritten[stream];
-        }
     }
 
     pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
@@ -560,6 +546,11 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 {
     _ReadWriteBarrier();
 
+    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStats)
+    {
+        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE);
+    }
+
     if (pContext->pfnUpdateSoWriteOffset)
     {
         for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
index 53d2b93089b4f0ef013f84f9b1e3f68456dba3b3..15e60cddf0a31b6353d080fa98abe3ec125ccaf3 100644 (file)
@@ -355,15 +355,29 @@ swr_UpdateStats(HANDLE hPrivateContext, const SWR_STATS *pStats)
    struct swr_context *ctx = (struct swr_context *)pDC->swr_ctx;
 
    SWR_STATS *pSwrStats = &ctx->stats;
+
    pSwrStats->DepthPassCount += pStats->DepthPassCount;
+   pSwrStats->PsInvocations += pStats->PsInvocations;
+   pSwrStats->CsInvocations += pStats->CsInvocations;
+}
+
+static void
+swr_UpdateStatsFE(HANDLE hPrivateContext, const SWR_STATS_FE *pStats)
+{
+   swr_draw_context *pDC = (swr_draw_context*)hPrivateContext;
+
+   if (!pDC)
+      return;
+
+   struct swr_context *ctx = (struct swr_context *)pDC->swr_ctx;
+
+   SWR_STATS_FE *pSwrStats = &ctx->statsFE;
    pSwrStats->IaVertices += pStats->IaVertices;
    pSwrStats->IaPrimitives += pStats->IaPrimitives;
    pSwrStats->VsInvocations += pStats->VsInvocations;
    pSwrStats->HsInvocations += pStats->HsInvocations;
    pSwrStats->DsInvocations += pStats->DsInvocations;
    pSwrStats->GsInvocations += pStats->GsInvocations;
-   pSwrStats->PsInvocations += pStats->PsInvocations;
-   pSwrStats->CsInvocations += pStats->CsInvocations;
    pSwrStats->CInvocations += pStats->CInvocations;
    pSwrStats->CPrimitives += pStats->CPrimitives;
    pSwrStats->GsPrimitives += pStats->GsPrimitives;
@@ -389,6 +403,7 @@ swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags)
    createInfo.pfnStoreTile = swr_StoreHotTile;
    createInfo.pfnClearTile = swr_StoreHotTileClear;
    createInfo.pfnUpdateStats = swr_UpdateStats;
+   createInfo.pfnUpdateStatsFE = swr_UpdateStatsFE;
    ctx->swrContext = SwrCreateContext(&createInfo);
 
    /* Init Load/Store/ClearTiles Tables */
index 4133720cbf03c12dcc7c96251f0050111ce7fc60..b4553fb171bf1046d0d3f027274d08f4d06b7ec5 100644 (file)
@@ -159,6 +159,7 @@ struct swr_context {
    struct swr_draw_context swrDC;
 
    SWR_STATS stats;
+   SWR_STATS_FE statsFE;
 
    unsigned dirty; /**< Mask of SWR_NEW_x flags */
 };
index 35d0e53fb2375c5d62e8823f39a8be7ba8ecf889..c51c529e5f3146d76ee1d5c866d13ac9adc05fa5 100644 (file)
@@ -94,6 +94,7 @@ swr_gather_stats(struct pipe_context *pipe, struct swr_query *pq)
       /* TODO: should fence instead of stalling pipeline */
       SwrWaitForIdle(ctx->swrContext);
       memcpy(&result->core, &ctx->stats, sizeof(result->core));
+      memcpy(&result->coreFE, &ctx->statsFE, sizeof(result->coreFE));
 
 #if 0
       if (!pq->fence) {
@@ -150,17 +151,17 @@ swr_get_query_result(struct pipe_context *pipe,
       result->u64 = end->timestamp - start->timestamp;
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
-      result->u64 = end->core.IaPrimitives - start->core.IaPrimitives;
+      result->u64 = end->coreFE.IaPrimitives - start->coreFE.IaPrimitives;
       break;
    case PIPE_QUERY_PRIMITIVES_EMITTED:
-      result->u64 = end->core.SoNumPrimsWritten[index]
-         - start->core.SoNumPrimsWritten[index];
+      result->u64 = end->coreFE.SoNumPrimsWritten[index]
+         - start->coreFE.SoNumPrimsWritten[index];
       break;
    /* Structures */
    case PIPE_QUERY_SO_STATISTICS: {
       struct pipe_query_data_so_statistics *so_stats = &result->so_statistics;
-      struct SWR_STATS *start = &pq->start.core;
-      struct SWR_STATS *end = &pq->end.core;
+      struct SWR_STATS_FE *start = &pq->start.coreFE;
+      struct SWR_STATS_FE *end = &pq->end.coreFE;
       so_stats->num_primitives_written =
          end->SoNumPrimsWritten[index] - start->SoNumPrimsWritten[index];
       so_stats->primitives_storage_needed =
@@ -176,21 +177,23 @@ swr_get_query_result(struct pipe_context *pipe,
          &result->pipeline_statistics;
       struct SWR_STATS *start = &pq->start.core;
       struct SWR_STATS *end = &pq->end.core;
-      p_stats->ia_vertices = end->IaVertices - start->IaVertices;
-      p_stats->ia_primitives = end->IaPrimitives - start->IaPrimitives;
-      p_stats->vs_invocations = end->VsInvocations - start->VsInvocations;
-      p_stats->gs_invocations = end->GsInvocations - start->GsInvocations;
-      p_stats->gs_primitives = end->GsPrimitives - start->GsPrimitives;
-      p_stats->c_invocations = end->CPrimitives - start->CPrimitives;
-      p_stats->c_primitives = end->CPrimitives - start->CPrimitives;
+      struct SWR_STATS_FE *startFE = &pq->start.coreFE;
+      struct SWR_STATS_FE *endFE = &pq->end.coreFE;
+      p_stats->ia_vertices = endFE->IaVertices - startFE->IaVertices;
+      p_stats->ia_primitives = endFE->IaPrimitives - startFE->IaPrimitives;
+      p_stats->vs_invocations = endFE->VsInvocations - startFE->VsInvocations;
+      p_stats->gs_invocations = endFE->GsInvocations - startFE->GsInvocations;
+      p_stats->gs_primitives = endFE->GsPrimitives - startFE->GsPrimitives;
+      p_stats->c_invocations = endFE->CPrimitives - startFE->CPrimitives;
+      p_stats->c_primitives = endFE->CPrimitives - startFE->CPrimitives;
       p_stats->ps_invocations = end->PsInvocations - start->PsInvocations;
-      p_stats->hs_invocations = end->HsInvocations - start->HsInvocations;
-      p_stats->ds_invocations = end->DsInvocations - start->DsInvocations;
+      p_stats->hs_invocations = endFE->HsInvocations - startFE->HsInvocations;
+      p_stats->ds_invocations = endFE->DsInvocations - startFE->DsInvocations;
       p_stats->cs_invocations = end->CsInvocations - start->CsInvocations;
     } break;
    case PIPE_QUERY_SO_OVERFLOW_PREDICATE: {
-      struct SWR_STATS *start = &pq->start.core;
-      struct SWR_STATS *end = &pq->end.core;
+      struct SWR_STATS_FE *start = &pq->start.coreFE;
+      struct SWR_STATS_FE *end = &pq->end.coreFE;
       uint64_t num_primitives_written =
          end->SoNumPrimsWritten[index] - start->SoNumPrimsWritten[index];
       uint64_t primitives_storage_needed =
index 0ab034d397f7c225c8d93bfadfb811defeefa1cb..931d687b0052e43a05dd4fe11fd29e1abb71d7e7 100644 (file)
@@ -29,6 +29,7 @@
 
 struct swr_query_result {
    SWR_STATS core;
+   SWR_STATS_FE coreFE;
    uint64_t timestamp;
 };