swr/rast: Rework attribute layout
author Tim Rowley <timothy.o.rowley@intel.com>
Thu, 1 Jun 2017 18:08:04 +0000 (13:08 -0500)
committer Tim Rowley <timothy.o.rowley@intel.com>
Fri, 16 Jun 2017 21:20:16 +0000 (16:20 -0500)
Move fixed attributes to the top and pack single-component SGVs.
WIP to support dynamically allocated vertex size.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/core/binner.cpp
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/state.h
src/gallium/drivers/swr/swr_shader.cpp

index 06078cdcfa1a27dc169b4bc389a02732fb9c5759..4f8498d8b9735e44dd3356d9a1adffbd3cd67355 100644 (file)
@@ -766,9 +766,9 @@ endBinTriangles:
     if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
     {
         simdvector vRtai[3];
-        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
+        pa.Assemble(VERTEX_SGV_SLOT, vRtai);
         simdscalari vRtaii;
-        vRtaii = _simd_castps_si(vRtai[0].x);
+        vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
         _simd_store_si((simdscalari*)aRTAI, vRtaii);
     }
     else
@@ -1114,10 +1114,14 @@ void SIMDAPI BinTriangles_simd16(
         scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
     }
 
+    // Make triangle bbox inclusive
+    bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
+    bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
+
     bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
     bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
-    bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+    bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
+    bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
 
     if (CT::IsConservativeT::value)
     {
@@ -1212,9 +1216,9 @@ endBinTriangles:
     if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
     {
         simd16vector vRtai[3];
-        pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
+        pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
         simd16scalari vRtaii;
-        vRtaii = _simd16_castps_si(vRtai[0].x);
+        vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
         _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
     }
     else
@@ -1422,8 +1426,8 @@ void BinPostSetupPoints(
         if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
         {
             simdvector vRtai;
-            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
-            simdscalari vRtaii = _simd_castps_si(vRtai.x);
+            pa.Assemble(VERTEX_SGV_SLOT, &vRtai);
+            simdscalari vRtaii = _simd_castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
             _simd_store_si((simdscalari*)aRTAI, vRtaii);
         }
         else
@@ -1496,8 +1500,8 @@ void BinPostSetupPoints(
         if (rastState.pointParam)
         {
             simdvector size[3];
-            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
-            vPointSize = size[0].x;
+            pa.Assemble(VERTEX_SGV_SLOT, size);
+            vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
         }
         else
         {
@@ -1562,8 +1566,8 @@ void BinPostSetupPoints(
         if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
         {
             simdvector vRtai[2];
-            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
-            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
+            pa.Assemble(VERTEX_SGV_SLOT, vRtai);
+            simdscalari vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
             _simd_store_si((simdscalari*)aRTAI, vRtaii);
         }
         else
@@ -1792,8 +1796,8 @@ void BinPostSetupPoints_simd16(
         if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
         {
             simd16vector vRtai;
-            pa.Assemble_simd16(VERTEX_RTAI_SLOT, &vRtai);
-            simd16scalari vRtaii = _simd16_castps_si(vRtai.x);
+            pa.Assemble_simd16(VERTEX_SGV_SLOT, &vRtai);
+            simd16scalari vRtaii = _simd16_castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
             _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
         }
         else
@@ -1868,8 +1872,8 @@ void BinPostSetupPoints_simd16(
         if (rastState.pointParam)
         {
             simd16vector size[3];
-            pa.Assemble_simd16(VERTEX_POINT_SIZE_SLOT, size);
-            vPointSize = size[0].x;
+            pa.Assemble_simd16(VERTEX_SGV_SLOT, size);
+            vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
         }
         else
         {
@@ -1937,8 +1941,8 @@ void BinPostSetupPoints_simd16(
         if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
         {
             simd16vector vRtai[2];
-            pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
-            simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x);
+            pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
+            simd16scalari vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
             _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
         }
         else
@@ -2218,8 +2222,8 @@ void BinPostSetupLines(
     if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
     {
         simdvector vRtai[2];
-        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
-        simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
+        pa.Assemble(VERTEX_SGV_SLOT, vRtai);
+        simdscalari vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
         _simd_store_si((simdscalari*)aRTAI, vRtaii);
     }
     else
@@ -2435,8 +2439,8 @@ void BinPostSetupLines_simd16(
     if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
     {
         simd16vector vRtai[2];
-        pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
-        simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x);
+        pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
+        simd16scalari vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
         _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
     }
     else
index e2660c1cc905533396a199fe4ea73dec42c52098..676a4456575cbb901947c26d4bfb799655b0459a 100644 (file)
@@ -955,14 +955,13 @@ static void GeometryShaderStage(
                                 if (state.gsState.emitsViewportArrayIndex)
                                 {
                                     simd16vector vpiAttrib[3];
-                                    gsPa.Assemble_simd16(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
+                                    gsPa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
 
                                     // OOB indices => forced to zero.
+                                    simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
                                     simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                    simd16scalari vClearMask = _simd16_cmplt_epi32(_simd16_castps_si(vpiAttrib[0].x), vNumViewports);
-                                    vpiAttrib[0].x = _simd16_and_ps(_simd16_castsi_ps(vClearMask), vpiAttrib[0].x);
-
-                                    vViewPortIdx = _simd16_castps_si(vpiAttrib[0].x);
+                                    simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
+                                    vViewPortIdx = _simd16_and_si(vClearMask, vpai);
                                 }
                                 else
                                 {
@@ -979,14 +978,13 @@ static void GeometryShaderStage(
                                 if (state.gsState.emitsViewportArrayIndex)
                                 {
                                     simdvector vpiAttrib[3];
-                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
+                                    gsPa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
+                                    simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
 
                                     // OOB indices => forced to zero.
                                     simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
-                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
-
-                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
+                                    simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
+                                    vViewPortIdx = _simd_and_si(vClearMask, vpai);
                                 }
                                 else
                                 {
index f70305114742684b71a4295f2f4cc3053ed791ea..4c0c1db412cd9800130d8ae66bd34c60d5aa2323 100644 (file)
@@ -175,22 +175,21 @@ enum SWR_OUTER_TESSFACTOR_ID
 /////////////////////////////////////////////////////////////////////////
 /// simdvertex
 /// @brief Defines a vertex element that holds all the data for SIMD vertices.
-///        Contains position in clip space, hardcoded to attribute 0,
-///        space for up to 32 attributes, as well as any SGV values generated
-///        by the pipeline
+///        Contains space for position, SGV, and 32 generic attributes
 /////////////////////////////////////////////////////////////////////////
 enum SWR_VTX_SLOTS
 {
-    VERTEX_POSITION_SLOT             = 0,
-    VERTEX_POSITION_END_SLOT         = 0,
-    VERTEX_ATTRIB_START_SLOT         = ( 1 + VERTEX_POSITION_END_SLOT),
-    VERTEX_ATTRIB_END_SLOT           = (32 + VERTEX_POSITION_END_SLOT),
-    VERTEX_RTAI_SLOT                 = (33 + VERTEX_POSITION_END_SLOT), // GS writes RenderTargetArrayIndex here
-    VERTEX_CLIPCULL_DIST_LO_SLOT     = (34 + VERTEX_POSITION_END_SLOT), // VS writes lower 4 clip/cull dist
-    VERTEX_CLIPCULL_DIST_HI_SLOT     = (35 + VERTEX_POSITION_END_SLOT), // VS writes upper 4 clip/cull dist
-    VERTEX_POINT_SIZE_SLOT           = (36 + VERTEX_POSITION_END_SLOT), // VS writes point size here
-    VERTEX_VIEWPORT_ARRAY_INDEX_SLOT = (37 + VERTEX_POSITION_END_SLOT),
-    SWR_VTX_NUM_SLOTS                 = VERTEX_VIEWPORT_ARRAY_INDEX_SLOT,
+    VERTEX_SGV_SLOT                 = 0,
+        VERTEX_SGV_RTAI_COMP        = 0,
+        VERTEX_SGV_VAI_COMP         = 1,
+        VERTEX_SGV_POINT_SIZE_COMP  = 2,
+    VERTEX_POSITION_SLOT            = 1,
+    VERTEX_POSITION_END_SLOT        = 1,
+    VERTEX_CLIPCULL_DIST_LO_SLOT    = (1 + VERTEX_POSITION_END_SLOT), // VS writes lower 4 clip/cull dist
+    VERTEX_CLIPCULL_DIST_HI_SLOT    = (2 + VERTEX_POSITION_END_SLOT), // VS writes upper 4 clip/cull dist
+    VERTEX_ATTRIB_START_SLOT        = (3 + VERTEX_POSITION_END_SLOT),
+    VERTEX_ATTRIB_END_SLOT          = (34 + VERTEX_POSITION_END_SLOT),
+    SWR_VTX_NUM_SLOTS               = (1 + VERTEX_ATTRIB_END_SLOT)
 };
 
 // SoAoSoA
index 4498889269181c9e6354c3a9c90e2a3a81eb9770..2a772939e2da4a52ae99f161ee5c84ee4d6750a4 100644 (file)
@@ -404,10 +404,18 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base
 
     for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) {
        uint32_t attribSlot = attrib;
-       if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE)
-          attribSlot = VERTEX_POINT_SIZE_SLOT;
-       else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_LAYER)
-          attribSlot = VERTEX_RTAI_SLOT;
+       uint32_t sgvChannel = 0;
+       if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) {
+          attribSlot = VERTEX_SGV_SLOT;
+          sgvChannel = VERTEX_SGV_POINT_SIZE_COMP;
+       } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_LAYER) {
+          attribSlot = VERTEX_SGV_SLOT;
+          sgvChannel = VERTEX_SGV_RTAI_COMP;
+       } else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_POSITION) {
+          attribSlot = VERTEX_POSITION_SLOT;
+       } else {
+          attribSlot = VERTEX_ATTRIB_START_SLOT + attrib - 1;
+       }
 
 #if USE_SIMD16_FRONTEND
        Value *vOffsetsAttrib =
@@ -424,13 +432,21 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base
           ADD(vOffsetsAttrib, MUL(vSimdSlot, VIMMED1((uint32_t)sizeof(float))));
 
        for (uint32_t channel = 0; channel < 4; ++channel) {
-          Value *vData = LOAD(unwrap(outputs[attrib][channel]));
           Value *vPtrs = GEP(pStream, vOffsetsAttrib);
+          Value *vData;
+
+          if (attribSlot == VERTEX_SGV_SLOT)
+             vData = LOAD(unwrap(outputs[attrib][0]));
+          else
+             vData = LOAD(unwrap(outputs[attrib][channel]));
 
-          vPtrs = BITCAST(vPtrs,
-                          VectorType::get(PointerType::get(mFP32Ty, 0), 8));
+          if (attribSlot != VERTEX_SGV_SLOT ||
+              sgvChannel == channel) {
+             vPtrs = BITCAST(vPtrs,
+                             VectorType::get(PointerType::get(mFP32Ty, 0), 8));
 
-          MASKED_SCATTER(vData, vPtrs, 32, vMask1);
+             MASKED_SCATTER(vData, vPtrs, 32, vMask1);
+          }
 
 #if USE_SIMD16_FRONTEND
           vOffsetsAttrib =
@@ -597,8 +613,15 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
       ubyte semantic_name = info->input_semantic_name[slot];
       ubyte semantic_idx = info->input_semantic_index[slot];
 
-      unsigned vs_slot =
-         locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base) + 1;
+      unsigned vs_slot = locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base);
+
+      vs_slot += VERTEX_ATTRIB_START_SLOT;
+
+      if (ctx->vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION)
+         vs_slot--;
+
+      if (semantic_name == TGSI_SEMANTIC_POSITION)
+         vs_slot = VERTEX_POSITION_SLOT;
 
       STORE(C(vs_slot), vtxAttribMap, {0, slot});
       mapConstants.push_back(C(vs_slot));
@@ -789,11 +812,24 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
          if (!outputs[attrib][channel])
             continue;
 
-         Value *val = LOAD(unwrap(outputs[attrib][channel]));
+         Value *val;
+         uint32_t outSlot;
+
+         if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) {
+            if (channel != VERTEX_SGV_POINT_SIZE_COMP)
+               continue;
+            val = LOAD(unwrap(outputs[attrib][0]));
+            outSlot = VERTEX_SGV_SLOT;
+         } else if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_POSITION) {
+            val = LOAD(unwrap(outputs[attrib][channel]));
+            outSlot = VERTEX_POSITION_SLOT;
+         } else {
+            val = LOAD(unwrap(outputs[attrib][channel]));
+            outSlot = VERTEX_ATTRIB_START_SLOT + attrib;
+            if (swr_vs->info.base.output_semantic_name[0] == TGSI_SEMANTIC_POSITION)
+               outSlot--;
+         }
 
-         uint32_t outSlot = attrib;
-         if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE)
-            outSlot = VERTEX_POINT_SIZE_SLOT;
          WriteVS(val, pVsCtx, vtxOutput, outSlot, channel);
       }
    }
@@ -804,8 +840,8 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
 
       unsigned cv = 0;
       if (swr_vs->info.base.writes_clipvertex) {
-         cv = 1 + locate_linkage(TGSI_SEMANTIC_CLIPVERTEX, 0,
-                                 &swr_vs->info.base);
+         cv = locate_linkage(TGSI_SEMANTIC_CLIPVERTEX, 0,
+                             &swr_vs->info.base);
       } else {
          for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) {
             if (swr_vs->info.base.output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
@@ -824,8 +860,8 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
          // clip distance overrides user clip planes
          if ((swr_vs->info.base.clipdist_writemask & clip_mask & (1 << val)) ||
              ((swr_vs->info.base.culldist_writemask << swr_vs->info.base.num_written_clipdistance) & (1 << val))) {
-            unsigned cv = 1 + locate_linkage(TGSI_SEMANTIC_CLIPDIST, val < 4 ? 0 : 1,
-                                             &swr_vs->info.base);
+            unsigned cv = locate_linkage(TGSI_SEMANTIC_CLIPDIST, val < 4 ? 0 : 1,
+                                         &swr_vs->info.base);
             if (val < 4) {
                LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val], "");
                WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val);
@@ -894,7 +930,7 @@ locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info)
    for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) {
       if ((info->output_semantic_name[i] == name)
           && (info->output_semantic_index[i] == index)) {
-         return i - 1; // position is not part of the linkage
+         return i;
       }
    }
 
@@ -1043,7 +1079,7 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key)
       }
 
       unsigned linkedAttrib =
-         locate_linkage(semantic_name, semantic_idx, pPrevShader);
+         locate_linkage(semantic_name, semantic_idx, pPrevShader) - 1;
 
       uint32_t extraAttribs = 0;
       if (semantic_name == TGSI_SEMANTIC_PRIMID && !ctx->gs) {
@@ -1079,7 +1115,7 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key)
       Value *offset = NULL;
       if (semantic_name == TGSI_SEMANTIC_COLOR && key.light_twoside) {
          bcolorAttrib = locate_linkage(
-               TGSI_SEMANTIC_BCOLOR, semantic_idx, pPrevShader);
+               TGSI_SEMANTIC_BCOLOR, semantic_idx, pPrevShader) - 1;
          /* Neither front nor back colors were available. Nothing to load. */
          if (bcolorAttrib == 0xFFFFFFFF && linkedAttrib == 0xFFFFFFFF)
             continue;