swr/rast: SIMD16 FE - interleaved simdvertex output in GS
[mesa.git] / src / gallium / drivers / swr / swr_shader.cpp
index d8f551291cd1bb63ef0707bfbd0b1ee1e256252a..2f495f59c234f1f216822f7042eecbb6595f7c7a 100644 (file)
 #include "swr_state.h"
 #include "swr_screen.h"
 
-#if HAVE_LLVM < 0x0500
-namespace llvm {
-typedef AttributeSet AttributeList;
-}
-#endif
-
 using namespace SwrJit;
 using namespace llvm;
 
@@ -226,6 +220,9 @@ struct BuilderSWR : public Builder {
       gallivm_free_ir(gallivm);
    }
 
+   void WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput,
+                unsigned slot, unsigned channel);
+
    struct gallivm_state *gallivm;
    PFN_VERTEX_FUNC CompileVS(struct swr_context *ctx, swr_jit_vs_key &key);
    PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_fs_key &key);
@@ -373,8 +370,13 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base
 
     IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
 
+#if USE_SIMD16_FRONTEND
+    const uint32_t simdVertexStride = sizeof(simdvertex) * 2;
+    const uint32_t numSimdBatches = (pGS->maxNumVerts + (mVWidth * 2) - 1) / (mVWidth * 2);
+#else
     const uint32_t simdVertexStride = sizeof(simdvertex);
-    const uint32_t numSimdBatches = (pGS->maxNumVerts + 7) / 8;
+    const uint32_t numSimdBatches = (pGS->maxNumVerts + mVWidth - 1) / mVWidth;
+#endif
     const uint32_t inputPrimStride = numSimdBatches * simdVertexStride;
 
     Value *pStream = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_pStream });
@@ -391,8 +393,14 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base
           inputPrimStride * 6,
           inputPrimStride * 7 } );
 
-    Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), 3);
-    Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), 7);
+#if USE_SIMD16_FRONTEND
+    const uint32_t simdShift = log2(mVWidth * 2);
+    Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), (mVWidth * 2) - 1);
+#else
+    const uint32_t simdShift = log2(mVWidth);
+    Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), mVWidth - 1);
+#endif
+    Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), simdShift);
 
     for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) {
        uint32_t attribSlot = attrib;
@@ -403,10 +411,17 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base
        else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_LAYER)
           attribSlot = VERTEX_RTAI_SLOT;
 
+#if USE_SIMD16_FRONTEND
+       Value *vOffsetsAttrib =
+          ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex) * 2)));
+       vOffsetsAttrib =
+          ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector) * 2)));
+#else
        Value *vOffsetsAttrib =
           ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex))));
        vOffsetsAttrib =
           ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector))));
+#endif
        vOffsetsAttrib =
           ADD(vOffsetsAttrib, MUL(vSimdSlot, VIMMED1((uint32_t)sizeof(float))));
 
@@ -419,8 +434,13 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base
 
           MASKED_SCATTER(vData, vPtrs, 32, vMask1);
 
+#if USE_SIMD16_FRONTEND
+          vOffsetsAttrib =
+             ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar) * 2));
+#else
           vOffsetsAttrib =
              ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar)));
+#endif
        }
     }
 }
@@ -530,8 +550,6 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
 
    AttrBuilder attrBuilder;
    attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
-   AttributeList attrSet = AttributeList::get(
-      JM()->mContext, AttributeList::FunctionIndex, attrBuilder);
 
    std::vector<Type *> gsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
                               PointerType::get(Gen_SWR_GS_CONTEXT(JM()), 0)};
@@ -543,7 +561,13 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
                                      GlobalValue::ExternalLinkage,
                                      "GS",
                                      JM()->mpCurrentModule);
-   pFunction->addAttributes(AttributeList::FunctionIndex, attrSet);
+#if HAVE_LLVM < 0x0500
+   AttributeSet attrSet = AttributeSet::get(
+      JM()->mContext, AttributeSet::FunctionIndex, attrBuilder);
+   pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet);
+#else
+   pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder);
+#endif
 
    BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction);
    IRB()->SetInsertPoint(block);
@@ -657,6 +681,23 @@ swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key &key)
    return func;
 }
 
+void
+BuilderSWR::WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput, unsigned slot, unsigned channel)
+{
+#if USE_SIMD16_FRONTEND
+   // interleave the simdvertex components into the dest simd16vertex
+   //   slot16offset = slot8offset * 2
+   //   comp16offset = comp8offset * 2 + alternateOffset
+
+   Value *offset = LOAD(pVsContext, { 0, SWR_VS_CONTEXT_AlternateOffset });
+   Value *pOut = GEP(pVtxOutput, { C(0), C(0), C(slot * 2), offset } );
+   STORE(pVal, pOut, {channel * 2});
+#else
+   Value *pOut = GEP(pVtxOutput, {0, 0, slot});
+   STORE(pVal, pOut, {0, channel});
+#endif
+}
+
 PFN_VERTEX_FUNC
 BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
 {
@@ -669,8 +710,6 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
 
    AttrBuilder attrBuilder;
    attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
-   AttributeList attrSet = AttributeList::get(
-      JM()->mContext, AttributeList::FunctionIndex, attrBuilder);
 
    std::vector<Type *> vsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
                               PointerType::get(Gen_SWR_VS_CONTEXT(JM()), 0)};
@@ -682,7 +721,13 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
                                      GlobalValue::ExternalLinkage,
                                      "VS",
                                      JM()->mpCurrentModule);
-   pFunction->addAttributes(AttributeList::FunctionIndex, attrSet);
+#if HAVE_LLVM < 0x0500
+   AttributeSet attrSet = AttributeSet::get(
+      JM()->mContext, AttributeSet::FunctionIndex, attrBuilder);
+   pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet);
+#else
+   pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder);
+#endif
 
    BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction);
    IRB()->SetInsertPoint(block);
@@ -752,7 +797,7 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
          uint32_t outSlot = attrib;
          if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE)
             outSlot = VERTEX_POINT_SIZE_SLOT;
-         STORE(val, vtxOutput, {0, 0, outSlot, channel});
+         WriteVS(val, pVsCtx, vtxOutput, outSlot, channel);
       }
    }
 
@@ -786,10 +831,10 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
                                              &swr_vs->info.base);
             if (val < 4) {
                LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val], "");
-               STORE(unwrap(dist), vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_LO_SLOT, val});
+               WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val);
             } else {
                LLVMValueRef dist = LLVMBuildLoad(gallivm->builder, outputs[cv][val - 4], "");
-               STORE(unwrap(dist), vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4});
+               WriteVS(unwrap(dist), pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4);
             }
             continue;
          }
@@ -807,9 +852,9 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
                                       FMUL(unwrap(cw), VBROADCAST(pw)))));
 
          if (val < 4)
-            STORE(dist, vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_LO_SLOT, val});
+            WriteVS(dist, pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val);
          else
-            STORE(dist, vtxOutput, {0, 0, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4});
+            WriteVS(dist, pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_HI_SLOT, val - 4);
       }
    }
 
@@ -880,8 +925,6 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key)
 
    AttrBuilder attrBuilder;
    attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
-   AttributeList attrSet = AttributeList::get(
-      JM()->mContext, AttributeList::FunctionIndex, attrBuilder);
 
    std::vector<Type *> fsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
                               PointerType::get(Gen_SWR_PS_CONTEXT(JM()), 0)};
@@ -892,7 +935,13 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key)
                                      GlobalValue::ExternalLinkage,
                                      "FS",
                                      JM()->mpCurrentModule);
-   pFunction->addAttributes(AttributeList::FunctionIndex, attrSet);
+#if HAVE_LLVM < 0x0500
+   AttributeSet attrSet = AttributeSet::get(
+      JM()->mContext, AttributeSet::FunctionIndex, attrBuilder);
+   pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet);
+#else
+   pFunction->addAttributes(AttributeList::FunctionIndex, attrBuilder);
+#endif
 
    BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction);
    IRB()->SetInsertPoint(block);