swr/rast: Widen fetch shader to SIMD16

author Tim Rowley <timothy.o.rowley@intel.com>

Thu, 19 Oct 2017 22:33:37 +0000 (17:33 -0500)

committer Tim Rowley <timothy.o.rowley@intel.com>

Mon, 20 Nov 2017 19:50:23 +0000 (13:50 -0600)
author Tim Rowley <timothy.o.rowley@intel.com>
Thu, 19 Oct 2017 22:33:37 +0000 (17:33 -0500)
committer Tim Rowley <timothy.o.rowley@intel.com>
Mon, 20 Nov 2017 19:50:23 +0000 (13:50 -0600)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp

index 6a33ec265fc90878216f036e2ff9ac6436d64ff2..4b83a3204cf7c3e4d2223b9368af26eedfedc1d0 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -41,6 +41,9 @@ namespace SwrJit
          : mpJitMgr(pJitMgr)
      {
          mVWidth = pJitMgr->mVWidth;
+#if USE_SIMD16_BUILDER
+        mVWidth2 = pJitMgr->mVWidth * 2;
+#endif
  
          mpIRBuilder = &pJitMgr->mBuilder;
  
@@ -65,17 +68,34 @@ namespace SwrJit
          mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
          mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
          mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
+#if USE_SIMD16_BUILDER
+        mSimd2Int1Ty = VectorType::get(mInt1Ty, mVWidth2);
+        mSimd2Int16Ty = VectorType::get(mInt16Ty, mVWidth2);
+        mSimd2Int32Ty = VectorType::get(mInt32Ty, mVWidth2);
+        mSimd2Int64Ty = VectorType::get(mInt64Ty, mVWidth2);
+        mSimd2FP16Ty = VectorType::get(mFP16Ty, mVWidth2);
+        mSimd2FP32Ty = VectorType::get(mFP32Ty, mVWidth2);
+        mSimd2VectorTy = ArrayType::get(mSimd2FP32Ty, 4);
+        mSimd2VectorTRTy = ArrayType::get(mSimd2FP32Ty, 5);
+#endif
  
          if (sizeof(uint32_t*) == 4)
          {
              mIntPtrTy = mInt32Ty;
              mSimdIntPtrTy = mSimdInt32Ty;
+#if USE_SIMD16_BUILDER
+            mSimd2IntPtrTy = mSimd2Int32Ty;
+#endif
          }
          else
          {
              SWR_ASSERT(sizeof(uint32_t*) == 8);
+
              mIntPtrTy = mInt64Ty;
              mSimdIntPtrTy = mSimdInt64Ty;
+#if USE_SIMD16_BUILDER
+            mSimd2IntPtrTy = mSimd2Int64Ty;
+#endif
          }
      }
  }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h

index 8210e49b18527f422b6f376f9597384f396a6ebd..c6ab64e06e8a2e9d67211fe8db35594af154a7d2 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -32,6 +32,8 @@
  #include "JitManager.h"
  #include "common/formats.h"
  
+#define USE_SIMD16_BUILDER 0
+
  namespace SwrJit
  {
      using namespace llvm;
@@ -45,6 +47,9 @@ namespace SwrJit
          IRBuilder<>* mpIRBuilder;
  
          uint32_t             mVWidth;
+#if USE_SIMD16_BUILDER
+        uint32_t             mVWidth2;
+#endif
  
          // Built in types.
          Type*                mVoidTy;
@@ -70,6 +75,17 @@ namespace SwrJit
          Type*                mSimdIntPtrTy;
          Type*                mSimdVectorTy;
          Type*                mSimdVectorTRTy;
+#if USE_SIMD16_BUILDER
+        Type*                mSimd2FP16Ty;
+        Type*                mSimd2FP32Ty;
+        Type*                mSimd2Int1Ty;
+        Type*                mSimd2Int16Ty;
+        Type*                mSimd2Int32Ty;
+        Type*                mSimd2Int64Ty;
+        Type*                mSimd2IntPtrTy;
+        Type*                mSimd2VectorTy;
+        Type*                mSimd2VectorTRTy;
+#endif
  
  #include "gen_builder.hpp"
  #include "gen_builder_x86.hpp"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp

index 9ca36b2467152fbbbf1200d3b58c86b79b1ea791..daa9cb1ec119063c6cdfb784ccf7fb71ddd4d66b 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -231,6 +231,13 @@ namespace SwrJit
          return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
      }
  
+#if USE_SIMD16_BUILDER
+    Value *Builder::VUNDEF2_F() // returns an undefined FP32 vector with mVWidth2 lanes
+    {
+        return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
+    }
+
+#endif
      Value *Builder::VUNDEF(Type* t)
      {
          return UndefValue::get(VectorType::get(t, mVWidth));
@@ -690,6 +697,51 @@ namespace SwrJit
          return vGather;
      }
  
+#if USE_SIMD16_BUILDER
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief copy mVWidth consecutive lanes of a2 into a new vector, starting at lane 0 when imm == 0 or at lane mVWidth when imm > 0
+    Value *Builder::EXTRACT(Value *a2, uint32_t imm)
+    {
+        const uint32_t i0 = (imm > 0) ? mVWidth : 0; // index of the first source lane
+
+        Value *result = VUNDEF_F(); // NOTE(review): presumably an FP32 vector, and a2 is not bitcast here (INSERT does bitcast) - confirm a2 always holds floats
+
+        for (uint32_t i = 0; i < mVWidth; i += 1)
+        {
+            Value *temp = VEXTRACT(a2, C(i0 + i)); // lane i0 + i of the source
+
+            result = VINSERT(result, temp, C(i)); // becomes lane i of the result
+        }
+
+        return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief copy mVWidth lanes of b into a2 (bitcast to mSimd2FP32Ty), starting at lane 0 when imm == 0 or at lane mVWidth when imm > 0
+    Value *Builder::INSERT(Value *a2, Value * b, uint32_t imm)
+    {
+        const uint32_t i0 = (imm > 0) ? mVWidth : 0; // index of the first destination lane
+
+        Value *result = BITCAST(a2, mSimd2FP32Ty);
+
+        for (uint32_t i = 0; i < mVWidth; i += 1)
+        {
+#if 1
+            if (!b->getType()->getScalarType()->isFloatTy()) // NOTE(review): b is reassigned below, so this presumably only fires on the first pass; could be hoisted out of the loop
+            {
+                b = BITCAST(b, mSimdFP32Ty);
+            }
+
+#endif
+            Value *temp = VEXTRACT(b, C(i)); // lane i of the source
+
+            result = VINSERT(result, temp, C(i0 + i)); // becomes lane i0 + i of the result
+        }
+
+        return result;
+    }
+
+#endif
      //////////////////////////////////////////////////////////////////////////
      /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
      Value* Builder::MASK(Value* vmask)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h

index 662574d638f9aa3ec4c33674d25a1e28318a64af..d9ff4a215674675638b5d53dee744b375be80366 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -56,6 +56,9 @@ Value *VIMMED1(float i);
  Value *VIMMED1(bool i);
  Value *VUNDEF(Type* t);
  Value *VUNDEF_F();
+#if USE_SIMD16_BUILDER
+Value *VUNDEF2_F();
+#endif
  Value *VUNDEF_I();
  Value *VUNDEF(Type* ty, uint32_t size);
  Value *VUNDEF_IPTR();
@@ -98,6 +101,12 @@ Value *VMASK(Value* mask);
  /// @brief functions that build IR to call x86 intrinsics directly, or
  /// emulate them with other instructions if not available on the host
  //////////////////////////////////////////////////////////////////////////
+
+#if USE_SIMD16_BUILDER
+Value *EXTRACT(Value *a, uint32_t imm);
+Value *INSERT(Value *a, Value *b, uint32_t imm);
+
+#endif
  Value *MASKLOADD(Value* src, Value* mask);
  
  void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp

index 30dbcfc8ce1232545336882ce728fc817193268c..062852e2d22a6aa9be91819baaf0158ea2173f8e 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -80,6 +80,9 @@ struct FetchJit : public Builder
  #endif
  
      void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
+#if USE_SIMD16_BUILDER
+    void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
+#endif
  
  #if USE_SIMD16_SHADERS
      Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
@@ -137,8 +140,8 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
      // GEP
      pVtxOut = GEP(pVtxOut, C(0));
  #if USE_SIMD16_SHADERS
-#if 0
-    pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth * 2), 0));
+#if 0// USE_SIMD16_BUILDER
+    pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
  #else
      pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
  #endif
@@ -1250,9 +1253,27 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
  
                              if (currentVertexElement > 3)
                              {
+#if USE_SIMD16_BUILDER
+                                Value *pVtxSrc2[4];
+
+                                // pack adjacent pairs of SIMD8s into SIMD16s
+                                for (uint32_t i = 0; i < 4; i += 1)
+                                {
+                                    pVtxSrc2[i] = VUNDEF2_F();
+
+                                    pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements[i],  0);
+                                    pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements2[i], 1);
+                                }
+
+                                // store SIMD16s
+                                Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+                                StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
+
+#else
                                  StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
                                  StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
  
+#endif
                                  outputElt += 1;
  
                                  // reset to the next vVertexElement to output
@@ -2312,7 +2333,8 @@ void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, con
      for(uint32_t c = 0; c < numEltsToStore; ++c)
      {
          // STORE expects FP32 x vWidth type, just bitcast if needed
-        if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
+        if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
+        {
  #if FETCH_DUMP_VERTEX
              PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
  #endif
@@ -2335,6 +2357,35 @@ void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, con
      }
  }
  
+#if USE_SIMD16_BUILDER
+void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
+{
+    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
+
+    for (uint32_t c = 0; c < numEltsToStore; ++c)
+    {
+        // elements are stored as mSimd2FP32Ty (FP32 x vWidth2), just bitcast if needed
+        if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
+        {
+#if FETCH_DUMP_VERTEX
+            PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
+#endif
+            vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty);
+        }
+#if FETCH_DUMP_VERTEX
+        else
+        {
+            PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
+        }
+#endif
+        // outputElt * 4 = offsetting by the size of a simdvertex
+        // + c offsets to one row within the current vertex; NOTE(review): rows are presumably 32bit x vWidth2 here - confirm against the caller's pointer type
+        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
+        STORE(vVertexElements[c], dest);
+    }
+}
+
+#endif
  //////////////////////////////////////////////////////////////////////////
  /// @brief Generates a constant vector of values based on the 
  /// ComponentControl value
author	Tim Rowley <timothy.o.rowley@intel.com>
	Thu, 19 Oct 2017 22:33:37 +0000 (17:33 -0500)
committer	Tim Rowley <timothy.o.rowley@intel.com>
	Mon, 20 Nov 2017 19:50:23 +0000 (13:50 -0600)
src/gallium/drivers/swr/rasterizer/jitter/builder.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/jitter/builder.h		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp		patch \| blob \| history