swr/rasterizer: Better implementation of scatter
author Jan Zielinski <jan.zielinski@intel.com>
Wed, 24 Jul 2019 10:25:27 +0000 (12:25 +0200)
committer Jan Zielinski <jan.zielinski@intel.com>
Tue, 30 Jul 2019 13:39:19 +0000 (13:39 +0000)
Added support for the avx512 scatter instruction. Non-avx512 targets
now call into a C function to do the scatter emulation.

This has better JIT compile performance than the previous
approach of jitting scalar loops.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/Makefile.sources
src/gallium/drivers/swr/meson.build
src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp [new file with mode: 0644]
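
For illustration, a minimal scalar sketch (not part of this change) of what a
masked scatter does: for each lane whose mask bit is set, one element of the
source vector is stored at its own index-derived address.

    #include <cstdint>
    #include <cstring>

    // Illustrative reference semantics of an 8-wide masked scatter: lanes with
    // a clear mask bit leave memory untouched; scale converts indices to byte
    // offsets.
    static void ScatterRef(uint8_t* pBase, const uint32_t indices[8],
                           const float src[8], uint8_t mask, uint32_t scale)
    {
        for (int lane = 0; lane < 8; ++lane)
        {
            if (mask & (1u << lane))
            {
                std::memcpy(pBase + indices[lane] * scale, &src[lane], sizeof(float));
            }
        }
    }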

index b298356079e73ffa30c15f99bf5b3c89faa07a13..720bd590d82ef0581e018f917319c4dcc37f697d 100644 (file)
@@ -156,6 +156,7 @@ JITTER_CXX_SOURCES := \
        rasterizer/jitter/streamout_jit.cpp \
        rasterizer/jitter/streamout_jit.h \
        rasterizer/jitter/shader_lib/DebugOutput.cpp \
+    rasterizer/jitter/shader_lib/Scatter.cpp \
        rasterizer/jitter/functionpasses/passes.h \
        rasterizer/jitter/functionpasses/lower_x86.cpp
 
index 9e07724d5dcbb629299e839ed47036e4b2c095b3..658747563740f79c7da3a989c928f88890346378 100644 (file)
@@ -82,6 +82,7 @@ files_swr_mesa = files(
   'rasterizer/jitter/streamout_jit.cpp',
   'rasterizer/jitter/streamout_jit.h',
   'rasterizer/jitter/shader_lib/DebugOutput.cpp',
+  'rasterizer/jitter/shader_lib/Scatter.cpp',
   'rasterizer/jitter/functionpasses/lower_x86.cpp',
   'rasterizer/memory/SurfaceState.h'
 )
index ac685ad51ede38fd13125812c78c896844083500..0cd7ae7a78123d5a0fe08da7843f366dd9a2df3c 100644 (file)
@@ -45,6 +45,7 @@ intrinsics = [
     ['VGATHERPD',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
     ['VGATHERPS',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
     ['VGATHERDD',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
+    ['VSCATTERPS',  ['pBase', 'mask', 'indices', 'src', 'scale'], 'src'],
     ['VRCPPS',      ['a'], 'a'],
     ['VROUND',      ['a', 'rounding'], 'a'],
     ['BEXTR_32',    ['src', 'control'], 'src'],
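
The new VSCATTERPS entry declares the meta-intrinsic's operand order (pBase,
mask, indices, src, scale); the generator turns each entry into a builder
helper that emits a call to the matching meta.intrinsic function, which the
lower_x86 pass later replaces. A hedged sketch of that emission pattern using
plain LLVM APIs (names illustrative, not the generated code itself):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"

    // Declare the named meta-intrinsic with types taken from the operands and
    // emit a call to it; the lowering pass matches on the name later.
    llvm::CallInst* EmitMetaScatterPS(llvm::IRBuilder<>& b, llvm::Module& m,
                                      llvm::Value* pBase, llvm::Value* mask,
                                      llvm::Value* indices, llvm::Value* src,
                                      llvm::Value* scale)
    {
        llvm::Type* argTys[] = {pBase->getType(), mask->getType(),
                                indices->getType(), src->getType(),
                                scale->getType()};
        auto* fnTy = llvm::FunctionType::get(src->getType(), argTys, false);
        auto  fn   = m.getOrInsertFunction("meta.intrinsic.VSCATTERPS", fnTy);
        llvm::Value* args[] = {pBase, mask, indices, src, scale};
        return b.CreateCall(fn, args);
    }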
index b3d0b706ac08961dc05fcfbbb10d72420119ef94..adf8924ce430450eb5a90688a472b1eef3504053 100644 (file)
@@ -237,7 +237,8 @@ namespace SwrJit
         return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage);
     }
 
-    StoreInst* BuilderGfxMem::STORE(Value *Val, Value *Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage)
+    StoreInst*
+    BuilderGfxMem::STORE(Value* Val, Value* Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertGFXMemoryParams(Ptr, usage);
 
@@ -245,7 +246,11 @@ namespace SwrJit
         return Builder::STORE(Val, Ptr, isVolatile, Ty, usage);
     }
 
-    StoreInst* BuilderGfxMem::STORE(Value* Val, Value* BasePtr, const std::initializer_list<uint32_t>& offset, Type* Ty, JIT_MEM_CLIENT usage)
+    StoreInst* BuilderGfxMem::STORE(Value*                                 Val,
+                                    Value*                                 BasePtr,
+                                    const std::initializer_list<uint32_t>& offset,
+                                    Type*                                  Ty,
+                                    JIT_MEM_CLIENT                         usage)
     {
         AssertGFXMemoryParams(BasePtr, usage);
 
@@ -253,7 +258,8 @@ namespace SwrJit
         return Builder::STORE(Val, BasePtr, offset, Ty, usage);
     }
 
-    CallInst* BuilderGfxMem::MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask, Type* Ty, JIT_MEM_CLIENT usage)
+    CallInst* BuilderGfxMem::MASKED_STORE(
+        Value* Val, Value* Ptr, unsigned Align, Value* Mask, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertGFXMemoryParams(Ptr, usage);
 
index 90a0e03e197cae811183ed8a07bc430d31e12d29..267c5442d2a3bf7c5d215644f5f7c42c2c83fd0d 100644 (file)
@@ -647,6 +647,10 @@ namespace SwrJit
     {
         AssertMemoryUsageParams(pDst, usage);
 
+        SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
+        VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
+        return;
+
         /* Scatter algorithm
 
         while(Index = BitScanForward(mask))
@@ -657,6 +661,10 @@ namespace SwrJit
 
         */
 
+        /*
+
+        // Previous jitted scalar-loop implementation, kept for reference
+
         BasicBlock* pCurBB = IRB()->GetInsertBlock();
         Function*   pFunc  = pCurBB->getParent();
         Type*       pSrcTy = vSrc->getType()->getVectorElementType();
@@ -744,5 +752,7 @@ namespace SwrJit
 
         // Move builder to beginning of post loop
         IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
+
+        */
     }
 } // namespace SwrJit
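
The commented-out jitted loop above follows the documented bit-scan algorithm;
as a standalone reference, the same loop in plain C++ (illustrative, mirroring
the pseudocode rather than the jitted IR):

    #include <cstdint>

    // Peel off the lowest set mask bit, store that lane, clear the bit, repeat.
    // __builtin_ctz (GCC/Clang) stands in for BitScanForward here; offsets are
    // byte offsets, as in the jitted version.
    static void ScatterScalar(uint8_t* pDst, const float src[8],
                              const uint32_t offsets[8], uint32_t mask)
    {
        while (mask != 0)
        {
            uint32_t index = __builtin_ctz(mask);
            *reinterpret_cast<float*>(pDst + offsets[index]) = src[index];
            mask &= mask - 1; // clear lowest set bit
        }
    }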
index c34959d35ee5fa27fba1f3c1e0bb5584fd50551a..2196aafb17b25213f1bc5bd3fc813bacf3620c81 100644 (file)
 #include "passes.h"
 #include "JitManager.h"
 
+#include "common/simdlib.hpp"
+
 #include <unordered_map>
 
+extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);
+
 namespace llvm
 {
    // forward declare the initializer
@@ -88,6 +92,8 @@ namespace SwrJit
     Instruction*
     VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
     Instruction*
+    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
     VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
     Instruction*
     VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
@@ -102,88 +108,61 @@ namespace SwrJit
 
     static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
 
+    // clang-format off
     static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
-        //                              256 wide                                    512 wide
+        //                               256 wide                               512 wide
         {
             // AVX
-            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
-            {"meta.intrinsic.VPERMPS",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
-            {"meta.intrinsic.VPERMD",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
-            {"meta.intrinsic.VGATHERPD",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERPS",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERDD",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VCVTPD2PS",
-             {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
-            {"meta.intrinsic.VCVTPH2PS",
-             {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
-            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
-            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
+            {"meta.intrinsic.VRCPPS",    {{Intrinsic::x86_avx_rcp_ps_256,       DOUBLE},                    NO_EMU}},
+            {"meta.intrinsic.VPERMPS",   {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VPERM_EMU}},
+            {"meta.intrinsic.VPERMD",    {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VPERM_EMU}},
+            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VGATHER_EMU}},
+            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic}, VSCATTER_EMU}},
+            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256,   Intrinsic::not_intrinsic},  NO_EMU}},
+            {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256,        Intrinsic::not_intrinsic},  NO_EMU}},
+            {"meta.intrinsic.VROUND",    {{Intrinsic::x86_avx_round_ps_256,     DOUBLE},                    NO_EMU}},
+            {"meta.intrinsic.VHSUBPS",   {{Intrinsic::x86_avx_hsub_ps_256,      DOUBLE},                    NO_EMU}},
         },
         {
             // AVX2
-            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
-            {"meta.intrinsic.VPERMPS",
-             {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
-            {"meta.intrinsic.VPERMD",
-             {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
-            {"meta.intrinsic.VGATHERPD",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERPS",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERDD",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
-            {"meta.intrinsic.VCVTPH2PS",
-             {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
-            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
-            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
+            {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx_rcp_ps_256,    DOUBLE},                    NO_EMU}},
+            {"meta.intrinsic.VPERMPS",      {{Intrinsic::x86_avx2_permps,       Intrinsic::not_intrinsic},  VPERM_EMU}},
+            {"meta.intrinsic.VPERMD",       {{Intrinsic::x86_avx2_permd,        Intrinsic::not_intrinsic},  VPERM_EMU}},
+            {"meta.intrinsic.VGATHERPD",    {{Intrinsic::not_intrinsic,         Intrinsic::not_intrinsic},  VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,         Intrinsic::not_intrinsic},  VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,         Intrinsic::not_intrinsic},  VGATHER_EMU}},
+            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic}, VSCATTER_EMU}},
+            {"meta.intrinsic.VCVTPD2PS",    {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE},                   NO_EMU}},
+            {"meta.intrinsic.VCVTPH2PS",    {{Intrinsic::x86_vcvtph2ps_256,     Intrinsic::not_intrinsic},  NO_EMU}},
+            {"meta.intrinsic.VROUND",       {{Intrinsic::x86_avx_round_ps_256,  DOUBLE},                    NO_EMU}},
+            {"meta.intrinsic.VHSUBPS",      {{Intrinsic::x86_avx_hsub_ps_256,   DOUBLE},                    NO_EMU}},
         },
         {
             // AVX512
-            {"meta.intrinsic.VRCPPS",
-             {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
+            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256,     Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
 #if LLVM_VERSION_MAJOR < 7
-            {"meta.intrinsic.VPERMPS",
-             {{Intrinsic::x86_avx512_mask_permvar_sf_256,
-               Intrinsic::x86_avx512_mask_permvar_sf_512},
-              NO_EMU}},
-            {"meta.intrinsic.VPERMD",
-             {{Intrinsic::x86_avx512_mask_permvar_si_256,
-               Intrinsic::x86_avx512_mask_permvar_si_512},
-              NO_EMU}},
+            {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
+            {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
 #else
-            {"meta.intrinsic.VPERMPS",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
-            {"meta.intrinsic.VPERMD",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+            {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic,              Intrinsic::not_intrinsic}, VPERM_EMU}},
+            {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic,               Intrinsic::not_intrinsic}, VPERM_EMU}},
 #endif
-            {"meta.intrinsic.VGATHERPD",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERPS",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERDD",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic}, VSCATTER_EMU}},
 #if LLVM_VERSION_MAJOR < 7
-            {"meta.intrinsic.VCVTPD2PS",
-             {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512},
-              NO_EMU}},
+            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
 #else
-            {"meta.intrinsic.VCVTPD2PS",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
+            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VCONVERT_EMU}},
 #endif
-            {"meta.intrinsic.VCVTPH2PS",
-             {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512},
-              NO_EMU}},
-            {"meta.intrinsic.VROUND",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
-            {"meta.intrinsic.VHSUBPS",
-             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
+            {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512}, NO_EMU}},
+            {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic,               Intrinsic::not_intrinsic}, VROUND_EMU}},
+            {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic,              Intrinsic::not_intrinsic}, VHSUB_EMU}},
         }};
+    // clang-format on
 
     struct LowerX86 : public FunctionPass
     {
@@ -209,6 +188,27 @@ namespace SwrJit
                 SWR_ASSERT(false, "Unsupported AVX architecture.");
                 mTarget = AVX;
             }
+
+            // Setup scatter function for 256 wide
+            uint32_t curWidth = B->mVWidth;
+            B->SetTargetWidth(8);
+            std::vector<Type*> args = {
+                B->mInt8PtrTy,   // pBase
+                B->mSimdInt32Ty, // vIndices
+                B->mSimdFP32Ty,  // vSrc
+                B->mInt8Ty,      // mask
+                B->mInt32Ty      // scale
+            };
+
+            FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
+            mPfnScatter256             = cast<Function>(
+                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
+            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
+            {
+                sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
+            }
+
+            B->SetTargetWidth(curWidth);
         }
 
         // Try to decipher the vector type of the instruction. This does not work properly
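
The AddSymbol registration above is what lets the JIT's dynamic linker resolve
calls to the externally compiled ScatterPS_256; the same pattern works for any
C helper that jitted code needs. A minimal sketch (helper name illustrative):

    #include "llvm/Support/DynamicLibrary.h"

    extern "C" void MyJitHelper() {} // stand-in for a helper in the driver binary

    // Make the in-process symbol visible to jitted modules so an external
    // declaration of "MyJitHelper" resolves to this address at link time.
    static void RegisterJitHelper()
    {
        using llvm::sys::DynamicLibrary;
        if (DynamicLibrary::SearchForAddressOfSymbol("MyJitHelper") == nullptr)
        {
            DynamicLibrary::AddSymbol("MyJitHelper", (void*)&MyJitHelper);
        }
    }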
@@ -392,23 +392,39 @@ namespace SwrJit
         virtual bool runOnFunction(Function& F)
         {
             std::vector<Instruction*> toRemove;
+            std::vector<BasicBlock*>  bbs;
+
+            // Make temp copy of the basic blocks and instructions, as the intrinsic
+            // replacement code might invalidate the iterators
+            for (auto& b : F.getBasicBlockList())
+            {
+                bbs.push_back(&b);
+            }
 
-            for (auto& BB : F.getBasicBlockList())
+            for (auto* BB : bbs)
             {
-                for (auto& I : BB.getInstList())
+                std::vector<Instruction*> insts;
+                for (auto& i : BB->getInstList())
                 {
-                    if (CallInst* pCallInst = dyn_cast<CallInst>(&I))
+                    insts.push_back(&i);
+                }
+
+                for (auto* I : insts)
+                {
+                    if (CallInst* pCallInst = dyn_cast<CallInst>(I))
                     {
                         Function* pFunc = pCallInst->getCalledFunction();
                         if (pFunc)
                         {
                             if (pFunc->getName().startswith("meta.intrinsic"))
                             {
-                                B->IRB()->SetInsertPoint(&I);
+                                B->IRB()->SetInsertPoint(I);
                                 Instruction* pReplace = ProcessIntrinsic(pCallInst);
-                                SWR_ASSERT(pReplace);
                                 toRemove.push_back(pCallInst);
-                                pCallInst->replaceAllUsesWith(pReplace);
+                                if (pReplace)
+                                {
+                                    pCallInst->replaceAllUsesWith(pReplace);
+                                }
                             }
                         }
                     }
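
The temp-copy loops above exist because the intrinsic replacement code may
mutate the IR under the walk; the general pattern, as a small standalone
sketch (illustrative):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    #include <vector>

    // Snapshot the instructions first, then visit: the visitor is then free to
    // erase calls or split their parent blocks without invalidating this walk.
    static void VisitCallsSafely(llvm::Function& F, void (*visit)(llvm::CallInst*))
    {
        std::vector<llvm::CallInst*> calls;
        for (auto& BB : F)
            for (auto& I : BB)
                if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&I))
                    calls.push_back(CI);

        for (auto* CI : calls)
            visit(CI);
    }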
@@ -428,10 +444,9 @@ namespace SwrJit
         virtual void getAnalysisUsage(AnalysisUsage& AU) const {}
 
         JitManager* JM() { return B->JM(); }
-
-        Builder* B;
-
-        TargetArch mTarget;
+        Builder*    B;
+        TargetArch  mTarget;
+        Function*   mPfnScatter256;
 
         static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
     };
@@ -639,6 +654,69 @@ namespace SwrJit
 
         return cast<Instruction>(v32Gather);
     }
+    Instruction*
+    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    {
+        Builder* B           = pThis->B;
+        auto     pBase       = pCallInst->getArgOperand(0);
+        auto     vi1Mask     = pCallInst->getArgOperand(1);
+        auto     vi32Indices = pCallInst->getArgOperand(2);
+        auto     v32Src      = pCallInst->getArgOperand(3);
+        auto     i32Scale    = pCallInst->getArgOperand(4);
+
+        if (arch != AVX512)
+        {
+            // Call into a C function to do the scatter. This has significantly better compile
+            // performance than jitting a scalar scatter loop for every scatter
+            if (width == W256)
+            {
+                auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
+                B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
+            }
+            else
+            {
+                // Need to break up the 512 wide scatter into two 256 wide scatters
+                auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
+                auto indicesLo =
+                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
+                auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
+
+                auto mask = B->BITCAST(maskLo, B->mInt8Ty);
+                B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});
+
+                auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
+                auto indicesHi =
+                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
+                auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
+
+                mask = B->BITCAST(maskHi, B->mInt8Ty);
+                B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
+            }
+            return nullptr;
+        }
+
+        Value*    iMask;
+        Function* pX86IntrinFunc;
+        if (width == W256)
+        {
+            // No direct intrinsic is available in LLVM to scatter 8 elements with 32-bit
+            // indices, but we can use the scatter of 8 elements with 64-bit indices
+            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                       Intrinsic::x86_avx512_scatter_qps_512);
+
+            auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
+            iMask               = B->BITCAST(vi1Mask, B->mInt8Ty);
+            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
+        }
+        else if (width == W512)
+        {
+            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                       Intrinsic::x86_avx512_scatter_dps_512);
+            iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
+            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
+        }
+        return nullptr;
+    }
 
     // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
     // instructions
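
For reference, the two AVX-512 cases emitted above correspond to the following
hand-written intrinsics (a sketch, not part of the commit; assumes AVX512F):
the 8-wide path widens the 32-bit indices and uses vscatterqps, while the
16-wide path uses vscatterdps directly.

    #include <immintrin.h>
    #include <cstdint>

    // 8-wide: no 32-bit-index scatter of 8 floats exists, so zero-extend the
    // indices to 64 bit and scatter with vscatterqps.
    static void Scatter8(uint8_t* pBase, __m256i vi32Indices, __m256 vSrc, __mmask8 mask)
    {
        __m512i vi64Indices = _mm512_cvtepu32_epi64(vi32Indices);
        _mm512_mask_i64scatter_ps(pBase, mask, vi64Indices, vSrc, 1);
    }

    // 16-wide: vscatterdps takes 32-bit indices directly.
    static void Scatter16(uint8_t* pBase, __m512i vi32Indices, __m512 vSrc, __mmask16 mask)
    {
        _mm512_mask_i32scatter_ps(pBase, mask, vi32Indices, vSrc, 1);
    }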
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp
new file mode 100644 (file)
index 0000000..de81154
--- /dev/null
@@ -0,0 +1,49 @@
+/****************************************************************************
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file Scatter.cpp
+ *
+ * @brief Shader support library implementation for scatter emulation
+ *
+ * Notes:
+ *
+ ******************************************************************************/
+#include <stdarg.h>
+#include "common/os.h"
+#include "common/simdlib.hpp"
+
+extern "C" void ScatterPS_256(uint8_t* pBase, SIMD256::Integer vIndices, SIMD256::Float vSrc, uint8_t mask, uint32_t scale)
+{
+    OSALIGN(float, 32) src[8];
+    OSALIGN(uint32_t, 32) indices[8];
+
+    SIMD256::store_ps(src, vSrc);
+    SIMD256::store_si((SIMD256::Integer*)indices, vIndices);
+
+    DWORD index;
+    while (_BitScanForward(&index, mask))
+    {
+        mask &= ~(1 << index);
+
+        *(float*)(pBase + indices[index] * scale) = src[index];
+    }
+}
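
A hedged usage sketch of the new helper (assumes simdlib exposes load_ps /
load_si counterparts to the store_ps / store_si used above): scatter eight
floats into a table, masking off lanes 0 and 7.

    #include "common/os.h"
    #include "common/simdlib.hpp"
    #include <cstdint>

    extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float,
                                  uint8_t, uint32_t);

    void ScatterExample()
    {
        OSALIGN(float, 32) src[8]    = {0, 1, 2, 3, 4, 5, 6, 7};
        OSALIGN(uint32_t, 32) idx[8] = {7, 6, 5, 4, 3, 2, 1, 0};
        float table[8] = {};

        SIMD256::Float   vSrc     = SIMD256::load_ps(src);
        SIMD256::Integer vIndices = SIMD256::load_si((const SIMD256::Integer*)idx);

        // Mask 0x7e enables lanes 1..6; a scale of 4 turns the element indices
        // into byte offsets.
        ScatterPS_256((uint8_t*)table, vIndices, vSrc, 0x7e, sizeof(float));
    }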