swr/rast: Lower PERMD and PERMPS to x86.

author George Kyriazis <george.kyriazis@intel.com>

Tue, 20 Mar 2018 23:13:35 +0000 (18:13 -0500)

committer George Kyriazis <george.kyriazis@intel.com>

Wed, 18 Apr 2018 15:51:38 +0000 (10:51 -0500)
author George Kyriazis <george.kyriazis@intel.com>
Tue, 20 Mar 2018 23:13:35 +0000 (18:13 -0500)
committer George Kyriazis <george.kyriazis@intel.com>
Wed, 18 Apr 2018 15:51:38 +0000 (10:51 -0500)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp

index 68695c46c8148198bec75a8ac0b4071de4eb9b37..d8ec885308abc28ca60983694c223590ae7e566b 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -555,7 +555,7 @@ namespace SwrJit
              // 256i - 0    1    2    3    4    5    6    7
              //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
  
-            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
              // after PERMD: move and pack xy components into each 128bit lane
              // 256i - 0    1    2    3    4    5    6    7
              //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
@@ -565,7 +565,7 @@ namespace SwrJit
              if (info.numComps > 2)
              {
                  Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
-                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+                vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
              }
  
              for (uint32_t i = 0; i < 4; i++)
@@ -644,7 +644,7 @@ namespace SwrJit
              // 256i - 0    1    2    3    4    5    6    7
              //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
  
-            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+            Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
              // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
              // 256i - 0    1    2    3    4    5    6    7
              //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
@@ -653,7 +653,7 @@ namespace SwrJit
              Value* vi128ZW = nullptr;
              if (info.numComps > 2)
              {
-                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+                vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
              }
  
              // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp

index 54987c77246f49fd45cf3a47f4b3abc0269382b5..aa9e2dddee8668182ebfd9af154d4c949c817ce4 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -601,76 +601,6 @@ namespace SwrJit
          return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
      }
  
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPERMD operation (shuffle 32 bit integer values 
-    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
-    /// platform, emulate it
-    /// @param a - 256bit SIMD lane(8x32bit) of integer values.
-    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
-    Value *Builder::PERMD(Value* a, Value* idx)
-    {
-        Value* res;
-        // use avx2 permute instruction if available
-        if(JM()->mArch.AVX2())
-        {
-            res = VPERMD(a, idx);
-        }
-        else
-        {
-            if (isa<Constant>(idx))
-            {
-                res = VSHUFFLE(a, a, idx);
-            }
-            else
-            {
-                res = VUNDEF_I();
-                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
-                {
-                    Value* pIndex = VEXTRACT(idx, C(l));
-                    Value* pVal = VEXTRACT(a, pIndex);
-                    res = VINSERT(res, pVal, C(l));
-                }
-            }
-        }
-        return res;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPERMPS operation (shuffle 32 bit float values 
-    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
-    /// platform, emulate it
-    /// @param a - 256bit SIMD lane(8x32bit) of float values.
-    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
-    Value *Builder::PERMPS(Value* a, Value* idx)
-    {
-        Value* res;
-        // use avx2 permute instruction if available
-        if (JM()->mArch.AVX2())
-        {
-            // llvm 3.6.0 swapped the order of the args to vpermd
-            res = VPERMPS(idx, a);
-        }
-        else
-        {
-            if (isa<Constant>(idx))
-            {
-                res = VSHUFFLE(a, a, idx);
-            }
-            else
-            {
-                res = VUNDEF_F();
-                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
-                {
-                    Value* pIndex = VEXTRACT(idx, C(l));
-                    Value* pVal = VEXTRACT(a, pIndex);
-                    res = VINSERT(res, pVal, C(l));
-                }
-            }
-        }
-
-        return res;
-    }
-
      //////////////////////////////////////////////////////////////////////////
      /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
      /// in LLVM IR.  If not supported on the underlying platform, emulate it
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h

index 343a9b0367032758925bdca966b029abec92e4c1..7308821c89eda91bbbd2e65f369eebb55b990975 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -113,8 +113,6 @@ Value *JOIN_16(Value *a, Value *b);
  Value *PSHUFB(Value* a, Value* b);
  Value *PMOVSXBD(Value* a);
  Value *PMOVSXWD(Value* a);
-Value *PERMD(Value* a, Value* idx);
-Value *PERMPS(Value* a, Value* idx);
  Value *CVTPH2PS(Value* a, const llvm::Twine& name = "");
  Value *CVTPS2PH(Value* a, Value* rounding);
  Value *PMAXSD(Value* a, Value* b);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp

index f9293aa3b4b13959c313c72eef4db4525c580140..da6d982d0e3bd771bb1f4f2bfdf0424d1930197c 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1955,8 +1955,8 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
          Value *vi128XY_hi = nullptr;
          if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
          {
-            vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
-            vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+            vi128XY_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+            vi128XY_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
  
              // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
              // 256i - 0    1    2    3    4    5    6    7
@@ -1968,8 +1968,8 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
          Value *vi128ZW_hi = nullptr;
          if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
          {
-            vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
-            vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+            vi128ZW_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+            vi128ZW_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
          }
  
          // init denormalize variables if needed
@@ -2306,8 +2306,8 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
              // 256i - 0    1    2    3    4    5    6    7
              //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
  
-            vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
-            vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128XY_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128XY_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
  
              // after PERMD: move and pack xy components into each 128bit lane
              // 256i - 0    1    2    3    4    5    6    7
@@ -2325,8 +2325,8 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
              Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
              Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
  
-            vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
-            vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128ZW_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128ZW_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
          }
  
          // init denormalize variables if needed
@@ -2547,7 +2547,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
              // 256i - 0    1    2    3    4    5    6    7
              //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
  
-            vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
              // after PERMD: move and pack xy components into each 128bit lane
              // 256i - 0    1    2    3    4    5    6    7
              //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
@@ -2557,7 +2557,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
          Value* vi128ZW = nullptr;
          if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
              Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
-            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
          }
  
          // init denormalize variables if needed
author	George Kyriazis <george.kyriazis@intel.com>
	Tue, 20 Mar 2018 23:13:35 +0000 (18:13 -0500)
committer	George Kyriazis <george.kyriazis@intel.com>
	Wed, 18 Apr 2018 15:51:38 +0000 (10:51 -0500)
src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp		patch \| blob \| history