swr/rast: Replace x86 VMOVMSK with llvm-only implementation
author George Kyriazis <george.kyriazis@intel.com>
Tue, 10 Apr 2018 17:03:41 +0000 (12:03 -0500)
committer George Kyriazis <george.kyriazis@intel.com>
Wed, 18 Apr 2018 15:51:38 +0000 (10:51 -0500)
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp

index 9c1e9e0ac8f6bfb5a679111a00d0cf4d4ec2f856..bced6576443979f7ec8910bd6e37d4951dff3851 100644 (file)
@@ -58,7 +58,6 @@ intrinsics = [
     ['VPTESTC',     ['a', 'b'], 'mInt32Ty'],
     ['VPTESTZ',     ['a', 'b'], 'mInt32Ty'],
     ['VFMADDPS',    ['a', 'b', 'c'], 'a'],
-    ['VMOVMSKPS',   ['a'], 'mInt32Ty'],
     ['VPHADDD',     ['a', 'b'], 'a'],
     ['PDEP32',      ['a', 'b'], 'a'],
     ['RDTSC',       [], 'mInt64Ty'],
index f0cd4413d3e03f0cb0ac84f875d2215f5456f9e4..5b70b29afbabde76081a2055c3a4a25714b6305f 100644 (file)
@@ -608,7 +608,7 @@ namespace SwrJit
         pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
         pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
 
-        Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
+        Value* pMask = VMOVMSK(vMask);
 
         // Setup loop basic block
         BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
index aa9e2dddee8668182ebfd9af154d4c949c817ce4..f8936930b7e53a0aebc9dbf357958cd0e8693259 100644 (file)
@@ -525,6 +525,28 @@ namespace SwrJit
         return S_EXT(mask, mSimd16Int32Ty);
     }
 
+    /// @brief Convert <Nxi1> llvm mask to integer
+    Value *Builder::VMOVMSK(Value* mask)
+    {
+        SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty);
+        uint32_t numLanes = mask->getType()->getVectorNumElements();
+        Value* i32Result;
+        if (numLanes == 8)
+        {
+            i32Result = BITCAST(mask, mInt8Ty);
+        }
+        else if (numLanes == 16)
+        {
+            i32Result = BITCAST(mask, mInt16Ty);
+        }
+        else
+        {
+            SWR_ASSERT("Unsupported vector width");
+            i32Result = BITCAST(mask, mInt8Ty);
+        }
+        return Z_EXT(i32Result, mInt32Ty);
+    }
+
     //////////////////////////////////////////////////////////////////////////
     /// @brief Generate a VPSHUFB operation in LLVM IR.  If not  
     /// supported on the underlying platform, emulate it
@@ -768,8 +790,7 @@ namespace SwrJit
     /// @brief pop count on vector mask (e.g. <8 x i1>)
     Value* Builder::VPOPCNT(Value* a)
     {
-        Value* b = BITCAST(VMASK(a), mSimdFP32Ty);
-        return POPCNT(VMOVMSKPS(b));
+        return POPCNT(VMOVMSK(a));
     }
 
     //////////////////////////////////////////////////////////////////////////
index 7308821c89eda91bbbd2e65f369eebb55b990975..bd4be9ffe2a91ef05c6a789d76ba5db2bbe2c279 100644 (file)
@@ -102,6 +102,8 @@ Value *MASK_16(Value *vmask);
 Value *VMASK(Value *mask);
 Value *VMASK_16(Value *mask);
 
+Value *VMOVMSK(Value *mask);
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief functions that build IR to call x86 intrinsics directly, or
 /// emulate them with other instructions if not available on the host
index 7cfa77249802d57811c2d49f481a93966194c372..856d67d2bc980b8ae76e75acd9f53fc1a04af934 100644 (file)
@@ -79,7 +79,6 @@ namespace SwrJit
         {"meta.intrinsic.VPTESTC",         Intrinsic::x86_avx_ptestc_256},
         {"meta.intrinsic.VPTESTZ",         Intrinsic::x86_avx_ptestz_256},
         {"meta.intrinsic.VFMADDPS",        Intrinsic::x86_fma_vfmadd_ps_256},
-        {"meta.intrinsic.VMOVMSKPS",       Intrinsic::x86_avx_movmsk_ps_256},
         {"meta.intrinsic.VPHADDD",         Intrinsic::x86_avx2_phadd_d},
         {"meta.intrinsic.PDEP32",          Intrinsic::x86_bmi_pdep_32},
         {"meta.intrinsic.RDTSC",           Intrinsic::x86_rdtsc},