swr/rast: add cvt instructions in x86 lowering pass
authorGeorge Kyriazis <george.kyriazis@intel.com>
Thu, 5 Apr 2018 22:51:02 +0000 (17:51 -0500)
committerGeorge Kyriazis <george.kyriazis@intel.com>
Wed, 18 Apr 2018 15:51:38 +0000 (10:51 -0500)
Support generic VCVTPD2PS and VCVTPH2PS in x86 lowering pass.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp

index 2636e60ae9aa91902c175f9da121da15694ae3fa..4a7d2e9a543969239f5765027973751328335255 100644 (file)
@@ -42,28 +42,26 @@ inst_aliases = {
 }
 
 intrinsics = [
-    ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd4FP64Ty'],
-    ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdFP32Ty'],
-    ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16FP32Ty'],
-    ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdInt32Ty'],
-    ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16Int32Ty'],
-    ['VRCPPS', 'x86_avx_rcp_ps_256', ['a'], 'mSimdFP32Ty'],
-    ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding'], 'mSimdFP32Ty'],
-    ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control'], 'mInt32Ty'],
-    ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b'], 'mSimd32Int8Ty'],
-    ['VPERMD', 'x86_avx2_permd', ['a', 'idx'], 'mSimdInt32Ty'],
-    ['VPERMPS', 'x86_avx2_permps', ['idx', 'a'], 'mSimdFP32Ty'],
-    ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a'], 'mSimdFP32Ty'],
-    ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a'], 'mSimdFP32Ty'],
-    ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round'], 'mSimdFP16Ty'],
-    ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b'], 'mSimdFP32Ty'],
-    ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b'], 'mInt32Ty'],
-    ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b'], 'mInt32Ty'],
-    ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c'], 'mSimdFP32Ty'],
-    ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a'], 'mInt32Ty'],
-    ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b'], 'mSimdInt32Ty'],
-    ['PDEP32', 'x86_bmi_pdep_32', ['a', 'b'], 'mInt32Ty'],
-    ['RDTSC', 'x86_rdtsc', [], 'mInt64Ty'],
+    ['VGATHERPD',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
+    ['VGATHERPS',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
+    ['VGATHERDD',   ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
+    ['VRCPPS',      ['a'], 'a'],
+    ['VROUND',      ['a', 'rounding'], 'a'],
+    ['BEXTR_32',    ['src', 'control'], 'src'],
+    ['VPSHUFB',     ['a', 'b'], 'a'],
+    ['VPERMD',      ['a', 'idx'], 'a'],
+    ['VPERMPS',     ['idx', 'a'], 'a'],
+    ['VCVTPD2PS',   ['a'], 'VectorType::get(mFP32Ty, a->getType()->getVectorNumElements())'],
+    ['VCVTPH2PS',   ['a'], 'VectorType::get(mFP32Ty, a->getType()->getVectorNumElements())'],
+    ['VCVTPS2PH',   ['a', 'round'], 'mSimdFP16Ty'],
+    ['VHSUBPS',     ['a', 'b'], 'a'],
+    ['VPTESTC',     ['a', 'b'], 'mInt32Ty'],
+    ['VPTESTZ',     ['a', 'b'], 'mInt32Ty'],
+    ['VFMADDPS',    ['a', 'b', 'c'], 'a'],
+    ['VMOVMSKPS',   ['a'], 'mInt32Ty'],
+    ['VPHADDD',     ['a', 'b'], 'a'],
+    ['PDEP32',      ['a', 'b'], 'a'],
+    ['RDTSC',       [], 'mInt64Ty'],
 ]
 
 llvm_intrinsics = [
@@ -231,19 +229,31 @@ def generate_meta_h(output_dir):
 
     functions = []
     for inst in intrinsics:
+        name = inst[0]
+        args = inst[1]
+        ret = inst[2]
+
         #print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], len(inst[2])))
-        if len(inst[2]) != 0:
-            declargs = 'Value* ' + ', Value* '.join(inst[2])
-            decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], declargs)
+        if len(args) != 0:
+            declargs = 'Value* ' + ', Value* '.join(args)
+            decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (name, declargs)
         else:
-            decl = 'Value* %s(const llvm::Twine& name = "")' % (inst[0])
+            decl = 'Value* %s(const llvm::Twine& name = "")' % (name)
+
+        # determine the return type of the intrinsic. It can either be:
+        # - type of one of the input arguments
+        # - snippet of code to set the return type
+        
+        if ret in args:
+            returnTy = ret + '->getType()'
+        else:
+            returnTy = ret
 
         functions.append({
             'decl'      : decl,
-            'name'      : inst[0],
-            'intrin'    : inst[1],
-            'args'      : inst[2],
-            'returnType': inst[3]
+            'name'      : name,
+            'args'      : args,
+            'returnType': returnTy
         })
 
     MakoTemplateWriter.to_file(
index eccf0ad0d6d4922a4d63b0cc469d139d02e4e128..c7912785b7b6912432f06a4a1ae6a98b931156d1 100644 (file)
@@ -176,13 +176,6 @@ namespace SwrJit
         return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
     }
 
-    Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
-    {
-        AssertMemoryUsageParams(pBase, usage);
-
-        return VGATHERPS_16(vSrc, pBase, vIndices, vMask, C(scale));
-    }
-
     //////////////////////////////////////////////////////////////////////////
     /// @brief Generate a masked gather operation in LLVM IR.  If not  
     /// supported on the underlying platform, emulate it with loads
@@ -198,13 +191,6 @@ namespace SwrJit
         return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
     }
 
-    Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
-    {
-        AssertMemoryUsageParams(pBase, usage);
-
-        return VGATHERDD_16(vSrc, pBase, vIndices, vMask, C(scale));
-    }
-
     //////////////////////////////////////////////////////////////////////////
     /// @brief Generate a masked gather operation in LLVM IR.  If not
     /// supported on the underlying platform, emulate it with loads
index f229da38a9404a64ff396e1ae01bafd0babd30bb..9ccac4f80024ca3224f33704271eb05e30c1f853 100644 (file)
@@ -78,13 +78,10 @@ void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
 
 virtual Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
-Value *GATHERPS_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-
 void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
     Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
 virtual Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
 void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
     Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
index 0fbfd211ef7b176d689c2dcd6adcd7d2ecc672bd..8d659d08e00bfce38254c06d086ebed6abe8df33 100644 (file)
@@ -1563,10 +1563,8 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
                     if (bFP)
                     {
                         // extract 128 bit lanes to sign extend each component
-                        /// @todo Force 8-wide cvt until we support generic cvt in x86 lowering pass
-                        Function* pCvtPh2Ps = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_vcvtph2ps_256);
-                        Value *temp_lo = CALL(pCvtPh2Ps, BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
-                        Value *temp_hi = CALL(pCvtPh2Ps, BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
+                        Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
+                        Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
 
                         vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
                     }
index 11a2397c43dc5f1dadd66d3af1a47fec32ead1e4..b27335f0601a9fc59f20354aa00310d51bb18bd6 100644 (file)
@@ -76,8 +76,6 @@ namespace SwrJit
         {"meta.intrinsic.VROUND",          Intrinsic::x86_avx_round_ps_256},
         {"meta.intrinsic.BEXTR_32",        Intrinsic::x86_bmi_bextr_32},
         {"meta.intrinsic.VPSHUFB",         Intrinsic::x86_avx2_pshuf_b},
-        {"meta.intrinsic.VCVTPD2PS",       Intrinsic::x86_avx_cvt_pd2_ps_256},
-        {"meta.intrinsic.VCVTPH2PS",       Intrinsic::x86_vcvtph2ps_256},
         {"meta.intrinsic.VCVTPS2PH",       Intrinsic::x86_vcvtps2ph_256},
         {"meta.intrinsic.VHSUBPS",         Intrinsic::x86_avx_hsub_ps_256},
         {"meta.intrinsic.VPTESTC",         Intrinsic::x86_avx_ptestc_256},
@@ -101,27 +99,27 @@ namespace SwrJit
         {"meta.intrinsic.VPERMPS",     {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VPERM_EMU}},
         {"meta.intrinsic.VPERMD",      {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VPERM_EMU}},
         {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
         {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,          Intrinsic::not_intrinsic},                      NO_EMU}},
+        {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256,               Intrinsic::not_intrinsic},                      NO_EMU}},
     },
     {   // AVX2
         {"meta.intrinsic.VRCPPS",      {{Intrinsic::x86_avx_rcp_ps_256,              Intrinsic::not_intrinsic},                      NO_EMU}},
         {"meta.intrinsic.VPERMPS",     {{Intrinsic::x86_avx2_permps,                 Intrinsic::not_intrinsic},                      VPERM_EMU}},
         {"meta.intrinsic.VPERMD",      {{Intrinsic::x86_avx2_permd,                  Intrinsic::not_intrinsic},                      VPERM_EMU}},
         {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
         {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,          Intrinsic::not_intrinsic},                      NO_EMU}},
+        {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256,               Intrinsic::not_intrinsic},                      NO_EMU}},
     },
     {   // AVX512
         {"meta.intrinsic.VRCPPS",      {{Intrinsic::x86_avx512_rcp14_ps_256,         Intrinsic::x86_avx512_rcp14_ps_512},            NO_EMU}},
         {"meta.intrinsic.VPERMPS",     {{Intrinsic::x86_avx512_mask_permvar_sf_256,  Intrinsic::x86_avx512_mask_permvar_sf_512},     NO_EMU}},
         {"meta.intrinsic.VPERMD",      {{Intrinsic::x86_avx512_mask_permvar_si_256,  Intrinsic::x86_avx512_mask_permvar_si_512},     NO_EMU}},
         {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
         {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
+        {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx512_mask_cvtpd2ps_256,    Intrinsic::x86_avx512_mask_cvtpd2ps_512 },      NO_EMU}},
+        {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_avx512_mask_vcvtph2ps_256,   Intrinsic::x86_avx512_mask_vcvtph2ps_512 },     NO_EMU}},
     }
     };