gallivm: introduce 32x32->64bit lp_build_mul_32_lohi function
authorRoland Scheidegger <sroland@vmware.com>
Fri, 4 Nov 2016 03:55:09 +0000 (04:55 +0100)
committerRoland Scheidegger <sroland@vmware.com>
Tue, 8 Nov 2016 02:41:26 +0000 (03:41 +0100)
This is used by shader umul_hi/imul_hi functions (and soon by draw).
It's actually useful separating this out on its own, however the real
reason for doing it is because we're using an optimized sse2 version,
since the code llvm generates is atrocious (since there's no widening
mul in llvm, and it does not recognize the widening mul pattern, so
it generates code for real 64x64->64bit mul, which the cpu can't do
natively, in contrast to 32x32->64bit mul which it could do).

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
src/gallium/auxiliary/gallivm/lp_bld_arit.c
src/gallium/auxiliary/gallivm/lp_bld_arit.h
src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c

index 3ea0734331d0b71e9decef2e164659e4009346bb..3de46287f705691580c9637a149bb3d5ec1b4a72 100644 (file)
@@ -1091,6 +1091,156 @@ lp_build_mul(struct lp_build_context *bld,
    return res;
 }
 
+/*
+ * Widening mul, valid for 32x32 bit -> 64bit only.
+ * Result is low 32bits, high bits returned in res_hi.
+ */
+LLVMValueRef
+lp_build_mul_32_lohi(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     LLVMValueRef *res_hi)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+
+   assert(bld->type.width == 32);
+   assert(bld->type.floating == 0);
+   assert(bld->type.fixed == 0);
+   assert(bld->type.norm == 0);
+
+   /*
+    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
+    * for x86 simd is atrocious (even if the high bits weren't required),
+    * trying to handle real 64bit inputs (which of course can't happen due
+    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
+    * apparently llvm does not recognize this widening mul). This includes 6
+    * (instead of 2) pmuludq plus extra adds and shifts
+    * The same story applies to signed mul, albeit fixing this requires sse41.
+    * https://llvm.org/bugs/show_bug.cgi?id=30845
+    * So, whip up our own code, albeit only for length 4 and 8 (which
+    * should be good enough)...
+    */
+   if ((bld->type.length == 4 || bld->type.length == 8) &&
+       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
+        util_cpu_caps.has_sse4_1)) {
+      const char *intrinsic = NULL;
+      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
+      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
+      struct lp_type type_wide = lp_wider_type(bld->type);
+      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
+      unsigned i;
+      for (i = 0; i < bld->type.length; i += 2) {
+         shuf[i] = lp_build_const_int32(gallivm, i+1);
+         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+      }
+      shuf_vec = LLVMConstVector(shuf, bld->type.length);
+      aeven = a;
+      beven = b;
+      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
+      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
+
+      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
+         if (bld->type.sign) {
+            intrinsic = "llvm.x86.avx2.pmul.dq";
+         } else {
+            intrinsic = "llvm.x86.avx2.pmulu.dq";
+         }
+         muleven = lp_build_intrinsic_binary(builder, intrinsic,
+                                             wider_type, aeven, beven);
+         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
+                                            wider_type, aodd, bodd);
+      }
+      else {
+         /* for consistent naming look elsewhere... */
+         if (bld->type.sign) {
+            intrinsic = "llvm.x86.sse41.pmuldq";
+         } else {
+            intrinsic = "llvm.x86.sse2.pmulu.dq";
+         }
+         /*
+          * XXX If we only have AVX but not AVX2 this is a pain.
+          * lp_build_intrinsic_binary_anylength() can't handle it
+          * (due to src and dst type not being identical).
+          */
+         if (bld->type.length == 8) {
+            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
+            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
+            LLVMValueRef muleven2[2], mulodd2[2];
+            struct lp_type type_wide_half = type_wide;
+            LLVMTypeRef wtype_half;
+            type_wide_half.length = 2;
+            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
+            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
+            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
+            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
+            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
+            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
+            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
+            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
+            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
+            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                    wtype_half, aevenlo, bevenlo);
+            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                   wtype_half, aoddlo, boddlo);
+            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                    wtype_half, aevenhi, bevenhi);
+            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                   wtype_half, aoddhi, boddhi);
+            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
+            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
+
+         }
+         else {
+            muleven = lp_build_intrinsic_binary(builder, intrinsic,
+                                                wider_type, aeven, beven);
+            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
+                                               wider_type, aodd, bodd);
+         }
+      }
+      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
+      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
+
+      for (i = 0; i < bld->type.length; i += 2) {
+         shuf[i] = lp_build_const_int32(gallivm, i + 1);
+         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
+      }
+      shuf_vec = LLVMConstVector(shuf, bld->type.length);
+      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
+
+      for (i = 0; i < bld->type.length; i += 2) {
+         shuf[i] = lp_build_const_int32(gallivm, i);
+         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
+      }
+      shuf_vec = LLVMConstVector(shuf, bld->type.length);
+      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
+   }
+   else {
+      LLVMValueRef tmp;
+      struct lp_type type_tmp;
+      LLVMTypeRef wide_type, cast_type;
+
+      type_tmp = bld->type;
+      type_tmp.width *= 2;
+      wide_type = lp_build_vec_type(gallivm, type_tmp);
+      type_tmp = bld->type;
+      type_tmp.length *= 2;
+      cast_type = lp_build_vec_type(gallivm, type_tmp);
+
+      if (bld->type.sign) {
+         a = LLVMBuildSExt(builder, a, wide_type, "");
+         b = LLVMBuildSExt(builder, b, wide_type, "");
+      } else {
+         a = LLVMBuildZExt(builder, a, wide_type, "");
+         b = LLVMBuildZExt(builder, b, wide_type, "");
+      }
+      tmp = LLVMBuildMul(builder, a, b, "");
+      tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
+      *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
+      return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
+   }
+}
+
 
 /* a * b + c */
 LLVMValueRef
index 622b930a937931c2480e3dff6a059210bce3a844..5d48b1c837dd25729297ced9e5b80287bfdc6b75 100644 (file)
@@ -76,6 +76,12 @@ lp_build_mul(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b);
 
+LLVMValueRef
+lp_build_mul_32_lohi(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     LLVMValueRef *res_hi);
+
 LLVMValueRef
 lp_build_mul_imm(struct lp_build_context *bld,
                  LLVMValueRef a,
index 2e837afe290da5e02fc01c7991f13ac1db379ae6..72d4579a042a1498d72863154053087419f3f316 100644 (file)
@@ -842,26 +842,15 @@ imul_hi_emit(
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
    struct lp_build_context *int_bld = &bld_base->int_bld;
-   struct lp_type type = int_bld->type;
-   LLVMValueRef src0, src1;
-   LLVMValueRef dst64;
-   LLVMTypeRef typeRef;
-
-   assert(type.width == 32);
-   type.width = 64;
-   typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
-   src0 = LLVMBuildSExt(builder, emit_data->args[0], typeRef, "");
-   src1 = LLVMBuildSExt(builder, emit_data->args[1], typeRef, "");
-   dst64 = LLVMBuildMul(builder, src0, src1, "");
-   dst64 = LLVMBuildAShr(
-            builder, dst64,
-            lp_build_const_vec(bld_base->base.gallivm, type, 32), "");
-   type.width = 32;
-   typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
-   emit_data->output[emit_data->chan] =
-         LLVMBuildTrunc(builder, dst64, typeRef, "");
+   LLVMValueRef hi_bits;
+
+   assert(int_bld->type.width == 32);
+
+   /* low result bits are tossed away */
+   lp_build_mul_32_lohi(int_bld, emit_data->args[0],
+                        emit_data->args[0], &hi_bits);
+   emit_data->output[emit_data->chan] = hi_bits;
 }
 
 /* TGSI_OPCODE_UMUL_HI */
@@ -871,26 +860,15 @@ umul_hi_emit(
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
-   struct lp_type type = uint_bld->type;
-   LLVMValueRef src0, src1;
-   LLVMValueRef dst64;
-   LLVMTypeRef typeRef;
-
-   assert(type.width == 32);
-   type.width = 64;
-   typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
-   src0 = LLVMBuildZExt(builder, emit_data->args[0], typeRef, "");
-   src1 = LLVMBuildZExt(builder, emit_data->args[1], typeRef, "");
-   dst64 = LLVMBuildMul(builder, src0, src1, "");
-   dst64 = LLVMBuildLShr(
-            builder, dst64,
-            lp_build_const_vec(bld_base->base.gallivm, type, 32), "");
-   type.width = 32;
-   typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
-   emit_data->output[emit_data->chan] =
-         LLVMBuildTrunc(builder, dst64, typeRef, "");
+   LLVMValueRef hi_bits;
+
+   assert(uint_bld->type.width == 32);
+
+   /* low result bits are tossed away */
+   lp_build_mul_32_lohi(uint_bld, emit_data->args[0],
+                        emit_data->args[0], &hi_bits);
+   emit_data->output[emit_data->chan] = hi_bits;
 }
 
 /* TGSI_OPCODE_MAX */