panfrost: Specialize compute vs frag shader init

[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_logic.c
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c

index f56b61bf24891212624e4943a8cdef9838c5cec1..315977ae7450e619d6d6ff9d2c0ddc200f7adcb2 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -32,6 +32,7 @@
   * @author Jose Fonseca <jfonseca@vmware.com>
   */
  
+#include <llvm/Config/llvm-config.h>
  
  #include "util/u_cpu_detect.h"
  #include "util/u_memory.h"
@@ -39,6 +40,7 @@
  
  #include "lp_bld_type.h"
  #include "lp_bld_const.h"
+#include "lp_bld_swizzle.h"
  #include "lp_bld_init.h"
  #include "lp_bld_intr.h"
  #include "lp_bld_debug.h"
@@ -68,14 +70,17 @@
  /**
   * Build code to compare two values 'a' and 'b' of 'type' using the given func.
   * \param func  one of PIPE_FUNC_x
+ * \param ordered  if true, float comparisons use LLVM's ordered predicates,
+ *                 otherwise the unordered ones are used.
   * The result values will be 0 for false or ~0 for true.
   */
-LLVMValueRef
-lp_build_compare(struct gallivm_state *gallivm,
-                 const struct lp_type type,
-                 unsigned func,
-                 LLVMValueRef a,
-                 LLVMValueRef b)
+static LLVMValueRef
+lp_build_compare_ext(struct gallivm_state *gallivm,
+                     const struct lp_type type,
+                     unsigned func,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     boolean ordered)
  {
     LLVMBuilderRef builder = gallivm->builder;
     LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
@@ -84,8 +89,6 @@ lp_build_compare(struct gallivm_state *gallivm,
     LLVMValueRef cond;
     LLVMValueRef res;
  
-   assert(func >= PIPE_FUNC_NEVER);
-   assert(func <= PIPE_FUNC_ALWAYS);
     assert(lp_check_value(type, a));
     assert(lp_check_value(type, b));
  
@@ -94,216 +97,37 @@ lp_build_compare(struct gallivm_state *gallivm,
     if(func == PIPE_FUNC_ALWAYS)
        return ones;
  
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   /*
-    * There are no unsigned integer comparison instructions in SSE.
-    */
-
-   if (!type.floating && !type.sign &&
-       type.width * type.length == 128 &&
-       util_cpu_caps.has_sse2 &&
-       (func == PIPE_FUNC_LESS ||
-        func == PIPE_FUNC_LEQUAL ||
-        func == PIPE_FUNC_GREATER ||
-        func == PIPE_FUNC_GEQUAL) &&
-       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
-         debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
-                      __FUNCTION__, type.length, type.width);
-   }
-#endif
-
-#if HAVE_LLVM < 0x0207
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width * type.length == 128) {
-      if(type.floating && util_cpu_caps.has_sse) {
-         /* float[4] comparison */
-         LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
-         LLVMValueRef args[3];
-         unsigned cc;
-         boolean swap;
-
-         swap = FALSE;
-         switch(func) {
-         case PIPE_FUNC_EQUAL:
-            cc = 0;
-            break;
-         case PIPE_FUNC_NOTEQUAL:
-            cc = 4;
-            break;
-         case PIPE_FUNC_LESS:
-            cc = 1;
-            break;
-         case PIPE_FUNC_LEQUAL:
-            cc = 2;
-            break;
-         case PIPE_FUNC_GREATER:
-            cc = 1;
-            swap = TRUE;
-            break;
-         case PIPE_FUNC_GEQUAL:
-            cc = 2;
-            swap = TRUE;
-            break;
-         default:
-            assert(0);
-            return lp_build_undef(gallivm, type);
-         }
-
-         if(swap) {
-            args[0] = b;
-            args[1] = a;
-         }
-         else {
-            args[0] = a;
-            args[1] = b;
-         }
-
-         args[2] = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), cc, 0);
-         res = lp_build_intrinsic(builder,
-                                  "llvm.x86.sse.cmp.ps",
-                                  vec_type,
-                                  args, 3);
-         res = LLVMBuildBitCast(builder, res, int_vec_type, "");
-         return res;
-      }
-      else if(util_cpu_caps.has_sse2) {
-         /* int[4] comparison */
-         static const struct {
-            unsigned swap:1;
-            unsigned eq:1;
-            unsigned gt:1;
-            unsigned not:1;
-         } table[] = {
-            {0, 0, 0, 1}, /* PIPE_FUNC_NEVER */
-            {1, 0, 1, 0}, /* PIPE_FUNC_LESS */
-            {0, 1, 0, 0}, /* PIPE_FUNC_EQUAL */
-            {0, 0, 1, 1}, /* PIPE_FUNC_LEQUAL */
-            {0, 0, 1, 0}, /* PIPE_FUNC_GREATER */
-            {0, 1, 0, 1}, /* PIPE_FUNC_NOTEQUAL */
-            {1, 0, 1, 1}, /* PIPE_FUNC_GEQUAL */
-            {0, 0, 0, 0}  /* PIPE_FUNC_ALWAYS */
-         };
-         const char *pcmpeq;
-         const char *pcmpgt;
-         LLVMValueRef args[2];
-         LLVMValueRef res;
-         LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
-
-         switch (type.width) {
-         case 8:
-            pcmpeq = "llvm.x86.sse2.pcmpeq.b";
-            pcmpgt = "llvm.x86.sse2.pcmpgt.b";
-            break;
-         case 16:
-            pcmpeq = "llvm.x86.sse2.pcmpeq.w";
-            pcmpgt = "llvm.x86.sse2.pcmpgt.w";
-            break;
-         case 32:
-            pcmpeq = "llvm.x86.sse2.pcmpeq.d";
-            pcmpgt = "llvm.x86.sse2.pcmpgt.d";
-            break;
-         default:
-            assert(0);
-            return lp_build_undef(gallivm, type);
-         }
-
-         /* There are no unsigned comparison instructions. So flip the sign bit
-          * so that the results match.
-          */
-         if (table[func].gt && !type.sign) {
-            LLVMValueRef msb = lp_build_const_int_vec(gallivm, type, (unsigned long long)1 << (type.width - 1));
-            a = LLVMBuildXor(builder, a, msb, "");
-            b = LLVMBuildXor(builder, b, msb, "");
-         }
-
-         if(table[func].swap) {
-            args[0] = b;
-            args[1] = a;
-         }
-         else {
-            args[0] = a;
-            args[1] = b;
-         }
-
-         if(table[func].eq)
-            res = lp_build_intrinsic(builder, pcmpeq, vec_type, args, 2);
-         else if (table[func].gt)
-            res = lp_build_intrinsic(builder, pcmpgt, vec_type, args, 2);
-         else
-            res = LLVMConstNull(vec_type);
-
-         if(table[func].not)
-            res = LLVMBuildNot(builder, res, "");
-
-         return res;
-      }
-   } /* if (type.width * type.length == 128) */
-#endif
-#endif /* HAVE_LLVM < 0x0207 */
-
-   /* XXX: It is not clear if we should use the ordered or unordered operators */
+   assert(func > PIPE_FUNC_NEVER);
+   assert(func < PIPE_FUNC_ALWAYS);
  
     if(type.floating) {
        LLVMRealPredicate op;
        switch(func) {
-      case PIPE_FUNC_NEVER:
-         op = LLVMRealPredicateFalse;
-         break;
-      case PIPE_FUNC_ALWAYS:
-         op = LLVMRealPredicateTrue;
-         break;
        case PIPE_FUNC_EQUAL:
-         op = LLVMRealUEQ;
+         op = ordered ? LLVMRealOEQ : LLVMRealUEQ;
           break;
        case PIPE_FUNC_NOTEQUAL:
-         op = LLVMRealUNE;
+         op = ordered ? LLVMRealONE : LLVMRealUNE;
           break;
        case PIPE_FUNC_LESS:
-         op = LLVMRealULT;
+         op = ordered ? LLVMRealOLT : LLVMRealULT;
           break;
        case PIPE_FUNC_LEQUAL:
-         op = LLVMRealULE;
+         op = ordered ? LLVMRealOLE : LLVMRealULE;
           break;
        case PIPE_FUNC_GREATER:
-         op = LLVMRealUGT;
+         op = ordered ? LLVMRealOGT : LLVMRealUGT;
           break;
        case PIPE_FUNC_GEQUAL:
-         op = LLVMRealUGE;
+         op = ordered ? LLVMRealOGE : LLVMRealUGE;
           break;
        default:
           assert(0);
           return lp_build_undef(gallivm, type);
        }
  
-#if HAVE_LLVM >= 0x0207
        cond = LLVMBuildFCmp(builder, op, a, b, "");
        res = LLVMBuildSExt(builder, cond, int_vec_type, "");
-#else
-      if (type.length == 1) {
-         cond = LLVMBuildFCmp(builder, op, a, b, "");
-         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
-      }
-      else {
-         unsigned i;
-
-         res = LLVMGetUndef(int_vec_type);
-
-         debug_printf("%s: warning: using slow element-wise float"
-                      " vector comparison\n", __FUNCTION__);
-         for (i = 0; i < type.length; ++i) {
-            LLVMValueRef index = lp_build_const_int32(gallivm, i);
-            cond = LLVMBuildFCmp(builder, op,
-                                 LLVMBuildExtractElement(builder, a, index, ""),
-                                 LLVMBuildExtractElement(builder, b, index, ""),
-                                 "");
-            cond = LLVMBuildSelect(builder, cond,
-                                   LLVMConstExtractElement(ones, index),
-                                   LLVMConstExtractElement(zeros, index),
-                                   "");
-            res = LLVMBuildInsertElement(builder, res, cond, index, "");
-         }
-      }
-#endif
     }
     else {
        LLVMIntPredicate op;
@@ -331,48 +155,84 @@ lp_build_compare(struct gallivm_state *gallivm,
           return lp_build_undef(gallivm, type);
        }
  
-#if HAVE_LLVM >= 0x0207
        cond = LLVMBuildICmp(builder, op, a, b, "");
        res = LLVMBuildSExt(builder, cond, int_vec_type, "");
-#else
-      if (type.length == 1) {
-         cond = LLVMBuildICmp(builder, op, a, b, "");
-         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
-      }
-      else {
-         unsigned i;
+   }
  
-         res = LLVMGetUndef(int_vec_type);
+   return res;
+}
  
-         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
-            debug_printf("%s: using slow element-wise int"
-                         " vector comparison\n", __FUNCTION__);
-         }
+/**
+ * Build code to compare two values 'a' and 'b' of 'type' using the given func.
+ * \param func  one of PIPE_FUNC_x
+ * The result values will be 0 for false or ~0 for true.
+ */
+LLVMValueRef
+lp_build_compare(struct gallivm_state *gallivm,
+                 const struct lp_type type,
+                 unsigned func,
+                 LLVMValueRef a,
+                 LLVMValueRef b)
+{
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
+   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
+   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
  
-         for(i = 0; i < type.length; ++i) {
-            LLVMValueRef index = lp_build_const_int32(gallivm, i);
-            cond = LLVMBuildICmp(builder, op,
-                                 LLVMBuildExtractElement(builder, a, index, ""),
-                                 LLVMBuildExtractElement(builder, b, index, ""),
-                                 "");
-            cond = LLVMBuildSelect(builder, cond,
-                                   LLVMConstExtractElement(ones, index),
-                                   LLVMConstExtractElement(zeros, index),
-                                   "");
-            res = LLVMBuildInsertElement(builder, res, cond, index, "");
-         }
-      }
-#endif
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
+   if(func == PIPE_FUNC_NEVER)
+      return zeros;
+   if(func == PIPE_FUNC_ALWAYS)
+      return ones;
+
+   assert(func > PIPE_FUNC_NEVER);
+   assert(func < PIPE_FUNC_ALWAYS);
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   /*
+    * There are no unsigned integer comparison instructions in SSE.
+    */
+
+   if (!type.floating && !type.sign &&
+       type.width * type.length == 128 &&
+       util_cpu_caps.has_sse2 &&
+       (func == PIPE_FUNC_LESS ||
+        func == PIPE_FUNC_LEQUAL ||
+        func == PIPE_FUNC_GREATER ||
+        func == PIPE_FUNC_GEQUAL) &&
+       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+         debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
+                      __FUNCTION__, type.length, type.width);
     }
+#endif
  
-   return res;
+   return lp_build_compare_ext(gallivm, type, func, a, b, FALSE);
  }
  
-
+/**
+ * Build code to compare two values 'a' and 'b' using the given func.
+ * \param func  one of PIPE_FUNC_x
+ * If the operands are floating point numbers, the function will use
+ * ordered comparison, which means that it will return true only if neither
+ * operand is a NaN and the specified condition evaluates to true.
+ * The result values will be 0 for false or ~0 for true.
+ */
+LLVMValueRef
+lp_build_cmp_ordered(struct lp_build_context *bld,
+                     unsigned func,
+                     LLVMValueRef a,
+                     LLVMValueRef b)
+{
+   return lp_build_compare_ext(bld->gallivm, bld->type, func, a, b, TRUE);
+}
  
  /**
   * Build code to compare two values 'a' and 'b' using the given func.
   * \param func  one of PIPE_FUNC_x
+ * If the operands are floating point numbers, the function will use
+ * unordered comparison, which means that it will return true if either
+ * operand is a NaN or the specified condition evaluates to true.
   * The result values will be 0 for false or ~0 for true.
   */
  LLVMValueRef
@@ -397,6 +257,7 @@ lp_build_select_bitwise(struct lp_build_context *bld,
     LLVMBuilderRef builder = bld->gallivm->builder;
     struct lp_type type = bld->type;
     LLVMValueRef res;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
  
     assert(lp_check_value(type, a));
     assert(lp_check_value(type, b));
@@ -406,11 +267,12 @@ lp_build_select_bitwise(struct lp_build_context *bld,
     }
  
     if(type.floating) {
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
        a = LLVMBuildBitCast(builder, a, int_vec_type, "");
        b = LLVMBuildBitCast(builder, b, int_vec_type, "");
     }
  
+   if (type.width > 32)
+      mask = LLVMBuildSExt(builder, mask, int_vec_type, "");
     a = LLVMBuildAnd(builder, a, mask, "");
  
     /* This often gets translated to PANDN, but sometimes the NOT is
@@ -458,37 +320,40 @@ lp_build_select(struct lp_build_context *bld,
        mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), "");
        res = LLVMBuildSelect(builder, mask, a, b, "");
     }
-   else if (0) {
+   else if (LLVMIsConstant(mask) ||
+            LLVMGetInstructionOpcode(mask) == LLVMSExt) {
        /* Generate a vector select.
         *
-       * XXX: Using vector selects would avoid emitting intrinsics, but they aren't
-       * properly supported yet.
-       *
-       * LLVM 3.0 includes experimental support provided the -promote-elements
-       * options is passed to LLVM's command line (e.g., via
-       * llvm::cl::ParseCommandLineOptions), but resulting code quality is much
-       * worse, probably because some optimization passes don't know how to
-       * handle vector selects.
-       *
-       * See also:
-       * - http://lists.cs.uiuc.edu/pipermail/llvmdev/2011-October/043659.html
+       * Using vector selects should avoid emitting intrinsics hence avoid
+       * hindering optimization passes, but vector selects were not properly
+       * supported for a long time, and LLVM will generate poor code when
+       * the mask is not the result of a comparison.
+       * XXX: Even if the instruction was an SExt, this may still produce
+       * terrible code. Try piglit stencil-twoside.
         */
  
        /* Convert the mask to a vector of booleans.
-       * XXX: There are two ways to do this. Decide what's best.
+       *
+       * XXX: In x86 the mask is controlled by the MSB, so if we shifted the
+       * mask by `type.width - 1`, LLVM should realize the mask is ready.  Alas
+       * what really happens is that LLVM will emit two shifts back to back.
         */
-      if (1) {
-         LLVMTypeRef bool_vec_type = LLVMVectorType(LLVMInt1TypeInContext(lc), type.length);
-         mask = LLVMBuildTrunc(builder, mask, bool_vec_type, "");
-      } else {
-         mask = LLVMBuildICmp(builder, LLVMIntNE, mask, LLVMConstNull(bld->int_vec_type), "");
+      if (0) {
+         LLVMValueRef shift = LLVMConstInt(bld->int_elem_type, bld->type.width - 1, 0);
+         shift = lp_build_broadcast(bld->gallivm, bld->int_vec_type, shift);
+         mask = LLVMBuildLShr(builder, mask, shift, "");
        }
+      LLVMTypeRef bool_vec_type = LLVMVectorType(LLVMInt1TypeInContext(lc), type.length);
+      mask = LLVMBuildTrunc(builder, mask, bool_vec_type, "");
+
        res = LLVMBuildSelect(builder, mask, a, b, "");
     }
     else if (((util_cpu_caps.has_sse4_1 &&
                type.width * type.length == 128) ||
               (util_cpu_caps.has_avx &&
-              type.width * type.length == 256 && type.width >= 32)) &&
+              type.width * type.length == 256 && type.width >= 32) ||
+             (util_cpu_caps.has_avx2 &&
+              type.width * type.length == 256)) &&
              !LLVMIsConstant(a) &&
              !LLVMIsConstant(b) &&
              !LLVMIsConstant(mask)) {
@@ -496,6 +361,11 @@ lp_build_select(struct lp_build_context *bld,
        LLVMTypeRef arg_type;
        LLVMValueRef args[3];
  
+      LLVMTypeRef mask_type = LLVMGetElementType(LLVMTypeOf(mask));
+      if (LLVMGetIntTypeWidth(mask_type) != type.width) {
+         LLVMTypeRef int_vec_type = LLVMVectorType(LLVMIntTypeInContext(lc, type.width), type.length);
+         mask = LLVMBuildSExt(builder, mask, int_vec_type, "");
+      }
        /*
         *  There's only float blend in AVX but can just cast i32/i64
         *  to float.
@@ -505,9 +375,13 @@ lp_build_select(struct lp_build_context *bld,
             intrinsic = "llvm.x86.avx.blendv.pd.256";
             arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
           }
-         else {
+         else if (type.width == 32) {
              intrinsic = "llvm.x86.avx.blendv.ps.256";
              arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
+         } else {
+            assert(util_cpu_caps.has_avx2);
+            intrinsic = "llvm.x86.avx2.pblendvb";
+            arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32);
           }
        }
        else if (type.floating &&
@@ -537,7 +411,7 @@ lp_build_select(struct lp_build_context *bld,
        args[2] = mask;
  
        res = lp_build_intrinsic(builder, intrinsic,
-                               arg_type, args, Elements(args));
+                               arg_type, args, ARRAY_SIZE(args), 0);
  
        if (arg_type != bld->vec_type) {
           res = LLVMBuildBitCast(builder, res, bld->vec_type, "");