gallivm: enhance special sse2 4x4f and 2x8f -> 1x16ub conversion

author Roland Scheidegger <sroland@vmware.com>

Tue, 4 Jun 2013 22:11:45 +0000 (00:11 +0200)

committer Roland Scheidegger <sroland@vmware.com>

Tue, 4 Jun 2013 22:29:46 +0000 (00:29 +0200)
author Roland Scheidegger <sroland@vmware.com>
Tue, 4 Jun 2013 22:11:45 +0000 (00:11 +0200)
committer Roland Scheidegger <sroland@vmware.com>
Tue, 4 Jun 2013 22:29:46 +0000 (00:29 +0200)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c

index eb2d09638fa0e96bd423bdd3d9f1d93d17a19761..f11361ac06ba85a198d22b2e38d99541ce1b7b37 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -530,24 +530,22 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
         dst_type->width    == 8)
     {
        /* Special case 4x4f --> 1x16ub */
-      if (src_type.length == 4 && util_cpu_caps.has_sse2)
+      if (src_type.length == 4 &&
+          util_cpu_caps.has_sse2)
        {
-         assert((num_srcs % 4) == 0);
-
-         num_dsts = num_srcs / 4;
-         dst_type->length = 16;
+         num_dsts = (num_srcs + 3) / 4;
+         dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
  
           lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
           return num_dsts;
        }
  
        /* Special case 2x8f --> 1x16ub */
-      if (src_type.length == 8 && util_cpu_caps.has_avx)
+      if (src_type.length == 8 &&
+          util_cpu_caps.has_avx)
        {
-         assert((num_srcs % 2) == 0);
-
-         num_dsts = num_srcs / 2;
-         dst_type->length = 16;
+         num_dsts = (num_srcs + 1) / 2;
+         dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;
  
           lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
           return num_dsts;
@@ -602,7 +600,7 @@ lp_build_conv(struct gallivm_state *gallivm,
     num_tmps = num_srcs;
  
  
-   /* Special case 4x4f --> 1x16ub 
+   /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub
      */
     if (src_type.floating == 1 &&
         src_type.fixed    == 0 &&
@@ -616,20 +614,23 @@ lp_build_conv(struct gallivm_state *gallivm,
         dst_type.sign     == 0 &&
         dst_type.norm     == 1 &&
         dst_type.width    == 8 &&
-       dst_type.length   == 16 &&
  
-       4 * num_dsts      == num_srcs &&
+       ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
+        (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
  
         util_cpu_caps.has_sse2)
     {
        struct lp_build_context bld;
-      struct lp_type int16_type = dst_type;
-      struct lp_type int32_type = dst_type;
+      struct lp_type int16_type, int32_type;
+      struct lp_type dst_type_ext = dst_type;
        LLVMValueRef const_255f;
        unsigned i, j;
  
        lp_build_context_init(&bld, gallivm, src_type);
  
+      dst_type_ext.length = 16;
+      int16_type = int32_type = dst_type_ext;
+
        int16_type.width *= 2;
        int16_type.length /= 2;
        int16_type.sign = 1;
@@ -643,21 +644,34 @@ lp_build_conv(struct gallivm_state *gallivm,
        for (i = 0; i < num_dsts; ++i, src += 4) {
           LLVMValueRef lo, hi;
  
-         for (j = 0; j < 4; ++j) {
+         for (j = 0; j < dst_type.length / 4; ++j) {
              tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
              tmp[j] = lp_build_iround(&bld, tmp[j]);
           }
  
+         if (num_srcs == 1) {
+            tmp[1] = tmp[0];
+         }
+
           /* relying on clamping behavior of sse2 intrinsics here */
           lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
-         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
-         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+
+         if (num_srcs < 4) {
+            hi = lo;
+         }
+         else {
+            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
+         }
+         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
+      }
+      if (num_srcs < 4) {
+         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
        }
  
        return; 
     }
  
-   /* Special case 2x8f --> 1x16ub
+   /* Special case 2x8f --> 1x16ub, 1x8f ->1x8ub
      */
     else if (src_type.floating == 1 &&
        src_type.fixed    == 0 &&
@@ -671,20 +685,23 @@ lp_build_conv(struct gallivm_state *gallivm,
        dst_type.sign     == 0 &&
        dst_type.norm     == 1 &&
        dst_type.width    == 8 &&
-      dst_type.length   == 16 &&
  
-      2 * num_dsts      == num_srcs &&
+      ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
+       (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
  
        util_cpu_caps.has_avx) {
  
        struct lp_build_context bld;
-      struct lp_type int16_type = dst_type;
-      struct lp_type int32_type = dst_type;
+      struct lp_type int16_type, int32_type;
+      struct lp_type dst_type_ext = dst_type;
        LLVMValueRef const_255f;
        unsigned i;
  
        lp_build_context_init(&bld, gallivm, src_type);
  
+      dst_type_ext.length = 16;
+      int16_type = int32_type = dst_type_ext;
+
        int16_type.width *= 2;
        int16_type.length /= 2;
        int16_type.sign = 1;
@@ -699,21 +716,30 @@ lp_build_conv(struct gallivm_state *gallivm,
           LLVMValueRef lo, hi, a, b;
  
           a = LLVMBuildFMul(builder, src[0], const_255f, "");
-         b = LLVMBuildFMul(builder, src[1], const_255f, "");
-
           a = lp_build_iround(&bld, a);
-         b = lp_build_iround(&bld, b);
-
           tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
           tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
-         tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
-         tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
-
           /* relying on clamping behavior of sse2 intrinsics here */
           lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
-         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
-         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+
+         if (num_srcs == 1) {
+            hi = lo;
+         }
+         else {
+            b = LLVMBuildFMul(builder, src[1], const_255f, "");
+            b = lp_build_iround(&bld, b);
+            tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
+            tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
+            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
+
+         }
+         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
        }
+
+      if (num_srcs == 1) {
+         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
+      }
+
        return;
     }
  
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c

index 71d45bd5ce7deb51ae577151a690210e0f2d59c3..e2ec6e2bb45b6e33cc5b8d4d2bf33e3cce187e97 100644 (file)
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -377,6 +377,8 @@ const struct lp_type conv_types[] = {
     {  FALSE, FALSE,  TRUE, FALSE,     8,   4 },
     {  FALSE, FALSE, FALSE,  TRUE,     8,   4 },
     {  FALSE, FALSE, FALSE, FALSE,     8,   4 },
+
+   {  FALSE, FALSE,  FALSE,  TRUE,    8,   8 },
  };
author	Roland Scheidegger <sroland@vmware.com>
	Tue, 4 Jun 2013 22:11:45 +0000 (00:11 +0200)
committer	Roland Scheidegger <sroland@vmware.com>
	Tue, 4 Jun 2013 22:29:46 +0000 (00:29 +0200)
src/gallium/auxiliary/gallivm/lp_bld_conv.c		patch \| blob \| history
src/gallium/drivers/llvmpipe/lp_test_conv.c		patch \| blob \| history