gallivm: Eliminate 8.8 fixed point intermediates from AoS sampling path.
authorJosé Fonseca <jfonseca@vmware.com>
Sun, 21 Apr 2013 21:23:31 +0000 (22:23 +0100)
committerJosé Fonseca <jfonseca@vmware.com>
Fri, 17 May 2013 19:23:00 +0000 (20:23 +0100)
This change was meant as a stepping stone to use PMADDUBSW SSSE3
instruction, but actually this refactoring by itself yields a 10%
speedup on texture intensive shaders (e.g, Google Earth's ocean water
w/o S3TC on a Ivy Bridge machine), while giving yielding exactly the
same results, whereas PMADDUBSW only gave an extra 5%, at the expense of
2bits of precision in the interpolation.

I belive that the speedup of this change comes from the reduced register
pressure (as 8.8 fixed point intermediates take twice the space of 8bit
unorm).

Also, not dealing with 8.8 simplifies lp_bld_sample_aos.c code
substantially -- it's no longer necessary to have code duplicated for
low and high register halfs.

Note about lp_build_sample_mipmap(): the path for num_quads > 1 is never
executed (as it is faster on AVX to split the 256bit wide texture
computation into two 128bit chunks, in order to leverage integer
opcodes).  This path might be useful in the future, so in order to
verify this change did not break that path I had to apply this change:

  @@ -1662,11 +1662,11 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
         /*
          * we only try 8-wide sampling with soa as it appears to
          * be a loss with aos with AVX (but it should work).
          * (It should be faster if we'd support avx2)
          */
  -      if (num_quads == 1 || !use_aos) {
  +      if (/* num_quads == 1 || ! */ use_aos) {

            if (num_quads > 1) {
               if (mip_filter == PIPE_TEX_MIPFILTER_NONE) {
                  LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
                  /*

and then run texfilt mesademo:

  LP_NATIVE_VECTOR_WIDTH=256 ./texfilt

Ran whole piglit without regressions.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
src/gallium/auxiliary/gallivm/lp_bld_arit.c
src/gallium/auxiliary/gallivm/lp_bld_arit.h
src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
src/gallium/drivers/llvmpipe/lp_bld_blend.c

index 8f8410c015bdaf5821f168c3f0808b68b415d3ee..3291ec40af59021e3f4a6c1d76f5e0a22148a3dd 100644 (file)
@@ -974,7 +974,7 @@ lp_build_lerp_simple(struct lp_build_context *bld,
                      LLVMValueRef x,
                      LLVMValueRef v0,
                      LLVMValueRef v1,
-                     bool normalized)
+                     unsigned flags)
 {
    unsigned half_width = bld->type.width/2;
    LLVMBuilderRef builder = bld->gallivm->builder;
@@ -987,14 +987,17 @@ lp_build_lerp_simple(struct lp_build_context *bld,
 
    delta = lp_build_sub(bld, v1, v0);
 
-   if (normalized) {
+   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
       if (!bld->type.sign) {
-         /*
-          * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
-          * most-significant-bit to the lowest-significant-bit, so that
-          * later we can just divide by 2**n instead of 2**n - 1.
-          */
-         x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
+         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
+            /*
+             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
+             * most-significant-bit to the lowest-significant-bit, so that
+             * later we can just divide by 2**n instead of 2**n - 1.
+             */
+
+            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
+         }
 
          /* (x * delta) >> n */
          res = lp_build_mul(bld, x, delta);
@@ -1005,15 +1008,18 @@ lp_build_lerp_simple(struct lp_build_context *bld,
           * use the 2**n - 1 divison approximation in lp_build_mul_norm
           * instead.
           */
+         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
       }
    } else {
+      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
       res = lp_build_mul(bld, x, delta);
    }
 
    res = lp_build_add(bld, v0, res);
 
-   if ((normalized && !bld->type.sign) || bld->type.fixed) {
+   if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
+       bld->type.fixed) {
       /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
       /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
        * but it will be wrong for true fixed point use cases. Basically we need
@@ -1033,7 +1039,8 @@ LLVMValueRef
 lp_build_lerp(struct lp_build_context *bld,
               LLVMValueRef x,
               LLVMValueRef v0,
-              LLVMValueRef v1)
+              LLVMValueRef v1,
+              unsigned flags)
 {
    const struct lp_type type = bld->type;
    LLVMValueRef res;
@@ -1042,6 +1049,8 @@ lp_build_lerp(struct lp_build_context *bld,
    assert(lp_check_value(type, v0));
    assert(lp_check_value(type, v1));
 
+   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
+
    if (type.norm) {
       struct lp_type wide_type;
       struct lp_build_context wide_bld;
@@ -1068,18 +1077,25 @@ lp_build_lerp(struct lp_build_context *bld,
        * Lerp both halves.
        */
 
-      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, TRUE);
-      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, TRUE);
+      flags |= LP_BLD_LERP_WIDE_NORMALIZED;
+
+      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
+      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
 
       res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
    } else {
-      res = lp_build_lerp_simple(bld, x, v0, v1, FALSE);
+      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
    }
 
    return res;
 }
 
 
+/**
+ * Bilinear interpolation.
+ *
+ * Values indices are in v_{yx}.
+ */
 LLVMValueRef
 lp_build_lerp_2d(struct lp_build_context *bld,
                  LLVMValueRef x,
@@ -1087,11 +1103,12 @@ lp_build_lerp_2d(struct lp_build_context *bld,
                  LLVMValueRef v00,
                  LLVMValueRef v01,
                  LLVMValueRef v10,
-                 LLVMValueRef v11)
+                 LLVMValueRef v11,
+                 unsigned flags)
 {
-   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
-   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
-   return lp_build_lerp(bld, y, v0, v1);
+   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
+   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
+   return lp_build_lerp(bld, y, v0, v1, flags);
 }
 
 
@@ -1107,11 +1124,12 @@ lp_build_lerp_3d(struct lp_build_context *bld,
                  LLVMValueRef v100,
                  LLVMValueRef v101,
                  LLVMValueRef v110,
-                 LLVMValueRef v111)
+                 LLVMValueRef v111,
+                 unsigned flags)
 {
-   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011);
-   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111);
-   return lp_build_lerp(bld, z, v0, v1);
+   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
+   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
+   return lp_build_lerp(bld, z, v0, v1, flags);
 }
 
 
index 45886d5fd99e9bb68be7dcf434a39a1201219757..966796c3c4d190608519f8ddda203d85409d6235 100644 (file)
@@ -85,17 +85,27 @@ lp_build_div(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b);
 
+
+/**
+ * Set when the weights for normalized are prescaled, that is, in range
+ * 0..2**n, as opposed to range 0..2**(n-1).
+ */
+#define LP_BLD_LERP_PRESCALED_WEIGHTS (1 << 0)
+
+/**
+ * Used internally when using wide intermediates for normalized lerps.
+ *
+ * Do not use.
+ */
+#define LP_BLD_LERP_WIDE_NORMALIZED (1 << 1)
+
 LLVMValueRef
 lp_build_lerp(struct lp_build_context *bld,
               LLVMValueRef x,
               LLVMValueRef v0,
-              LLVMValueRef v1);
+              LLVMValueRef v1,
+              unsigned flags);
 
-/**
- * Bilinear interpolation.
- *
- * Values indices are in v_{yx}.
- */
 LLVMValueRef
 lp_build_lerp_2d(struct lp_build_context *bld,
                  LLVMValueRef x,
@@ -103,7 +113,8 @@ lp_build_lerp_2d(struct lp_build_context *bld,
                  LLVMValueRef v00,
                  LLVMValueRef v01,
                  LLVMValueRef v10,
-                 LLVMValueRef v11);
+                 LLVMValueRef v11,
+                 unsigned flags);
 
 LLVMValueRef
 lp_build_lerp_3d(struct lp_build_context *bld,
@@ -117,7 +128,8 @@ lp_build_lerp_3d(struct lp_build_context *bld,
                  LLVMValueRef v100,
                  LLVMValueRef v101,
                  LLVMValueRef v110,
-                 LLVMValueRef v111);
+                 LLVMValueRef v111,
+                 unsigned flags);
 
 
 LLVMValueRef
index 9eaca029fdadf185408b2f65fbcd3e5b0d66f776..c31b05d7022d7638ecf924a7aac16e4f1987cba1 100644 (file)
@@ -496,8 +496,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
                                     LLVMValueRef offset,
                                     LLVMValueRef x_subcoord,
                                     LLVMValueRef y_subcoord,
-                                    LLVMValueRef *colors_lo,
-                                    LLVMValueRef *colors_hi)
+                                    LLVMValueRef *colors)
 {
    /*
     * Fetch the pixels as 4 x 32bit (rgba order might differ):
@@ -517,10 +516,9 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
     */
    LLVMBuilderRef builder = bld->gallivm->builder;
    LLVMValueRef rgba8;
-   struct lp_build_context h16, u8n;
+   struct lp_build_context u8n;
    LLVMTypeRef u8n_vec_type;
 
-   lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
    lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
    u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
 
@@ -546,10 +544,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
                                       y_subcoord);
    }
 
-   /* Expand one 4*rgba8 to two 2*rgba16 */
-   lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
-                    rgba8,
-                    colors_lo, colors_hi);
+   *colors = rgba8;
 }
 
 
@@ -569,8 +564,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                               LLVMValueRef t,
                               LLVMValueRef r,
                               const LLVMValueRef *offsets,
-                              LLVMValueRef *colors_lo,
-                              LLVMValueRef *colors_hi)
+                              LLVMValueRef *colors)
 {
    const unsigned dims = bld->dims;
    LLVMBuilderRef builder = bld->gallivm->builder;
@@ -694,7 +688,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
 
    lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
                                        x_subcoord, y_subcoord,
-                                       colors_lo, colors_hi);
+                                       colors);
 }
 
 
@@ -716,8 +710,7 @@ lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
                                      LLVMValueRef t,
                                      LLVMValueRef r,
                                      const LLVMValueRef *offsets,
-                                     LLVMValueRef *colors_lo,
-                                     LLVMValueRef *colors_hi)
+                                     LLVMValueRef *colors)
    {
    const unsigned dims = bld->dims;
    LLVMValueRef width_vec, height_vec, depth_vec;
@@ -787,7 +780,7 @@ lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
 
    lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
                                        x_subcoord, y_subcoord,
-                                       colors_lo, colors_hi);
+                                       colors);
 }
 
 
@@ -804,29 +797,21 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
                                    LLVMValueRef s_fpart,
                                    LLVMValueRef t_fpart,
                                    LLVMValueRef r_fpart,
-                                   LLVMValueRef *colors_lo,
-                                   LLVMValueRef *colors_hi)
+                                   LLVMValueRef *colors)
 {
    const unsigned dims = bld->dims;
    LLVMBuilderRef builder = bld->gallivm->builder;
-   struct lp_build_context h16, u8n;
-   LLVMTypeRef h16_vec_type, u8n_vec_type;
+   struct lp_build_context u8n;
+   LLVMTypeRef u8n_vec_type;
    LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
-   LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
-   LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
-   LLVMValueRef shuffle_lo, shuffle_hi;
-   LLVMValueRef s_fpart_lo, s_fpart_hi;
-   LLVMValueRef t_fpart_lo = NULL, t_fpart_hi = NULL;
-   LLVMValueRef r_fpart_lo = NULL, r_fpart_hi = NULL;
-   LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
-   LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
-   LLVMValueRef packed_lo, packed_hi;
+   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+   LLVMValueRef shuffle;
+   LLVMValueRef neighbors[2][2][2]; /* [z][y][x] */
+   LLVMValueRef packed;
    unsigned i, j, k;
    unsigned numj, numk;
 
-   lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
    lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
-   h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
    u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
 
    /*
@@ -834,59 +819,45 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
     *
     *   s_fpart = {s0, s1, s2, s3}
     *
-    * into 8 x i16
-    *
-    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
+    * where each value is between 0 and 0xff,
     *
-    * into two 8 x i16
+    * into one 16 x i20
     *
-    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
-    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
+    *   s_fpart = {s0, s0, s0, s0, s1, s1, s1, s1, s2, s2, s2, s2, s3, s3, s3, s3}
     *
     * and likewise for t_fpart. There is no risk of loosing precision here
     * since the fractional parts only use the lower 8bits.
     */
-   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
+   s_fpart = LLVMBuildBitCast(builder, s_fpart, u8n_vec_type, "");
    if (dims >= 2)
-      t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
+      t_fpart = LLVMBuildBitCast(builder, t_fpart, u8n_vec_type, "");
    if (dims >= 3)
-      r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
+      r_fpart = LLVMBuildBitCast(builder, r_fpart, u8n_vec_type, "");
 
-   for (j = 0; j < h16.type.length; j += 4) {
+   for (j = 0; j < u8n.type.length; j += 4) {
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
       unsigned subindex = 0;
 #else
-      unsigned subindex = 1;
+      unsigned subindex = 3;
 #endif
       LLVMValueRef index;
 
-      index = LLVMConstInt(elem_type, j/2 + subindex, 0);
-      for (i = 0; i < 4; ++i)
-         shuffles_lo[j + i] = index;
-
-      index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
+      index = LLVMConstInt(elem_type, j + subindex, 0);
       for (i = 0; i < 4; ++i)
-         shuffles_hi[j + i] = index;
+         shuffles[j + i] = index;
    }
 
-   shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
-   shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+   shuffle = LLVMConstVector(shuffles, u8n.type.length);
 
-   s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
-                                       shuffle_lo, "");
-   s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
-                                       shuffle_hi, "");
+   s_fpart = LLVMBuildShuffleVector(builder, s_fpart, u8n.undef,
+                                    shuffle, "");
    if (dims >= 2) {
-      t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
-                                          shuffle_lo, "");
-      t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
-                                          shuffle_hi, "");
+      t_fpart = LLVMBuildShuffleVector(builder, t_fpart, u8n.undef,
+                                       shuffle, "");
    }
    if (dims >= 3) {
-      r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
-                                          shuffle_lo, "");
-      r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
-                                          shuffle_hi, "");
+      r_fpart = LLVMBuildShuffleVector(builder, r_fpart, u8n.undef,
+                                       shuffle, "");
    }
 
    /*
@@ -935,10 +906,7 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
                                                y_subcoord[j]);
             }
 
-            /* Expand one 4*rgba8 to two 2*rgba16 */
-            lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
-                             rgba8,
-                             &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
+            neighbors[k][j][i] = rgba8;
          }
       }
    }
@@ -948,84 +916,55 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
     */
    if (bld->static_sampler_state->force_nearest_s) {
       /* special case 1-D lerp */
-      packed_lo = lp_build_lerp(&h16,
-                                t_fpart_lo,
-                                neighbors_lo[0][0][0],
-                                neighbors_lo[0][0][1]);
-
-      packed_hi = lp_build_lerp(&h16,
-                                t_fpart_hi,
-                                neighbors_hi[0][1][0],
-                                neighbors_hi[0][1][0]);
+      packed = lp_build_lerp(&u8n,
+                             t_fpart,
+                             neighbors[0][0][0],
+                             neighbors[0][0][1],
+                             LP_BLD_LERP_PRESCALED_WEIGHTS);
    }
    else if (bld->static_sampler_state->force_nearest_t) {
       /* special case 1-D lerp */
-      packed_lo = lp_build_lerp(&h16,
-                                s_fpart_lo,
-                                neighbors_lo[0][0][0],
-                                neighbors_lo[0][0][1]);
-
-      packed_hi = lp_build_lerp(&h16,
-                                s_fpart_hi,
-                                neighbors_hi[0][0][0],
-                                neighbors_hi[0][0][1]);
+      packed = lp_build_lerp(&u8n,
+                             s_fpart,
+                             neighbors[0][0][0],
+                             neighbors[0][0][1],
+                             LP_BLD_LERP_PRESCALED_WEIGHTS);
    }
    else {
       /* general 1/2/3-D lerping */
       if (dims == 1) {
-         packed_lo = lp_build_lerp(&h16,
-                                   s_fpart_lo,
-                                   neighbors_lo[0][0][0],
-                                   neighbors_lo[0][0][1]);
-
-         packed_hi = lp_build_lerp(&h16,
-                                   s_fpart_hi,
-                                   neighbors_hi[0][0][0],
-                                   neighbors_hi[0][0][1]);
+         packed = lp_build_lerp(&u8n,
+                                s_fpart,
+                                neighbors[0][0][0],
+                                neighbors[0][0][1],
+                                LP_BLD_LERP_PRESCALED_WEIGHTS);
       } else if (dims == 2) {
          /* 2-D lerp */
-         packed_lo = lp_build_lerp_2d(&h16,
-                                      s_fpart_lo, t_fpart_lo,
-                                      neighbors_lo[0][0][0],
-                                      neighbors_lo[0][0][1],
-                                      neighbors_lo[0][1][0],
-                                      neighbors_lo[0][1][1]);
-
-         packed_hi = lp_build_lerp_2d(&h16,
-                                      s_fpart_hi, t_fpart_hi,
-                                      neighbors_hi[0][0][0],
-                                      neighbors_hi[0][0][1],
-                                      neighbors_hi[0][1][0],
-                                      neighbors_hi[0][1][1]);
+         packed = lp_build_lerp_2d(&u8n,
+                                   s_fpart, t_fpart,
+                                   neighbors[0][0][0],
+                                   neighbors[0][0][1],
+                                   neighbors[0][1][0],
+                                   neighbors[0][1][1],
+                                   LP_BLD_LERP_PRESCALED_WEIGHTS);
       } else {
          /* 3-D lerp */
          assert(dims == 3);
-         packed_lo = lp_build_lerp_3d(&h16,
-                                      s_fpart_lo, t_fpart_lo, r_fpart_lo,
-                                      neighbors_lo[0][0][0],
-                                      neighbors_lo[0][0][1],
-                                      neighbors_lo[0][1][0],
-                                      neighbors_lo[0][1][1],
-                                      neighbors_lo[1][0][0],
-                                      neighbors_lo[1][0][1],
-                                      neighbors_lo[1][1][0],
-                                      neighbors_lo[1][1][1]);
-
-         packed_hi = lp_build_lerp_3d(&h16,
-                                      s_fpart_hi, t_fpart_hi, r_fpart_hi,
-                                      neighbors_hi[0][0][0],
-                                      neighbors_hi[0][0][1],
-                                      neighbors_hi[0][1][0],
-                                      neighbors_hi[0][1][1],
-                                      neighbors_hi[1][0][0],
-                                      neighbors_hi[1][0][1],
-                                      neighbors_hi[1][1][0],
-                                      neighbors_hi[1][1][1]);
+         packed = lp_build_lerp_3d(&u8n,
+                                   s_fpart, t_fpart, r_fpart,
+                                   neighbors[0][0][0],
+                                   neighbors[0][0][1],
+                                   neighbors[0][1][0],
+                                   neighbors[0][1][1],
+                                   neighbors[1][0][0],
+                                   neighbors[1][0][1],
+                                   neighbors[1][1][0],
+                                   neighbors[1][1][1],
+                                   LP_BLD_LERP_PRESCALED_WEIGHTS);
       }
    }
 
-   *colors_lo = packed_lo;
-   *colors_hi = packed_hi;
+   *colors = packed;
 }
 
 /**
@@ -1043,8 +982,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                              LLVMValueRef t,
                              LLVMValueRef r,
                              const LLVMValueRef *offsets,
-                             LLVMValueRef *colors_lo,
-                             LLVMValueRef *colors_hi)
+                             LLVMValueRef *colors)
 {
    const unsigned dims = bld->dims;
    LLVMBuilderRef builder = bld->gallivm->builder;
@@ -1223,7 +1161,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
                                       x_subcoord, y_subcoord,
                                       s_fpart, t_fpart, r_fpart,
-                                      colors_lo, colors_hi);
+                                      colors);
 }
 
 
@@ -1244,8 +1182,7 @@ lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
                                     LLVMValueRef t,
                                     LLVMValueRef r,
                                     const LLVMValueRef *offsets,
-                                    LLVMValueRef *colors_lo,
-                                    LLVMValueRef *colors_hi)
+                                    LLVMValueRef *colors)
 {
    const unsigned dims = bld->dims;
    LLVMValueRef width_vec, height_vec, depth_vec;
@@ -1395,7 +1332,7 @@ lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
    lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
                                       x_subcoord, y_subcoord,
                                       s_fpart, t_fpart, r_fpart,
-                                      colors_lo, colors_hi);
+                                      colors);
 }
 
 
@@ -1416,8 +1353,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                        LLVMValueRef ilevel0,
                        LLVMValueRef ilevel1,
                        LLVMValueRef lod_fpart,
-                       LLVMValueRef colors_lo_var,
-                       LLVMValueRef colors_hi_var)
+                       LLVMValueRef colors_var)
 {
    LLVMBuilderRef builder = bld->gallivm->builder;
    LLVMValueRef size0;
@@ -1430,8 +1366,8 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    LLVMValueRef data_ptr1;
    LLVMValueRef mipoff0 = NULL;
    LLVMValueRef mipoff1 = NULL;
-   LLVMValueRef colors0_lo, colors0_hi;
-   LLVMValueRef colors1_lo, colors1_hi;
+   LLVMValueRef colors0;
+   LLVMValueRef colors1;
 
    /* sample the first mipmap level */
    lp_build_mipmap_level_sizes(bld, ilevel0,
@@ -1452,7 +1388,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                                               size0,
                                               row_stride0_vec, img_stride0_vec,
                                               data_ptr0, mipoff0, s, t, r, offsets,
-                                              &colors0_lo, &colors0_hi);
+                                              &colors0);
       }
       else {
          assert(img_filter == PIPE_TEX_FILTER_LINEAR);
@@ -1460,7 +1396,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                                              size0,
                                              row_stride0_vec, img_stride0_vec,
                                              data_ptr0, mipoff0, s, t, r, offsets,
-                                             &colors0_lo, &colors0_hi);
+                                             &colors0);
       }
    }
    else {
@@ -1469,7 +1405,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                                        size0,
                                        row_stride0_vec, img_stride0_vec,
                                        data_ptr0, mipoff0, s, t, r, offsets,
-                                       &colors0_lo, &colors0_hi);
+                                       &colors0);
       }
       else {
          assert(img_filter == PIPE_TEX_FILTER_LINEAR);
@@ -1477,13 +1413,12 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                                       size0,
                                       row_stride0_vec, img_stride0_vec,
                                       data_ptr0, mipoff0, s, t, r, offsets,
-                                      &colors0_lo, &colors0_hi);
+                                      &colors0);
       }
    }
 
    /* Store the first level's colors in the output variables */
-   LLVMBuildStore(builder, colors0_lo, colors_lo_var);
-   LLVMBuildStore(builder, colors0_hi, colors_hi_var);
+   LLVMBuildStore(builder, colors0, colors_var);
 
    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
       LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
@@ -1522,9 +1457,9 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
 
       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
       {
-         struct lp_build_context h16_bld;
+         struct lp_build_context u8n_bld;
 
-         lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
+         lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));
 
          /* sample the second mipmap level */
          lp_build_mipmap_level_sizes(bld, ilevel1,
@@ -1547,14 +1482,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                                                     size1,
                                                     row_stride1_vec, img_stride1_vec,
                                                     data_ptr1, mipoff1, s, t, r, offsets,
-                                                    &colors1_lo, &colors1_hi);
+                                                    &colors1);
             }
             else {
                lp_build_sample_image_linear_afloat(bld,
                                                    size1,
                                                    row_stride1_vec, img_stride1_vec,
                                                    data_ptr1, mipoff1, s, t, r, offsets,
-                                                   &colors1_lo, &colors1_hi);
+                                                   &colors1);
             }
          }
          else {
@@ -1563,73 +1498,55 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                                              size1,
                                              row_stride1_vec, img_stride1_vec,
                                              data_ptr1, mipoff1, s, t, r, offsets,
-                                             &colors1_lo, &colors1_hi);
+                                             &colors1);
             }
             else {
                lp_build_sample_image_linear(bld,
                                             size1,
                                             row_stride1_vec, img_stride1_vec,
                                             data_ptr1, mipoff1, s, t, r, offsets,
-                                            &colors1_lo, &colors1_hi);
+                                            &colors1);
             }
          }
 
          /* interpolate samples from the two mipmap levels */
 
          if (num_quads == 1) {
-            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
-            lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
+            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, u8n_bld.elem_type, "");
+            lod_fpart = lp_build_broadcast_scalar(&u8n_bld, lod_fpart);
 
 #if HAVE_LLVM == 0x208
-            /* This is a work-around for a bug in LLVM 2.8.
+            /* This was a work-around for a bug in LLVM 2.8.
              * Evidently, something goes wrong in the construction of the
              * lod_fpart short[8] vector.  Adding this no-effect shuffle seems
              * to force the vector to be properly constructed.
              * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
              */
-            {
-               LLVMValueRef shuffles[8], shuffle;
-               assert(h16_bld.type.length <= Elements(shuffles));
-               for (i = 0; i < h16_bld.type.length; i++)
-                  shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
-               shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
-               lod_fpart = LLVMBuildShuffleVector(builder,
-                                                  lod_fpart, lod_fpart,
-                                                  shuffle, "");
-            }
+#error Unsupported
 #endif
-
-            colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
-                                       colors0_lo, colors1_lo);
-            colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
-                                       colors0_hi, colors1_hi);
          }
          else {
-            LLVMValueRef lod_parts[LP_MAX_VECTOR_LENGTH/16];
-            struct lp_type perquadi16_type = bld->perquadi_bld.type;
-            perquadi16_type.width /= 2;
-            perquadi16_type.length *= 2;
-            lod_fpart = LLVMBuildBitCast(builder, lod_fpart,
-                                         lp_build_vec_type(bld->gallivm,
-                                                           perquadi16_type), "");
-            /* XXX this only works for exactly 2 quads. More quads need shuffle */
-            assert(num_quads == 2);
-            for (i = 0; i < num_quads; i++) {
-               LLVMValueRef indexi2 = lp_build_const_int32(bld->gallivm, i*2);
-               lod_parts[i] = lp_build_extract_broadcast(bld->gallivm,
-                                                         perquadi16_type,
-                                                         h16_bld.type,
-                                                         lod_fpart,
-                                                         indexi2);
+            const unsigned num_chans_per_quad = 4 * 4;
+            LLVMTypeRef tmp_vec_type = LLVMVectorType(u8n_bld.elem_type, bld->perquadi_bld.type.length);
+            LLVMValueRef shuffle[LP_MAX_VECTOR_LENGTH];
+
+            /* Take the LSB of lod_fpart */
+            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, tmp_vec_type, "");
+
+            /* Broadcast each lod weight into their respective channels */
+            assert(u8n_bld.type.length == num_quads * num_chans_per_quad);
+            for (i = 0; i < u8n_bld.type.length; ++i) {
+               shuffle[i] = lp_build_const_int32(bld->gallivm, i / num_chans_per_quad);
             }
-            colors0_lo = lp_build_lerp(&h16_bld, lod_parts[0],
-                                       colors0_lo, colors1_lo);
-            colors0_hi = lp_build_lerp(&h16_bld, lod_parts[1],
-                                       colors0_hi, colors1_hi);
+            lod_fpart = LLVMBuildShuffleVector(builder, lod_fpart, LLVMGetUndef(tmp_vec_type),
+                                               LLVMConstVector(shuffle, u8n_bld.type.length), "");
          }
 
-         LLVMBuildStore(builder, colors0_lo, colors_lo_var);
-         LLVMBuildStore(builder, colors0_hi, colors_hi_var);
+         colors0 = lp_build_lerp(&u8n_bld, lod_fpart,
+                                 colors0, colors1,
+                                 LP_BLD_LERP_PRESCALED_WEIGHTS);
+
+         LLVMBuildStore(builder, colors0, colors_var);
       }
       lp_build_endif(&if_ctx);
    }
@@ -1661,9 +1578,9 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
    const unsigned min_filter = bld->static_sampler_state->min_img_filter;
    const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
    const unsigned dims = bld->dims;
-   LLVMValueRef packed, packed_lo, packed_hi;
+   LLVMValueRef packed_var, packed;
    LLVMValueRef unswizzled[4];
-   struct lp_build_context h16_bld;
+   struct lp_build_context u8n_bld;
 
    /* we only support the common/simple wrap modes at this time */
    assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_s));
@@ -1673,15 +1590,14 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
       assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_r));
 
 
-   /* make 16-bit fixed-pt builder context */
-   lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
+   /* make 8-bit unorm builder context */
+   lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));
 
    /*
     * Get/interpolate texture colors.
     */
 
-   packed_lo = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_lo");
-   packed_hi = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_hi");
+   packed_var = lp_build_alloca(bld->gallivm, u8n_bld.vec_type, "packed_var");
 
    if (min_filter == mag_filter) {
       /* no need to distinguish between minification and magnification */
@@ -1689,7 +1605,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
                              min_filter, mip_filter,
                              s, t, r, offsets,
                              ilevel0, ilevel1, lod_fpart,
-                             packed_lo, packed_hi);
+                             packed_var);
    }
    else {
       /* Emit conditional to choose min image filter or mag image filter
@@ -1722,7 +1638,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
                                 min_filter, mip_filter,
                                 s, t, r, offsets,
                                 ilevel0, ilevel1, lod_fpart,
-                                packed_lo, packed_hi);
+                                packed_var);
       }
       lp_build_else(&if_ctx);
       {
@@ -1731,19 +1647,12 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
                                 mag_filter, PIPE_TEX_MIPFILTER_NONE,
                                 s, t, r, offsets,
                                 ilevel0, NULL, NULL,
-                                packed_lo, packed_hi);
+                                packed_var);
       }
       lp_build_endif(&if_ctx);
    }
 
-   /*
-    * combine the values stored in 'packed_lo' and 'packed_hi' variables
-    * into 'packed'
-    */
-   packed = lp_build_pack2(bld->gallivm,
-                           h16_bld.type, lp_type_unorm(8, bld->vector_width),
-                           LLVMBuildLoad(builder, packed_lo, ""),
-                           LLVMBuildLoad(builder, packed_hi, ""));
+   packed = LLVMBuildLoad(builder, packed_var, "");
 
    /*
     * Convert to SoA and swizzle.
index beefdaed513fef616e9ab05d364af5a809ba08bc..cc29c5c885facd97bc70b8244892c7892d90f05d 100644 (file)
@@ -822,7 +822,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
       for (chan = 0; chan < 4; chan++) {
          colors_out[chan] = lp_build_lerp(&bld->texel_bld, s_fpart,
                                           neighbors[0][0][chan],
-                                          neighbors[0][1][chan]);
+                                          neighbors[0][1][chan],
+                                          0);
       }
    }
    else {
@@ -848,7 +849,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                                           neighbors[0][0][chan],
                                           neighbors[0][1][chan],
                                           neighbors[1][0][chan],
-                                          neighbors[1][1][chan]);
+                                          neighbors[1][1][chan],
+                                          0);
       }
 
       if (dims == 3) {
@@ -884,14 +886,16 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                                              neighbors1[0][0][chan],
                                              neighbors1[0][1][chan],
                                              neighbors1[1][0][chan],
-                                             neighbors1[1][1][chan]);
+                                             neighbors1[1][1][chan],
+                                             0);
          }
 
          /* Linearly interpolate the two samples from the two 3D slices */
          for (chan = 0; chan < 4; chan++) {
             colors_out[chan] = lp_build_lerp(&bld->texel_bld,
                                              r_fpart,
-                                             colors0[chan], colors1[chan]);
+                                             colors0[chan], colors1[chan],
+                                             0);
          }
       }
       else {
@@ -1038,7 +1042,8 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
 
          for (chan = 0; chan < 4; chan++) {
             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
-                                          colors0[chan], colors1[chan]);
+                                          colors0[chan], colors1[chan],
+                                          0);
             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
          }
       }
index 75309dd5fcd4aea9299d9bc7598219d08a291b4e..1dab28cedfbbc87f18ecb1834c4abcebef3b596f 100644 (file)
@@ -146,9 +146,9 @@ lp_build_blend(struct lp_build_context *bld,
       if (lp_build_blend_factor_complementary(factor_src, factor_dst)) {
          if (func == PIPE_BLEND_ADD) {
             if (factor_src < factor_dst) {
-               return lp_build_lerp(bld, src_factor, dst, src);
+               return lp_build_lerp(bld, src_factor, dst, src, 0);
             } else {
-               return lp_build_lerp(bld, dst_factor, src, dst);
+               return lp_build_lerp(bld, dst_factor, src, dst, 0);
             }
          } else if(bld->type.floating && func == PIPE_BLEND_SUBTRACT) {
             result = lp_build_add(bld, src, dst);