i965/blorp: Stop doing f2i(i2f(sample_id))

[mesa.git] / src / mesa / drivers / dri / i965 / brw_blorp_blit.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp

index 4bd427f904203f1a85439c2550f4830a0b014355..26b5cbff663259d6ffa79f6e00587fa71a183606 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -416,15 +416,7 @@ blorp_blit_apply_transform(nir_builder *b, nir_ssa_def *src_pos,
     nir_ssa_def *mul = nir_vec2(b, nir_load_var(b, v->u_x_transform.multiplier),
                                    nir_load_var(b, v->u_y_transform.multiplier));
  
-   nir_ssa_def *pos = nir_ffma(b, src_pos, mul, offset);
-
-   if (src_pos->num_components == 3) {
-      /* Leave the sample id alone */
-      pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1),
-                        nir_channel(b, src_pos, 2));
-   }
-
-   return pos;
+   return nir_ffma(b, src_pos, mul, offset);
  }
  
  static inline void
@@ -685,6 +677,422 @@ blorp_nir_retile_w_to_y(nir_builder *b, nir_ssa_def *pos)
     return nir_vec2(b, x_Y, y_Y);
  }
  
+/**
+ * Emit code to compensate for the difference between MSAA and non-MSAA
+ * surfaces by folding the sample index S into the X and Y coordinates:
+ *
+ *   (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
+ *
+ * NONE and UMS return pos unchanged, IMS returns the two-component
+ * (X', Y') with S taken as 0 when pos has no third component, and CMS is
+ * rejected as unreachable.  (See brw_blorp_blit_program).
+ */
+static inline nir_ssa_def *
+blorp_nir_encode_msaa(nir_builder *b, nir_ssa_def *pos,
+                      unsigned num_samples, enum intel_msaa_layout layout)
+{
+   assert(pos->num_components == 2 || pos->num_components == 3);
+
+   switch (layout) {
+   case INTEL_MSAA_LAYOUT_NONE:
+      assert(pos->num_components == 2);
+      return pos;
+   case INTEL_MSAA_LAYOUT_CMS:
+      /* We can't compensate for compressed layout since at this point in the
+       * program we haven't read from the MCS buffer.
+       */
+      unreachable("Bad layout in encode_msaa");
+   case INTEL_MSAA_LAYOUT_UMS:
+      /* No translation needed */
+      return pos;
+   case INTEL_MSAA_LAYOUT_IMS: {
+      nir_ssa_def *x_in = nir_channel(b, pos, 0);
+      nir_ssa_def *y_in = nir_channel(b, pos, 1);
+      nir_ssa_def *s_in = pos->num_components == 2 ? nir_imm_int(b, 0) :
+                                                     nir_channel(b, pos, 2);
+
+      nir_ssa_def *x_out = nir_imm_int(b, 0);
+      nir_ssa_def *y_out = nir_imm_int(b, 0);
+      switch (num_samples) {
+      case 2:
+      case 4:
+         /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0)
+          *   where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
+          *         Y' = Y
+          *
+          * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
+          *   where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
+          *         Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
+          */
+         x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 1);
+         x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
+         x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
+         if (num_samples == 2) {
+            y_out = y_in;
+         } else {
+            y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
+            y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
+            y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
+         }
+         break;
+
+      case 8:
+         /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
+          *   where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
+          *              | (X & 0b1)
+          *         Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
+          */
+         x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);
+         x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);
+         x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
+         x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
+         y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
+         y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
+         y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
+         break;
+
+      default:
+         unreachable("Invalid number of samples for IMS layout");
+      }
+
+      return nir_vec2(b, x_out, y_out);
+   }
+
+   default:
+      unreachable("Invalid MSAA layout");
+   }
+}
+
+/**
+ * Emit code to compensate for the difference between MSAA and non-MSAA
+ * surfaces.
+ *
+ * This code modifies the X and Y coordinates according to the formula:
+ *
+ *   (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
+ *
+ * (See brw_blorp_blit_program).
+ */
+static inline nir_ssa_def *
+blorp_nir_decode_msaa(nir_builder *b, nir_ssa_def *pos,
+                      unsigned num_samples, enum intel_msaa_layout layout)
+{
+   assert(pos->num_components == 2 || pos->num_components == 3);
+
+   switch (layout) {
+   case INTEL_MSAA_LAYOUT_NONE:
+      /* No translation necessary, and S should already be zero. */
+      assert(pos->num_components == 2);
+      return pos;
+   case INTEL_MSAA_LAYOUT_CMS:
+      /* We can't compensate for compressed layout since at this point in the
+       * program we don't have access to the MCS buffer.
+       */
+      unreachable("Bad layout in encode_msaa");
+   case INTEL_MSAA_LAYOUT_UMS:
+      /* No translation necessary. */
+      return pos;
+   case INTEL_MSAA_LAYOUT_IMS: {
+      assert(pos->num_components == 2);
+
+      nir_ssa_def *x_in = nir_channel(b, pos, 0);
+      nir_ssa_def *y_in = nir_channel(b, pos, 1);
+
+      nir_ssa_def *x_out = nir_imm_int(b, 0);
+      nir_ssa_def *y_out = nir_imm_int(b, 0);
+      nir_ssa_def *s_out = nir_imm_int(b, 0);
+      switch (num_samples) {
+      case 2:
+      case 4:
+         /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S)
+          *   where X' = (X & ~0b11) >> 1 | (X & 0b1)
+          *         S = (X & 0b10) >> 1
+          *
+          * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
+          *   where X' = (X & ~0b11) >> 1 | (X & 0b1)
+          *         Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
+          *         S = (Y & 0b10) | (X & 0b10) >> 1
+          */
+         x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffc, -1);
+         x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
+         if (num_samples == 2) {
+            y_out = y_in;
+            s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
+         } else {
+            y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
+            y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
+            s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
+            s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
+         }
+         break;
+
+      case 8:
+         /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
+          *   where X' = (X & ~0b111) >> 2 | (X & 0b1)
+          *         Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
+          *         S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
+          */
+         x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);
+         x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
+         y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
+         y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
+         s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);
+         s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
+         s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
+         break;
+
+      default:
+         unreachable("Invalid number of samples for IMS layout");
+      }
+
+      return nir_vec3(b, x_out, y_out, s_out);
+   }
+
+   default:
+      unreachable("Invalid MSAA layout");
+   }
+}
+
+/**
+ * Count the number of trailing 1 bits in the given value, e.g.
+ * count_trailing_one_bits(0) == 0, count_trailing_one_bits(7) == 3 and
+ * count_trailing_one_bits(11) == 2.
+ * NOTE(review): if every bit of value is set, ~value is 0 and GCC documents
+ * the __builtin_ctz() result as undefined -- confirm no caller passes that.
+ */
+static inline int count_trailing_one_bits(unsigned value)
+{
+#ifdef HAVE___BUILTIN_CTZ
+   return __builtin_ctz(~value);
+#else
+   return _mesa_bitcount(value & ~(value + 1));
+#endif
+}
+
+static nir_ssa_def *
+blorp_nir_manual_blend_average(nir_builder *b, nir_ssa_def *pos,
+                               unsigned tex_samples,
+                               enum intel_msaa_layout tex_layout,
+                               enum brw_reg_type dst_type)
+{
+   /* If non-null, this is the outer-most if statement */
+   nir_if *outer_if = NULL;
+
+   nir_variable *color =
+      nir_local_variable_create(b->impl, glsl_vec4_type(), "color");
+
+   nir_ssa_def *mcs = NULL;
+   if (tex_layout == INTEL_MSAA_LAYOUT_CMS)
+      mcs = blorp_nir_txf_ms_mcs(b, pos);
+
+   /* We add together samples using a binary tree structure, e.g. for 4x MSAA:
+    *
+    *   result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
+    *
+    * This ensures that when all samples have the same value, no numerical
+    * precision is lost, since each addition operation always adds two equal
+    * values, and summing two equal floating point values does not lose
+    * precision.
+    *
+    * We perform this computation by treating the texture_data array as a
+    * stack and performing the following operations:
+    *
+    * - push sample 0 onto stack
+    * - push sample 1 onto stack
+    * - add top two stack entries
+    * - push sample 2 onto stack
+    * - push sample 3 onto stack
+    * - add top two stack entries
+    * - add top two stack entries
+    * - divide top stack entry by 4
+    *
+    * Note that after pushing sample i onto the stack, the number of add
+    * operations we do is equal to the number of trailing 1 bits in i.  This
+    * works provided the total number of samples is a power of two, which it
+    * always is for i965.
+    *
+    * Only float results are handled here: every add asserts that dst_type is
+    * BRW_REGISTER_TYPE_F and the sum is then scaled by 1.0 / tex_samples.
+    */
+   nir_ssa_def *texture_data[4];
+   unsigned stack_depth = 0;
+   for (unsigned i = 0; i < tex_samples; ++i) {
+      assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */
+
+      /* Push sample i onto the stack */
+      assert(stack_depth < ARRAY_SIZE(texture_data));
+
+      nir_ssa_def *ms_pos = nir_vec3(b, nir_channel(b, pos, 0),
+                                        nir_channel(b, pos, 1),
+                                        nir_imm_int(b, i));
+      texture_data[stack_depth++] = blorp_nir_txf_ms(b, ms_pos, mcs, dst_type);
+
+      if (i == 0 && tex_layout == INTEL_MSAA_LAYOUT_CMS) {
+         /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
+          * suggests an optimization:
+          *
+          *     "A simple optimization with probable large return in
+          *     performance is to compare the MCS value to zero (indicating
+          *     all samples are on sample slice 0), and sample only from
+          *     sample slice 0 using ld2dss if MCS is zero."
+          *
+          * Note that in the case where the MCS value is zero, sampling from
+          * sample slice 0 using ld2dss and sampling from sample 0 using
+          * ld2dms are equivalent (since all samples are on sample slice 0).
+          * Since we have already sampled from sample 0, all we need to do is
+          * skip the remaining fetches and averaging if MCS is zero.
+          */
+         nir_ssa_def *mcs_zero =
+            nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0));
+         nir_if *if_stmt = nir_if_create(b->shader);
+         if_stmt->condition = nir_src_for_ssa(mcs_zero);
+         nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
+
+         b->cursor = nir_after_cf_list(&if_stmt->then_list);
+         nir_store_var(b, color, texture_data[0], 0xf);
+
+         b->cursor = nir_after_cf_list(&if_stmt->else_list);
+         outer_if = if_stmt;
+      }
+
+      for (int j = 0; j < count_trailing_one_bits(i); j++) {
+         assert(stack_depth >= 2);
+         --stack_depth;
+
+         assert(dst_type == BRW_REGISTER_TYPE_F);
+         texture_data[stack_depth - 1] =
+            nir_fadd(b, texture_data[stack_depth - 1],
+                        texture_data[stack_depth]);
+      }
+   }
+
+   /* We should have just 1 sample on the stack now. */
+   assert(stack_depth == 1);
+
+   texture_data[0] = nir_fmul(b, texture_data[0],
+                              nir_imm_float(b, 1.0 / tex_samples));
+
+   nir_store_var(b, color, texture_data[0], 0xf);
+
+   if (outer_if)
+      b->cursor = nir_after_cf_node(&outer_if->cf_node);
+
+   return nir_load_var(b, color);
+}
+
+/* Build a two-component 32-bit float immediate holding (x, y). */
+static inline nir_ssa_def *
+nir_imm_vec2(nir_builder *build, float x, float y)
+{
+   nir_const_value v;
+
+   memset(&v, 0, sizeof(v));
+   v.f32[0] = x;
+   v.f32[1] = y;
+   return nir_build_imm(build, 2, 32, v);
+}
+
+static nir_ssa_def *
+blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
+                                unsigned tex_samples,
+                                const brw_blorp_blit_prog_key *key,
+                                struct brw_blorp_blit_vars *v)
+{
+   nir_ssa_def *pos_xy = nir_channels(b, pos, 0x3);
+
+   nir_ssa_def *scale = nir_imm_vec2(b, key->x_scale, key->y_scale);
+
+   /* Translate coordinates to lay out the samples in a rectangular grid
+    * roughly corresponding to sample locations.
+    */
+   pos_xy = nir_fmul(b, pos_xy, scale);
+   /* Adjust coordinates so that integers represent pixel centers rather
+    * than pixel edges.
+    */
+   pos_xy = nir_fadd(b, pos_xy, nir_imm_float(b, -0.5));
+   /* Clamp the X, Y texture coordinates to properly handle the sampling of
+    * texels on texture edges.
+    */
+   pos_xy = nir_fmin(b, nir_fmax(b, pos_xy, nir_imm_float(b, 0.0)),
+                        nir_vec2(b, nir_load_var(b, v->u_rect_grid_x1),
+                                    nir_load_var(b, v->u_rect_grid_y1)));
+
+   /* Store the fractional parts to be used as bilinear interpolation
+    * coefficients.
+    */
+   nir_ssa_def *frac_xy = nir_ffract(b, pos_xy);
+   /* Truncate to whole grid positions, then divide the scale back out */
+   pos_xy = nir_fdiv(b, nir_ftrunc(b, pos_xy), scale);
+
+   nir_ssa_def *tex_data[4];
+   for (unsigned i = 0; i < 4; ++i) {
+      float sample_off_x = (float)(i & 0x1) / key->x_scale;
+      float sample_off_y = (float)((i >> 1) & 0x1) / key->y_scale;
+      nir_ssa_def *sample_off = nir_imm_vec2(b, sample_off_x, sample_off_y);
+
+      nir_ssa_def *sample_coords = nir_fadd(b, pos_xy, sample_off);
+      nir_ssa_def *sample_coords_int = nir_f2i(b, sample_coords);
+
+      /* The MCS value we fetch has to match up with the pixel that we're
+       * sampling from. Since we sample from different pixels in each
+       * iteration of this "for" loop, the call to blorp_nir_txf_ms_mcs()
+       * should be here inside the loop after computing the pixel coordinates.
+       */
+      nir_ssa_def *mcs = NULL;
+      if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
+         mcs = blorp_nir_txf_ms_mcs(b, sample_coords_int);
+
+      /* Compute sample index and map the sample index to a sample number.
+       * Sample index layout shows the numbering of slots in a rectangular
+       * grid of samples within a pixel. Sample number layout shows the
+       * rectangular grid of samples roughly corresponding to the real sample
+       * locations within a pixel.
+       * In case of 4x MSAA, layout of sample indices matches the layout of
+       * sample numbers:
+       *           ---------
+       *           | 0 | 1 |
+       *           ---------
+       *           | 2 | 3 |
+       *           ---------
+       *
+       * In case of 8x MSAA the two layouts don't match.
+       * sample index layout :  ---------    sample number layout :  ---------
+       *                        | 0 | 1 |                            | 5 | 2 |
+       *                        ---------                            ---------
+       *                        | 2 | 3 |                            | 4 | 6 |
+       *                        ---------                            ---------
+       *                        | 4 | 5 |                            | 0 | 3 |
+       *                        ---------                            ---------
+       *                        | 6 | 7 |                            | 7 | 1 |
+       *                        ---------                            ---------
+       *
+       * Fortunately, this can be done fairly easily as:
+       * S' = (0x17306425 >> (S * 4)) & 0xf
+       */
+      nir_ssa_def *frac = nir_ffract(b, sample_coords);
+      nir_ssa_def *sample =
+         nir_fdot2(b, frac, nir_imm_vec2(b, key->x_scale,
+                                            key->x_scale * key->y_scale));
+      sample = nir_f2i(b, sample);
+
+      if (tex_samples == 8) {
+         sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x17306425),
+                                       nir_ishl(b, sample, nir_imm_int(b, 2))),
+                           nir_imm_int(b, 0xf));
+      }
+      nir_ssa_def *pos_ms = nir_vec3(b, nir_channel(b, sample_coords_int, 0),
+                                        nir_channel(b, sample_coords_int, 1),
+                                        sample);
+      tex_data[i] = blorp_nir_txf_ms(b, pos_ms, mcs, key->texture_data_type);
+   }
+
+   nir_ssa_def *frac_x = nir_channel(b, frac_xy, 0);
+   nir_ssa_def *frac_y = nir_channel(b, frac_xy, 1);
+   return nir_flrp(b, nir_flrp(b, tex_data[0], tex_data[1], frac_x),
+                      nir_flrp(b, tex_data[2], tex_data[3], frac_x),
+                      frac_y);
+}
+
  /**
   * Generator for WM programs used in BLORP blits.
   *
@@ -900,12 +1308,14 @@ brw_blorp_build_nir_shader(struct brw_context *brw,
     if (rt_tiled_w != key->dst_tiled_w ||
         key->rt_samples != key->dst_samples ||
         key->rt_layout != key->dst_layout) {
-      if (key->rt_samples != key->dst_samples ||
-          key->rt_layout != key->dst_layout ||
-          key->rt_samples != 0)
-         goto fail;
+      dst_pos = blorp_nir_encode_msaa(&b, dst_pos, key->rt_samples,
+                                      key->rt_layout);
+      /* Now (X, Y, S) = detile(rt_tiling, offset) */
        if (rt_tiled_w != key->dst_tiled_w)
           dst_pos = blorp_nir_retile_y_to_w(&b, dst_pos);
+      /* Now (X, Y, S) = detile(dst_tiling, offset) */
+      dst_pos = blorp_nir_decode_msaa(&b, dst_pos, key->dst_samples,
+                                      key->dst_layout);
     }
  
     /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
@@ -920,49 +1330,89 @@ brw_blorp_build_nir_shader(struct brw_context *brw,
        blorp_nir_discard_if_outside_rect(&b, dst_pos, &v);
  
     src_pos = blorp_blit_apply_transform(&b, nir_i2f(&b, dst_pos), &v);
-
-   if (key->blit_scaled && key->blend) {
-      goto fail;
-   } else if (!key->bilinear_filter) {
-      /* We're going to use a texelFetch, so we need integers */
-      src_pos = nir_f2i(&b, src_pos);
+   if (dst_pos->num_components == 3) {
+      /* The sample coordinate is an integer that we want left alone but
+       * blorp_blit_apply_transform() blindly applies the transform to all
+       * three coordinates.  Grab the original sample index.
+       */
+      src_pos = nir_vec3(&b, nir_channel(&b, src_pos, 0),
+                             nir_channel(&b, src_pos, 1),
+                             nir_channel(&b, dst_pos, 2));
     }
  
+   /* If the source image is not multisampled, then we want to fetch sample
+    * number 0, because that's the only sample there is.
+    */
+   if (key->src_samples == 0)
+      src_pos = nir_channels(&b, src_pos, 0x3);
+
     /* X, Y, and S are now the coordinates of the pixel in the source image
      * that we want to texture from.  Exception: if we are blending, then S is
      * irrelevant, because we are going to fetch all samples.
      */
     if (key->blend && !key->blit_scaled) {
-      goto fail;
-   } else if (key->blend && key->blit_scaled) {
-      goto fail;
-   } else {
-      /* We aren't blending, which means we just want to fetch a single sample
-       * from the source surface.  The address that we want to fetch from is
-       * related to the X, Y and S values according to the formula:
-       *
-       * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
-       *
-       * If the actual tiling and sample count of the source surface are not
-       * the same as the configuration of the texture, then we need to adjust
-       * the coordinates to compensate for the difference.
+      /* Resolves (effectively) use texelFetch, so we need integers and we
+       * don't care about the sample index if we got one.
         */
-      if ((tex_tiled_w != key->src_tiled_w ||
-           key->tex_samples != key->src_samples ||
-           key->tex_layout != key->src_layout) &&
-          !key->bilinear_filter) {
-         if (key->tex_samples != key->src_samples ||
-             key->tex_layout != key->src_layout ||
-             key->tex_samples != 0)
-            goto fail;
+      src_pos = nir_f2i(&b, nir_channels(&b, src_pos, 0x3));
  
-         if (tex_tiled_w != key->src_tiled_w)
-            src_pos = blorp_nir_retile_w_to_y(&b, src_pos);
+      if (brw->gen == 6) {
+         /* Because gen6 only supports 4x interleaved MSAA, we can do all the
+          * blending we need with a single linear-interpolated texture lookup
+          * at the center of the sample. The texture coordinates need to be
+          * odd integers so that they correspond to the center of a 2x2 block
+          * representing the four samples that make up a pixel.  So we need
+          * to multiply our X and Y coordinates each by 2 and then add 1.
+          */
+         src_pos = nir_ishl(&b, src_pos, nir_imm_int(&b, 1));
+         src_pos = nir_iadd(&b, src_pos, nir_imm_int(&b, 1));
+         src_pos = nir_i2f(&b, src_pos);
+         color = blorp_nir_tex(&b, src_pos, key->texture_data_type);
+      } else {
+         /* Gen7+ hardware doesn't automatically blend. */
+         color = blorp_nir_manual_blend_average(&b, src_pos, key->src_samples,
+                                                key->src_layout,
+                                                key->texture_data_type);
        }
-
+   } else if (key->blend && key->blit_scaled) {
+      color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples, key, &v);
+   } else {
        if (key->bilinear_filter) {
           color = blorp_nir_tex(&b, src_pos, key->texture_data_type);
        } else {
+         /* We're going to use texelFetch, so we need integers */
+         if (src_pos->num_components == 2) {
+            src_pos = nir_f2i(&b, src_pos);
+         } else {
+            assert(src_pos->num_components == 3);
+            src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i(&b, src_pos), 0),
+                                   nir_channel(&b, nir_f2i(&b, src_pos), 1),
+                                   nir_channel(&b, src_pos, 2));
+         }
+
+         /* We aren't blending, which means we just want to fetch a single
+          * sample from the source surface.  The address that we want to fetch
+          * from is related to the X, Y and S values according to the formula:
+          *
+          * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
+          *
+          * If the actual tiling and sample count of the source surface are
+          * not the same as the configuration of the texture, then we need to
+          * adjust the coordinates to compensate for the difference.
+          */
+         if (tex_tiled_w != key->src_tiled_w ||
+             key->tex_samples != key->src_samples ||
+             key->tex_layout != key->src_layout) {
+            src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples,
+                                            key->src_layout);
+            /* Now (X, Y, S) = detile(src_tiling, offset) */
+            if (tex_tiled_w != key->src_tiled_w)
+               src_pos = blorp_nir_retile_w_to_y(&b, src_pos);
+            /* Now (X, Y, S) = detile(tex_tiling, offset) */
+            src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples,
+                                            key->tex_layout);
+         }
+
           /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
            *
            * In other words: X, Y, and S now contain values which, when passed to
@@ -984,10 +1434,6 @@ brw_blorp_build_nir_shader(struct brw_context *brw,
     nir_store_var(&b, v.color_out, color, 0xf);
  
     return b.shader;
-
-fail:
-   ralloc_free(b.shader);
-   return NULL;
  }
  
  class brw_blorp_blit_program : public brw_blorp_eu_emitter
@@ -1828,23 +2274,6 @@ brw_blorp_blit_program::clamp_tex_coords(struct brw_reg regX,
  
  
  
-/**
- * Count the number of trailing 1 bits in the given value.  For example:
- *
- * count_trailing_one_bits(0) == 0
- * count_trailing_one_bits(7) == 3
- * count_trailing_one_bits(11) == 2
- */
-static inline int count_trailing_one_bits(unsigned value)
-{
-#ifdef HAVE___BUILTIN_CTZ
-   return __builtin_ctz(~value);
-#else
-   return _mesa_bitcount(value & ~(value + 1));
-#endif
-}
-
-
  void
  brw_blorp_blit_program::manual_blend_average(unsigned num_samples)
  {