i965: Don't call the blitter on addresses it can't handle.
[mesa.git] / src / mesa / drivers / dri / i965 / intel_mipmap_tree.c
index 058cfbe1bcccfe790f965543f38e2dc22ed71ee3..0818226f3c4a2ead5688fcae5c7f98b90a8c3f4a 100644 (file)
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 #include <GL/gl.h>
 
 #include "intel_batchbuffer.h"
 #include "intel_chipset.h"
-#include "intel_context.h"
 #include "intel_mipmap_tree.h"
 #include "intel_regions.h"
 #include "intel_resolve_map.h"
-#include "intel_tex_layout.h"
 #include "intel_tex.h"
 #include "intel_blit.h"
 
-#ifndef I915
 #include "brw_blorp.h"
-#endif
+#include "brw_context.h"
 
 #include "main/enums.h"
 #include "main/formats.h"
 #include "main/glformats.h"
 #include "main/texcompress_etc.h"
 #include "main/teximage.h"
+#include "main/streaming-load-memcpy.h"
 
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
 
@@ -72,10 +70,10 @@ target_to_target(GLenum target)
  * created, based on the chip generation and the surface type.
  */
 static enum intel_msaa_layout
-compute_msaa_layout(struct intel_context *intel, gl_format format, GLenum target)
+compute_msaa_layout(struct brw_context *brw, gl_format format, GLenum target)
 {
    /* Prior to Gen7, all MSAA surfaces used IMS layout. */
-   if (intel->gen < 7)
+   if (brw->gen < 7)
       return INTEL_MSAA_LAYOUT_IMS;
 
    /* In Gen7, IMS layout is only used for depth and stencil buffers. */
@@ -98,26 +96,10 @@ compute_msaa_layout(struct intel_context *intel, gl_format format, GLenum target
        */
       if (_mesa_get_format_datatype(format) == GL_INT) {
          /* TODO: is this workaround needed for future chipsets? */
-         assert(intel->gen == 7);
+         assert(brw->gen == 7);
          return INTEL_MSAA_LAYOUT_UMS;
       } else {
-         /* For now, if we're going to be texturing from this surface,
-          * force UMS, so that the shader doesn't have to do different things
-          * based on whether there's a multisample control surface needing sampled first.
-          * We can't just blindly read the MCS surface in all cases because:
-          *
-          * From the Ivy Bridge PRM, Vol4 Part1 p77 ("MCS Enable"):
-          *
-          *    If this field is disabled and the sampling engine <ld_mcs> message
-          *    is issued on this surface, the MCS surface may be accessed. Software
-          *    must ensure that the surface is defined to avoid GTT errors.
-          */
-         if (target == GL_TEXTURE_2D_MULTISAMPLE ||
-             target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY) {
-            return INTEL_MSAA_LAYOUT_UMS;
-         } else {
-            return INTEL_MSAA_LAYOUT_CMS;
-         }
+         return INTEL_MSAA_LAYOUT_CMS;
       }
    }
 }
@@ -165,7 +147,7 @@ compute_msaa_layout(struct intel_context *intel, gl_format format, GLenum target
  *   by half the block width, and Y coordinates by half the block height.
  */
 void
-intel_get_non_msrt_mcs_alignment(struct intel_context *intel,
+intel_get_non_msrt_mcs_alignment(struct brw_context *brw,
                                  struct intel_mipmap_tree *mt,
                                  unsigned *width_px, unsigned *height)
 {
@@ -200,17 +182,11 @@ intel_get_non_msrt_mcs_alignment(struct intel_context *intel,
  *       64bpp, and 128bpp.
  */
 bool
-intel_is_non_msrt_mcs_buffer_supported(struct intel_context *intel,
+intel_is_non_msrt_mcs_buffer_supported(struct brw_context *brw,
                                        struct intel_mipmap_tree *mt)
 {
-#ifdef I915
-   /* MCS is not supported on the i915 (pre-Gen4) driver */
-   return false;
-#else
-   struct brw_context *brw = brw_context(&intel->ctx);
-
    /* MCS support does not exist prior to Gen7 */
-   if (intel->gen < 7)
+   if (brw->gen < 7 || brw->gen >= 8)
       return false;
 
    /* MCS is only supported for color buffers */
@@ -238,7 +214,6 @@ intel_is_non_msrt_mcs_buffer_supported(struct intel_context *intel,
       return false;
 
    return true;
-#endif
 }
 
 
@@ -248,7 +223,7 @@ intel_is_non_msrt_mcs_buffer_supported(struct intel_context *intel,
  *        \c stencil_mt.
  */
 struct intel_mipmap_tree *
-intel_miptree_create_layout(struct intel_context *intel,
+intel_miptree_create_layout(struct brw_context *brw,
                             GLenum target,
                             gl_format format,
                             GLuint first_level,
@@ -275,9 +250,7 @@ intel_miptree_create_layout(struct intel_context *intel,
    mt->logical_width0 = width0;
    mt->logical_height0 = height0;
    mt->logical_depth0 = depth0;
-#ifndef I915
-   mt->mcs_state = INTEL_MCS_STATE_NONE;
-#endif
+   mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_NO_MCS;
 
    /* The cpp is bytes per (1, blockheight)-sized block for compressed
     * textures.  This is why you'll see divides by blockheight all over
@@ -290,11 +263,11 @@ intel_miptree_create_layout(struct intel_context *intel,
    mt->num_samples = num_samples;
    mt->compressed = _mesa_is_format_compressed(format);
    mt->msaa_layout = INTEL_MSAA_LAYOUT_NONE;
-   mt->refcount = 1; 
+   mt->refcount = 1;
 
    if (num_samples > 1) {
       /* Adjust width/height/depth for MSAA */
-      mt->msaa_layout = compute_msaa_layout(intel, format, mt->target);
+      mt->msaa_layout = compute_msaa_layout(brw, format, mt->target);
       if (mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS) {
          /* In the Sandy Bridge PRM, volume 4, part 1, page 31, it says:
           *
@@ -376,10 +349,9 @@ intel_miptree_create_layout(struct intel_context *intel,
 
    if (!for_bo &&
        _mesa_get_format_base_format(format) == GL_DEPTH_STENCIL &&
-       (intel->must_use_separate_stencil ||
-       (intel->has_separate_stencil &&
-        intel->vtbl.is_hiz_depth_format(intel, format)))) {
-      mt->stencil_mt = intel_miptree_create(intel,
+       (brw->must_use_separate_stencil ||
+       (brw->has_separate_stencil && brw_is_hiz_depth_format(brw, format)))) {
+      mt->stencil_mt = intel_miptree_create(brw,
                                             mt->target,
                                             MESA_FORMAT_S8,
                                             mt->first_level,
@@ -409,18 +381,7 @@ intel_miptree_create_layout(struct intel_context *intel,
       }
    }
 
-   intel_get_texture_alignment_unit(intel, mt->format,
-                                   &mt->align_w, &mt->align_h);
-
-#ifdef I915
-   (void) intel;
-   if (intel->is_945)
-      i945_miptree_layout(mt);
-   else
-      i915_miptree_layout(mt);
-#else
-   brw_miptree_layout(intel, mt);
-#endif
+   brw_miptree_layout(brw, mt);
 
    return mt;
 }
@@ -429,14 +390,13 @@ intel_miptree_create_layout(struct intel_context *intel,
  * \brief Helper function for intel_miptree_create().
  */
 static uint32_t
-intel_miptree_choose_tiling(struct intel_context *intel,
+intel_miptree_choose_tiling(struct brw_context *brw,
                             gl_format format,
                             uint32_t width0,
                             uint32_t num_samples,
                             enum intel_miptree_tiling_mode requested,
                             struct intel_mipmap_tree *mt)
 {
-
    if (format == MESA_FORMAT_S8) {
       /* The stencil buffer is W tiled. However, we request from the kernel a
        * non-tiled buffer because the GTT is incapable of W fencing.
@@ -473,9 +433,8 @@ intel_miptree_choose_tiling(struct intel_context *intel,
    }
 
    GLenum base_format = _mesa_get_format_base_format(format);
-   if (intel->gen >= 4 &&
-       (base_format == GL_DEPTH_COMPONENT ||
-        base_format == GL_DEPTH_STENCIL_EXT))
+   if (base_format == GL_DEPTH_COMPONENT ||
+       base_format == GL_DEPTH_STENCIL_EXT)
       return I915_TILING_Y;
 
    int minimum_pitch = mt->total_width * mt->cpp;
@@ -484,21 +443,48 @@ intel_miptree_choose_tiling(struct intel_context *intel,
    if (minimum_pitch < 64)
       return I915_TILING_NONE;
 
-   if (ALIGN(minimum_pitch, 512) >= 32768) {
+   if (ALIGN(minimum_pitch, 512) >= 32768 ||
+       mt->total_width >= 32768 || mt->total_height >= 32768) {
       perf_debug("%dx%d miptree too large to blit, falling back to untiled",
                  mt->total_width, mt->total_height);
       return I915_TILING_NONE;
    }
 
    /* Pre-gen6 doesn't have BLORP to handle Y-tiling, so use X-tiling. */
-   if (intel->gen < 6)
+   if (brw->gen < 6)
       return I915_TILING_X;
 
+   /* From the Sandybridge PRM, Volume 1, Part 2, page 32:
+    * "NOTE: 128BPE Format Color Buffer ( render target ) MUST be either TileX
+    *  or Linear."
+    * 128 bits per pixel translates to 16 bytes per pixel.  This is necessary
+    * all the way back to 965, but is explicitly permitted on Gen7.
+    */
+   if (brw->gen != 7 && mt->cpp >= 16)
+      return I915_TILING_X;
+
+   /* From the Ivy Bridge PRM, Vol4 Part1 2.12.2.1 (SURFACE_STATE for most
+    * messages), on p64, under the heading "Surface Vertical Alignment":
+    *
+    *     This field must be set to VALIGN_4 for all tiled Y Render Target
+    *     surfaces.
+    *
+    * So if the surface is renderable and uses a vertical alignment of 2,
+    * force it to be X tiled.  This is somewhat conservative (it's possible
+    * that the client won't ever render to this surface), but it's difficult
+    * to know that ahead of time.  And besides, since we use a vertical
+    * alignment of 4 as often as we can, this shouldn't happen very often.
+    */
+   if (brw->gen == 7 && mt->align_h == 2 &&
+       brw->format_supported_as_render_target[format]) {
+      return I915_TILING_X;
+   }
+
    return I915_TILING_Y | I915_TILING_X;
 }
 
 struct intel_mipmap_tree *
-intel_miptree_create(struct intel_context *intel,
+intel_miptree_create(struct brw_context *brw,
                     GLenum target,
                     gl_format format,
                     GLuint first_level,
@@ -515,7 +501,7 @@ intel_miptree_create(struct intel_context *intel,
    gl_format etc_format = MESA_FORMAT_NONE;
    GLuint total_width, total_height;
 
-   if (!intel->is_baytrail) {
+   if (!brw->is_baytrail) {
       switch (format) {
       case MESA_FORMAT_ETC1_RGB8:
          format = MESA_FORMAT_RGBX8888_REV;
@@ -552,7 +538,7 @@ intel_miptree_create(struct intel_context *intel,
 
    etc_format = (format != tex_format) ? tex_format : MESA_FORMAT_NONE;
 
-   mt = intel_miptree_create_layout(intel, target, format,
+   mt = intel_miptree_create_layout(brw, target, format,
                                      first_level, last_level, width0,
                                      height0, depth0,
                                      false, num_samples);
@@ -573,13 +559,13 @@ intel_miptree_create(struct intel_context *intel,
       total_height = ALIGN(total_height, 64);
    }
 
-   uint32_t tiling = intel_miptree_choose_tiling(intel, format, width0,
+   uint32_t tiling = intel_miptree_choose_tiling(brw, format, width0,
                                                  num_samples, requested_tiling,
                                                  mt);
    bool y_or_x = tiling == (I915_TILING_Y | I915_TILING_X);
 
    mt->etc_format = etc_format;
-   mt->region = intel_region_alloc(intel->intelScreen,
+   mt->region = intel_region_alloc(brw->intelScreen,
                                   y_or_x ? I915_TILING_Y : tiling,
                                   mt->cpp,
                                   total_width,
@@ -590,12 +576,12 @@ intel_miptree_create(struct intel_context *intel,
     * BLT engine to support it.  The BLT paths can't currently handle Y-tiling,
     * so we need to fall back to X.
     */
-   if (y_or_x && mt->region->bo->size >= intel->max_gtt_map_object_size) {
+   if (y_or_x && mt->region->bo->size >= brw->max_gtt_map_object_size) {
       perf_debug("%dx%d miptree larger than aperture; falling back to X-tiled\n",
                  mt->total_width, mt->total_height);
       intel_region_release(&mt->region);
 
-      mt->region = intel_region_alloc(intel->intelScreen,
+      mt->region = intel_region_alloc(brw->intelScreen,
                                       I915_TILING_X,
                                       mt->cpp,
                                       total_width,
@@ -610,21 +596,27 @@ intel_miptree_create(struct intel_context *intel,
        return NULL;
    }
 
-#ifndef I915
+
+   if (mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS) {
+      if (!intel_miptree_alloc_mcs(brw, mt, num_samples)) {
+         intel_miptree_release(&mt);
+         return NULL;
+      }
+   }
+
    /* If this miptree is capable of supporting fast color clears, set
-    * mcs_state appropriately to ensure that fast clears will occur.
+    * fast_clear_state appropriately to ensure that fast clears will occur.
     * Allocation of the MCS miptree will be deferred until the first fast
     * clear actually occurs.
     */
-   if (intel_is_non_msrt_mcs_buffer_supported(intel, mt))
-      mt->mcs_state = INTEL_MCS_STATE_RESOLVED;
-#endif
+   if (intel_is_non_msrt_mcs_buffer_supported(brw, mt))
+      mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
 
    return mt;
 }
 
 struct intel_mipmap_tree *
-intel_miptree_create_for_bo(struct intel_context *intel,
+intel_miptree_create_for_bo(struct brw_context *brw,
                             drm_intel_bo *bo,
                             gl_format format,
                             uint32_t offset,
@@ -650,12 +642,14 @@ intel_miptree_create_for_bo(struct intel_context *intel,
     */
    assert(pitch >= 0);
 
-   mt = intel_miptree_create_layout(intel, GL_TEXTURE_2D, format,
+   mt = intel_miptree_create_layout(brw, GL_TEXTURE_2D, format,
                                     0, 0,
                                     width, height, 1,
                                     true, 0 /* num_samples */);
-   if (!mt)
+   if (!mt) {
+      free(region);
       return mt;
+   }
 
    region->cpp = mt->cpp;
    region->width = width;
@@ -681,7 +675,7 @@ intel_miptree_create_for_bo(struct intel_context *intel,
  * singlesample miptree is embedded as a child.
  */
 struct intel_mipmap_tree*
-intel_miptree_create_for_dri2_buffer(struct intel_context *intel,
+intel_miptree_create_for_dri2_buffer(struct brw_context *brw,
                                      unsigned dri_attachment,
                                      gl_format format,
                                      uint32_t num_samples,
@@ -699,7 +693,7 @@ intel_miptree_create_for_dri2_buffer(struct intel_context *intel,
    assert(_mesa_get_format_base_format(format) == GL_RGB ||
           _mesa_get_format_base_format(format) == GL_RGBA);
 
-   singlesample_mt = intel_miptree_create_for_bo(intel,
+   singlesample_mt = intel_miptree_create_for_bo(brw,
                                                  region->bo,
                                                  format,
                                                  0,
@@ -711,15 +705,80 @@ intel_miptree_create_for_dri2_buffer(struct intel_context *intel,
       return NULL;
    singlesample_mt->region->name = region->name;
 
-#ifndef I915
+   /* If this miptree is capable of supporting fast color clears, set
+    * fast_clear_state appropriately to ensure that fast clears will occur.
+    * Allocation of the MCS miptree will be deferred until the first fast
+    * clear actually occurs.
+    */
+   if (intel_is_non_msrt_mcs_buffer_supported(brw, singlesample_mt))
+      singlesample_mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
+
+   if (num_samples == 0)
+      return singlesample_mt;
+
+   multisample_mt = intel_miptree_create_for_renderbuffer(brw,
+                                                          format,
+                                                          region->width,
+                                                          region->height,
+                                                          num_samples);
+   if (!multisample_mt) {
+      intel_miptree_release(&singlesample_mt);
+      return NULL;
+   }
+
+   multisample_mt->singlesample_mt = singlesample_mt;
+   multisample_mt->need_downsample = false;
+
+   if (brw->is_front_buffer_rendering &&
+       (dri_attachment == __DRI_BUFFER_FRONT_LEFT ||
+        dri_attachment == __DRI_BUFFER_FAKE_FRONT_LEFT)) {
+      intel_miptree_upsample(brw, multisample_mt);
+   }
+
+   return multisample_mt;
+}
+
+/**
+ * For a singlesample image buffer, this simply wraps the given region with a miptree.
+ *
+ * For a multisample image buffer, this wraps the given region with
+ * a singlesample miptree, then creates a multisample miptree into which the
+ * singlesample miptree is embedded as a child.
+ */
+struct intel_mipmap_tree*
+intel_miptree_create_for_image_buffer(struct brw_context *intel,
+                                      enum __DRIimageBufferMask buffer_type,
+                                      gl_format format,
+                                      uint32_t num_samples,
+                                      struct intel_region *region)
+{
+   struct intel_mipmap_tree *singlesample_mt = NULL;
+   struct intel_mipmap_tree *multisample_mt = NULL;
+
+   /* Only the front and back buffers, which are color buffers, are allocated
+    * through the image loader.
+    */
+   assert(_mesa_get_format_base_format(format) == GL_RGB ||
+          _mesa_get_format_base_format(format) == GL_RGBA);
+
+   singlesample_mt = intel_miptree_create_for_bo(intel,
+                                                 region->bo,
+                                                 format,
+                                                 0,
+                                                 region->width,
+                                                 region->height,
+                                                 region->pitch,
+                                                 region->tiling);
+   if (!singlesample_mt)
+      return NULL;
+
    /* If this miptree is capable of supporting fast color clears, set
     * mcs_state appropriately to ensure that fast clears will occur.
     * Allocation of the MCS miptree will be deferred until the first fast
     * clear actually occurs.
     */
    if (intel_is_non_msrt_mcs_buffer_supported(intel, singlesample_mt))
-      singlesample_mt->mcs_state = INTEL_MCS_STATE_RESOLVED;
-#endif
+      singlesample_mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
 
    if (num_samples == 0)
       return singlesample_mt;
@@ -737,9 +796,7 @@ intel_miptree_create_for_dri2_buffer(struct intel_context *intel,
    multisample_mt->singlesample_mt = singlesample_mt;
    multisample_mt->need_downsample = false;
 
-   if (intel->is_front_buffer_rendering &&
-       (dri_attachment == __DRI_BUFFER_FRONT_LEFT ||
-        dri_attachment == __DRI_BUFFER_FAKE_FRONT_LEFT)) {
+   if (intel->is_front_buffer_rendering && buffer_type == __DRI_IMAGE_BUFFER_FRONT) {
       intel_miptree_upsample(intel, multisample_mt);
    }
 
@@ -747,7 +804,7 @@ intel_miptree_create_for_dri2_buffer(struct intel_context *intel,
 }
 
 struct intel_mipmap_tree*
-intel_miptree_create_for_renderbuffer(struct intel_context *intel,
+intel_miptree_create_for_renderbuffer(struct brw_context *brw,
                                       gl_format format,
                                       uint32_t width,
                                       uint32_t height,
@@ -757,20 +814,14 @@ intel_miptree_create_for_renderbuffer(struct intel_context *intel,
    uint32_t depth = 1;
    bool ok;
 
-   mt = intel_miptree_create(intel, GL_TEXTURE_2D, format, 0, 0,
+   mt = intel_miptree_create(brw, GL_TEXTURE_2D, format, 0, 0,
                             width, height, depth, true, num_samples,
                              INTEL_MIPTREE_TILING_ANY);
    if (!mt)
       goto fail;
 
-   if (intel->vtbl.is_hiz_depth_format(intel, format)) {
-      ok = intel_miptree_alloc_hiz(intel, mt);
-      if (!ok)
-         goto fail;
-   }
-
-   if (mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS) {
-      ok = intel_miptree_alloc_mcs(intel, mt, num_samples);
+   if (brw_is_hiz_depth_format(brw, format)) {
+      ok = intel_miptree_alloc_hiz(brw, mt);
       if (!ok)
          goto fail;
    }
@@ -815,9 +866,7 @@ intel_miptree_release(struct intel_mipmap_tree **mt)
       intel_region_release(&((*mt)->region));
       intel_miptree_release(&(*mt)->stencil_mt);
       intel_miptree_release(&(*mt)->hiz_mt);
-#ifndef I915
       intel_miptree_release(&(*mt)->mcs_mt);
-#endif
       intel_miptree_release(&(*mt)->singlesample_mt);
       intel_resolve_map_clear(&(*mt)->hiz_map);
 
@@ -999,7 +1048,7 @@ intel_miptree_get_tile_offsets(struct intel_mipmap_tree *mt,
 }
 
 static void
-intel_miptree_copy_slice_sw(struct intel_context *intel,
+intel_miptree_copy_slice_sw(struct brw_context *brw,
                             struct intel_mipmap_tree *dst_mt,
                             struct intel_mipmap_tree *src_mt,
                             int level,
@@ -1011,14 +1060,14 @@ intel_miptree_copy_slice_sw(struct intel_context *intel,
    int src_stride, dst_stride;
    int cpp = dst_mt->cpp;
 
-   intel_miptree_map(intel, src_mt,
+   intel_miptree_map(brw, src_mt,
                      level, slice,
                      0, 0,
                      width, height,
                      GL_MAP_READ_BIT | BRW_MAP_DIRECT_BIT,
                      &src, &src_stride);
 
-   intel_miptree_map(intel, dst_mt,
+   intel_miptree_map(brw, dst_mt,
                      level, slice,
                      0, 0,
                      width, height,
@@ -1045,8 +1094,8 @@ intel_miptree_copy_slice_sw(struct intel_context *intel,
       }
    }
 
-   intel_miptree_unmap(intel, dst_mt, level, slice);
-   intel_miptree_unmap(intel, src_mt, level, slice);
+   intel_miptree_unmap(brw, dst_mt, level, slice);
+   intel_miptree_unmap(brw, src_mt, level, slice);
 
    /* Don't forget to copy the stencil data over, too.  We could have skipped
     * passing BRW_MAP_DIRECT_BIT, but that would have meant intel_miptree_map
@@ -1055,13 +1104,13 @@ intel_miptree_copy_slice_sw(struct intel_context *intel,
     */
    if (dst_mt->stencil_mt) {
       assert(src_mt->stencil_mt);
-      intel_miptree_copy_slice_sw(intel, dst_mt->stencil_mt, src_mt->stencil_mt,
+      intel_miptree_copy_slice_sw(brw, dst_mt->stencil_mt, src_mt->stencil_mt,
                                   level, slice, width, height);
    }
 }
 
 static void
-intel_miptree_copy_slice(struct intel_context *intel,
+intel_miptree_copy_slice(struct brw_context *brw,
                         struct intel_mipmap_tree *dst_mt,
                         struct intel_mipmap_tree *src_mt,
                         int level,
@@ -1092,7 +1141,7 @@ intel_miptree_copy_slice(struct intel_context *intel,
     * stencil's W tiling in the blitter.
     */
    if (src_mt->stencil_mt) {
-      intel_miptree_copy_slice_sw(intel,
+      intel_miptree_copy_slice_sw(brw,
                                   dst_mt, src_mt,
                                   level, slice,
                                   width, height);
@@ -1110,14 +1159,14 @@ intel_miptree_copy_slice(struct intel_context *intel,
        dst_mt, dst_x, dst_y, dst_mt->region->pitch,
        width, height);
 
-   if (!intel_miptree_blit(intel,
+   if (!intel_miptree_blit(brw,
                            src_mt, level, slice, 0, 0, false,
                            dst_mt, level, slice, 0, 0, false,
                            width, height, GL_COPY)) {
       perf_debug("miptree validate blit for %s failed\n",
                  _mesa_get_format_name(format));
 
-      intel_miptree_copy_slice_sw(intel, dst_mt, src_mt, level, slice,
+      intel_miptree_copy_slice_sw(brw, dst_mt, src_mt, level, slice,
                                   width, height);
    }
 }
@@ -1131,7 +1180,7 @@ intel_miptree_copy_slice(struct intel_context *intel,
  * is set to true if we're about to clear the image).
  */
 void
-intel_miptree_copy_teximage(struct intel_context *intel,
+intel_miptree_copy_teximage(struct brw_context *brw,
                            struct intel_texture_image *intelImage,
                            struct intel_mipmap_tree *dst_mt,
                             bool invalidate)
@@ -1145,7 +1194,7 @@ intel_miptree_copy_teximage(struct intel_context *intel,
 
    if (!invalidate) {
       for (int slice = 0; slice < depth; slice++) {
-         intel_miptree_copy_slice(intel, dst_mt, src_mt, level, face, slice);
+         intel_miptree_copy_slice(brw, dst_mt, src_mt, level, face, slice);
       }
    }
 
@@ -1154,14 +1203,11 @@ intel_miptree_copy_teximage(struct intel_context *intel,
 }
 
 bool
-intel_miptree_alloc_mcs(struct intel_context *intel,
+intel_miptree_alloc_mcs(struct brw_context *brw,
                         struct intel_mipmap_tree *mt,
                         GLuint num_samples)
 {
-   assert(intel->gen >= 7); /* MCS only used on Gen7+ */
-#ifdef I915
-   return false;
-#else
+   assert(brw->gen >= 7); /* MCS only used on Gen7+ */
    assert(mt->mcs_mt == NULL);
 
    /* Choose the correct format for the MCS buffer.  All that really matters
@@ -1192,8 +1238,7 @@ intel_miptree_alloc_mcs(struct intel_context *intel,
     *
     *     "The MCS surface must be stored as Tile Y."
     */
-   mt->mcs_state = INTEL_MCS_STATE_MSAA;
-   mt->mcs_mt = intel_miptree_create(intel,
+   mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
                                      mt->first_level,
@@ -1215,23 +1260,19 @@ intel_miptree_alloc_mcs(struct intel_context *intel,
     *
     * Note: the clear value for MCS buffers is all 1's, so we memset to 0xff.
     */
-   void *data = intel_miptree_map_raw(intel, mt->mcs_mt);
+   void *data = intel_miptree_map_raw(brw, mt->mcs_mt);
    memset(data, 0xff, mt->mcs_mt->region->bo->size);
-   intel_miptree_unmap_raw(intel, mt->mcs_mt);
+   intel_miptree_unmap_raw(brw, mt->mcs_mt);
+   mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_CLEAR;
 
    return mt->mcs_mt;
-#endif
 }
 
 
 bool
-intel_miptree_alloc_non_msrt_mcs(struct intel_context *intel,
+intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
                                  struct intel_mipmap_tree *mt)
 {
-#ifdef I915
-   assert(!"MCS not supported on i915");
-   return false;
-#else
    assert(mt->mcs_mt == NULL);
 
    /* The format of the MCS buffer is opaque to the driver; all that matters
@@ -1246,7 +1287,7 @@ intel_miptree_alloc_non_msrt_mcs(struct intel_context *intel,
    const gl_format format = MESA_FORMAT_R_UINT32;
    unsigned block_width_px;
    unsigned block_height;
-   intel_get_non_msrt_mcs_alignment(intel, mt, &block_width_px, &block_height);
+   intel_get_non_msrt_mcs_alignment(brw, mt, &block_width_px, &block_height);
    unsigned width_divisor = block_width_px * 4;
    unsigned height_divisor = block_height * 8;
    unsigned mcs_width =
@@ -1254,7 +1295,7 @@ intel_miptree_alloc_non_msrt_mcs(struct intel_context *intel,
    unsigned mcs_height =
       ALIGN(mt->logical_height0, height_divisor) / height_divisor;
    assert(mt->logical_depth0 == 1);
-   mt->mcs_mt = intel_miptree_create(intel,
+   mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
                                      mt->first_level,
@@ -1267,7 +1308,6 @@ intel_miptree_alloc_non_msrt_mcs(struct intel_context *intel,
                                      INTEL_MIPTREE_TILING_Y);
 
    return mt->mcs_mt;
-#endif
 }
 
 
@@ -1277,37 +1317,23 @@ intel_miptree_alloc_non_msrt_mcs(struct intel_context *intel,
  * \c has_hiz was set.
  */
 static bool
-intel_miptree_slice_enable_hiz(struct intel_context *intel,
+intel_miptree_slice_enable_hiz(struct brw_context *brw,
                                struct intel_mipmap_tree *mt,
                                uint32_t level,
                                uint32_t layer)
 {
    assert(mt->hiz_mt);
 
-   if (intel->is_haswell) {
-      /* Disable HiZ for some slices to work around a hardware bug.
-       *
-       * Haswell hardware fails to respect
-       * 3DSTATE_DEPTH_BUFFER.Depth_Coordinate_Offset_X/Y when during HiZ
-       * ambiguate operations.  The failure is inconsistent and affected by
-       * other GPU contexts. Running a heavy GPU workload in a separate
-       * process causes the failure rate to drop to nearly 0.
-       *
-       * To workaround the bug, we enable HiZ only when we can guarantee that
-       * the Depth Coordinate Offset fields will be set to 0. The function
-       * brw_get_depthstencil_tile_masks() is used to calculate the fields,
-       * and the function is sometimes called in such a way that the presence
-       * of an attached stencil buffer changes the fuction's return value.
-       *
-       * The largest tile size considered by brw_get_depthstencil_tile_masks()
-       * is that of the stencil buffer. Therefore, if this hiz slice's
-       * corresponding depth slice has an offset that is aligned to the
-       * stencil buffer tile size, 64x64 pixels, then
-       * 3DSTATE_DEPTH_BUFFER.Depth_Coordinate_Offset_X/Y is set to 0.
+   if (brw->is_haswell) {
+      const struct intel_mipmap_level *l = &mt->level[level];
+
+      /* Disable HiZ for LOD > 0 unless the width is 8 aligned
+       * and the height is 4 aligned. This allows our HiZ support
+       * to fulfill Haswell restrictions for HiZ ops. For LOD == 0,
+       * we can grow the width & height to allow the HiZ op to
+       * force the proper size alignments.
        */
-      uint32_t depth_x_offset = mt->level[level].slice[layer].x_offset;
-      uint32_t depth_y_offset = mt->level[level].slice[layer].y_offset;
-      if ((depth_x_offset & 63) || (depth_y_offset & 63)) {
+      if (level > 0 && ((l->width & 7) || (l->height & 3))) {
          return false;
       }
    }
@@ -1319,11 +1345,11 @@ intel_miptree_slice_enable_hiz(struct intel_context *intel,
 
 
 bool
-intel_miptree_alloc_hiz(struct intel_context *intel,
+intel_miptree_alloc_hiz(struct brw_context *brw,
                        struct intel_mipmap_tree *mt)
 {
    assert(mt->hiz_mt == NULL);
-   mt->hiz_mt = intel_miptree_create(intel,
+   mt->hiz_mt = intel_miptree_create(brw,
                                      mt->target,
                                      mt->format,
                                      mt->first_level,
@@ -1342,7 +1368,7 @@ intel_miptree_alloc_hiz(struct intel_context *intel,
    struct intel_resolve_map *head = &mt->hiz_map;
    for (int level = mt->first_level; level <= mt->last_level; ++level) {
       for (int layer = 0; layer < mt->level[level].depth; ++layer) {
-         if (!intel_miptree_slice_enable_hiz(intel, mt, level, layer))
+         if (!intel_miptree_slice_enable_hiz(brw, mt, level, layer))
             continue;
 
         head->next = malloc(sizeof(*head->next));
@@ -1396,8 +1422,20 @@ intel_miptree_slice_set_needs_depth_resolve(struct intel_mipmap_tree *mt,
                         level, layer, GEN6_HIZ_OP_DEPTH_RESOLVE);
 }
 
+/**
+ * Call intel_miptree_slice_set_needs_depth_resolve() on every layer of
+ * miplevel \c level, i.e. layers 0 .. mt->level[level].depth - 1.
+ */
+void
+intel_miptree_set_all_slices_need_depth_resolve(struct intel_mipmap_tree *mt,
+                                                uint32_t level)
+{
+   /* Read the layer count once, before any slice is touched. */
+   const uint32_t num_layers = mt->level[level].depth;
+
+   for (uint32_t i = 0; i < num_layers; i++)
+      intel_miptree_slice_set_needs_depth_resolve(mt, level, i);
+}
+
 static bool
-intel_miptree_slice_resolve(struct intel_context *intel,
+intel_miptree_slice_resolve(struct brw_context *brw,
                            struct intel_mipmap_tree *mt,
                            uint32_t level,
                            uint32_t layer,
@@ -1411,33 +1449,33 @@ intel_miptree_slice_resolve(struct intel_context *intel,
    if (!item || item->need != need)
       return false;
 
-   intel_hiz_exec(intel, mt, level, layer, need);
+   intel_hiz_exec(brw, mt, level, layer, need);
    intel_resolve_map_remove(item);
    return true;
 }
 
+/**
+ * Run intel_miptree_slice_resolve() on the given level/layer with
+ * GEN6_HIZ_OP_HIZ_RESOLVE as the needed operation and return its result.
+ */
 bool
-intel_miptree_slice_resolve_hiz(struct intel_context *intel,
+intel_miptree_slice_resolve_hiz(struct brw_context *brw,
                                 struct intel_mipmap_tree *mt,
                                 uint32_t level,
                                 uint32_t layer)
 {
-   return intel_miptree_slice_resolve(intel, mt, level, layer,
+   return intel_miptree_slice_resolve(brw, mt, level, layer,
                                       GEN6_HIZ_OP_HIZ_RESOLVE);
 }
 
+/**
+ * Run intel_miptree_slice_resolve() on the given level/layer with
+ * GEN6_HIZ_OP_DEPTH_RESOLVE as the needed operation and return its result.
+ */
 bool
-intel_miptree_slice_resolve_depth(struct intel_context *intel,
+intel_miptree_slice_resolve_depth(struct brw_context *brw,
                                   struct intel_mipmap_tree *mt,
                                   uint32_t level,
                                   uint32_t layer)
 {
-   return intel_miptree_slice_resolve(intel, mt, level, layer,
+   return intel_miptree_slice_resolve(brw, mt, level, layer,
                                       GEN6_HIZ_OP_DEPTH_RESOLVE);
 }
 
 static bool
-intel_miptree_all_slices_resolve(struct intel_context *intel,
+intel_miptree_all_slices_resolve(struct brw_context *brw,
                                 struct intel_mipmap_tree *mt,
                                 enum gen6_hiz_op need)
 {
@@ -1449,7 +1487,7 @@ intel_miptree_all_slices_resolve(struct intel_context *intel,
       if (i->need != need)
         continue;
 
-      intel_hiz_exec(intel, mt, i->level, i->layer, need);
+      intel_hiz_exec(brw, mt, i->level, i->layer, need);
       intel_resolve_map_remove(i);
       did_resolve = true;
    }
@@ -1458,41 +1496,38 @@ intel_miptree_all_slices_resolve(struct intel_context *intel,
 }
 
+/**
+ * Run intel_miptree_all_slices_resolve() on \c mt with
+ * GEN6_HIZ_OP_HIZ_RESOLVE as the needed operation and return its result.
+ */
 bool
-intel_miptree_all_slices_resolve_hiz(struct intel_context *intel,
+intel_miptree_all_slices_resolve_hiz(struct brw_context *brw,
                                      struct intel_mipmap_tree *mt)
 {
-   return intel_miptree_all_slices_resolve(intel, mt,
+   return intel_miptree_all_slices_resolve(brw, mt,
                                            GEN6_HIZ_OP_HIZ_RESOLVE);
 }
 
+/**
+ * Run intel_miptree_all_slices_resolve() on \c mt with
+ * GEN6_HIZ_OP_DEPTH_RESOLVE as the needed operation and return its result.
+ */
 bool
-intel_miptree_all_slices_resolve_depth(struct intel_context *intel,
+intel_miptree_all_slices_resolve_depth(struct brw_context *brw,
                                        struct intel_mipmap_tree *mt)
 {
-   return intel_miptree_all_slices_resolve(intel, mt,
+   return intel_miptree_all_slices_resolve(brw, mt,
                                            GEN6_HIZ_OP_DEPTH_RESOLVE);
 }
 
 
+/**
+ * Resolve a pending fast color clear on \c mt, if there is one.
+ *
+ * Nothing is done when mt->fast_clear_state is NO_MCS or RESOLVED.  In the
+ * UNRESOLVED and CLEAR states brw_blorp_resolve_color() is called, but only
+ * for miptrees whose msaa_layout is INTEL_MSAA_LAYOUT_NONE.
+ */
 void
-intel_miptree_resolve_color(struct intel_context *intel,
+intel_miptree_resolve_color(struct brw_context *brw,
                             struct intel_mipmap_tree *mt)
 {
-#ifdef I915
-   /* Fast color clear is not supported on the i915 (pre-Gen4) driver */
-#else
-   switch (mt->mcs_state) {
-   case INTEL_MCS_STATE_NONE:
-   case INTEL_MCS_STATE_MSAA:
-   case INTEL_MCS_STATE_RESOLVED:
+   switch (mt->fast_clear_state) {
+   case INTEL_FAST_CLEAR_STATE_NO_MCS:
+   case INTEL_FAST_CLEAR_STATE_RESOLVED:
       /* No resolve needed */
       break;
-   case INTEL_MCS_STATE_UNRESOLVED:
-   case INTEL_MCS_STATE_CLEAR:
-      brw_blorp_resolve_color(intel, mt);
+   case INTEL_FAST_CLEAR_STATE_UNRESOLVED:
+   case INTEL_FAST_CLEAR_STATE_CLEAR:
+      /* Fast color clear resolves only make sense for non-MSAA buffers. */
+      if (mt->msaa_layout == INTEL_MSAA_LAYOUT_NONE)
+         brw_blorp_resolve_color(brw, mt);
       break;
    }
-#endif
 }
 
 
@@ -1501,19 +1536,14 @@ intel_miptree_resolve_color(struct intel_context *intel,
  * process or another miptree.
  *
  * Fast color clears are unsafe with shared buffers, so we need to resolve and
- * then discard the MCS buffer, if present.  We also set the mcs_state to
- * INTEL_MCS_STATE_NONE to ensure that no MCS buffer gets allocated in the
- * future.
+ * then discard the MCS buffer, if present.  We also set the fast_clear_state
+ * to INTEL_FAST_CLEAR_STATE_NO_MCS to ensure that no MCS buffer gets
+ * allocated in the future.
  */
 void
-intel_miptree_make_shareable(struct intel_context *intel,
+intel_miptree_make_shareable(struct brw_context *brw,
                              struct intel_mipmap_tree *mt)
 {
-#ifdef I915
-   /* Nothing needs to be done for I915 */
-   (void) intel;
-   (void) mt;
-#else
    /* MCS buffers are also used for multisample buffers, but we can't resolve
     * away a multisample MCS buffer because it's an integral part of how the
     * pixel data is stored.  Fortunately this code path should never be
@@ -1522,11 +1552,10 @@ intel_miptree_make_shareable(struct intel_context *intel,
    assert(mt->msaa_layout == INTEL_MSAA_LAYOUT_NONE);
 
    if (mt->mcs_mt) {
-      intel_miptree_resolve_color(intel, mt);
+      intel_miptree_resolve_color(brw, mt);
       intel_miptree_release(&mt->mcs_mt);
-      mt->mcs_state = INTEL_MCS_STATE_NONE;
+      mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_NO_MCS;
    }
-#endif
 }
 
 
@@ -1587,38 +1616,36 @@ intel_offset_S8(uint32_t stride, uint32_t x, uint32_t y, bool swizzled)
 }
 
+/**
+ * Blit the \c width x \c height rectangle at the origin of level 0, layer 0
+ * of \c src onto the same rectangle of \c dst with brw_blorp_blit_miptrees(),
+ * using GL_NEAREST filtering and no mirroring.
+ *
+ * When \c src has a separate stencil miptree, src->stencil_mt is blitted to
+ * dst->stencil_mt in the same way; dst->stencil_mt is passed on unchecked.
+ */
 static void
-intel_miptree_updownsample(struct intel_context *intel,
+intel_miptree_updownsample(struct brw_context *brw,
                            struct intel_mipmap_tree *src,
                            struct intel_mipmap_tree *dst,
                            unsigned width,
                            unsigned height)
 {
-#ifndef I915
    int src_x0 = 0;
    int src_y0 = 0;
    int dst_x0 = 0;
    int dst_y0 = 0;
 
-   brw_blorp_blit_miptrees(intel,
+   brw_blorp_blit_miptrees(brw,
                            src, 0 /* level */, 0 /* layer */,
                            dst, 0 /* level */, 0 /* layer */,
                            src_x0, src_y0,
                            width, height,
                            dst_x0, dst_y0,
                            width, height,
-                           false, false /*mirror x, y*/);
+                           GL_NEAREST, false, false /*mirror x, y*/);
 
    if (src->stencil_mt) {
-      brw_blorp_blit_miptrees(intel,
+      brw_blorp_blit_miptrees(brw,
                               src->stencil_mt, 0 /* level */, 0 /* layer */,
                               dst->stencil_mt, 0 /* level */, 0 /* layer */,
                               src_x0, src_y0,
                               width, height,
                               dst_x0, dst_y0,
                               width, height,
-                              false, false /*mirror x, y*/);
+                              GL_NEAREST, false, false /*mirror x, y*/);
    }
-#endif /* I915 */
 }
 
 static void
@@ -1635,7 +1662,7 @@ assert_is_flat(struct intel_mipmap_tree *mt)
  * If the miptree needs no downsample, then skip.
  */
 void
-intel_miptree_downsample(struct intel_context *intel,
+intel_miptree_downsample(struct brw_context *brw,
                          struct intel_mipmap_tree *mt)
 {
    /* Only flat, renderbuffer-like miptrees are supported. */
@@ -1643,7 +1670,7 @@ intel_miptree_downsample(struct intel_context *intel,
 
    if (!mt->need_downsample)
       return;
-   intel_miptree_updownsample(intel,
+   intel_miptree_updownsample(brw,
                               mt, mt->singlesample_mt,
                               mt->logical_width0,
                               mt->logical_height0);
@@ -1656,36 +1683,36 @@ intel_miptree_downsample(struct intel_context *intel,
  * The upsample is done unconditionally.
  */
 void
-intel_miptree_upsample(struct intel_context *intel,
+intel_miptree_upsample(struct brw_context *brw,
                        struct intel_mipmap_tree *mt)
 {
    /* Only flat, renderbuffer-like miptrees are supported. */
    assert_is_flat(mt);
    assert(!mt->need_downsample);
 
-   intel_miptree_updownsample(intel,
+   intel_miptree_updownsample(brw,
                               mt->singlesample_mt, mt,
                               mt->logical_width0,
                               mt->logical_height0);
 }
 
 void *
-intel_miptree_map_raw(struct intel_context *intel, struct intel_mipmap_tree *mt)
+intel_miptree_map_raw(struct brw_context *brw, struct intel_mipmap_tree *mt)
 {
    /* CPU accesses to color buffers don't understand fast color clears, so
     * resolve any pending fast color clears before we map.
     */
-   intel_miptree_resolve_color(intel, mt);
+   intel_miptree_resolve_color(brw, mt);
 
    drm_intel_bo *bo = mt->region->bo;
 
    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
       if (drm_intel_bo_busy(bo)) {
-         perf_debug("Mapping a busy BO, causing a stall on the GPU.\n");
+         perf_debug("Mapping a busy miptree, causing a stall on the GPU.\n");
       }
    }
 
-   intel_flush(&intel->ctx);
+   intel_batchbuffer_flush(brw);
 
    if (mt->region->tiling != I915_TILING_NONE)
       drm_intel_gem_bo_map_gtt(bo);
@@ -1696,14 +1723,14 @@ intel_miptree_map_raw(struct intel_context *intel, struct intel_mipmap_tree *mt)
 }
 
+/**
+ * Unmap the miptree's buffer object, mt->region->bo, with
+ * drm_intel_bo_unmap().  The context argument is not used.
+ */
 void
-intel_miptree_unmap_raw(struct intel_context *intel,
+intel_miptree_unmap_raw(struct brw_context *brw,
                         struct intel_mipmap_tree *mt)
 {
    drm_intel_bo_unmap(mt->region->bo);
 }
 
 static void
-intel_miptree_map_gtt(struct intel_context *intel,
+intel_miptree_map_gtt(struct brw_context *brw,
                      struct intel_mipmap_tree *mt,
                      struct intel_miptree_map *map,
                      unsigned int level, unsigned int slice)
@@ -1722,7 +1749,7 @@ intel_miptree_map_gtt(struct intel_context *intel,
    assert(y % bh == 0);
    y /= bh;
 
-   base = intel_miptree_map_raw(intel, mt) + mt->offset;
+   base = intel_miptree_map_raw(brw, mt) + mt->offset;
 
    if (base == NULL)
       map->ptr = NULL;
@@ -1745,22 +1772,22 @@ intel_miptree_map_gtt(struct intel_context *intel,
 }
 
+/**
+ * Undo a GTT mapping by forwarding to intel_miptree_unmap_raw().  The
+ * \c map, \c level and \c slice arguments are not used.
+ */
 static void
-intel_miptree_unmap_gtt(struct intel_context *intel,
+intel_miptree_unmap_gtt(struct brw_context *brw,
                        struct intel_mipmap_tree *mt,
                        struct intel_miptree_map *map,
                        unsigned int level,
                        unsigned int slice)
 {
-   intel_miptree_unmap_raw(intel, mt);
+   intel_miptree_unmap_raw(brw, mt);
 }
 
 static void
-intel_miptree_map_blit(struct intel_context *intel,
+intel_miptree_map_blit(struct brw_context *brw,
                       struct intel_mipmap_tree *mt,
                       struct intel_miptree_map *map,
                       unsigned int level, unsigned int slice)
 {
-   map->mt = intel_miptree_create(intel, GL_TEXTURE_2D, mt->format,
+   map->mt = intel_miptree_create(brw, GL_TEXTURE_2D, mt->format,
                                   0, 0,
                                   map->w, map->h, 1,
                                   false, 0,
@@ -1771,7 +1798,7 @@ intel_miptree_map_blit(struct intel_context *intel,
    }
    map->stride = map->mt->region->pitch;
 
-   if (!intel_miptree_blit(intel,
+   if (!intel_miptree_blit(brw,
                            mt, level, slice,
                            map->x, map->y, false,
                            map->mt, 0, 0,
@@ -1781,8 +1808,7 @@ intel_miptree_map_blit(struct intel_context *intel,
       goto fail;
    }
 
-   intel_batchbuffer_flush(intel);
-   map->ptr = intel_miptree_map_raw(intel, map->mt);
+   map->ptr = intel_miptree_map_raw(brw, map->mt);
 
    DBG("%s: %d,%d %dx%d from mt %p (%s) %d,%d = %p/%d\n", __FUNCTION__,
        map->x, map->y, map->w, map->h,
@@ -1798,18 +1824,18 @@ fail:
 }
 
 static void
-intel_miptree_unmap_blit(struct intel_context *intel,
+intel_miptree_unmap_blit(struct brw_context *brw,
                         struct intel_mipmap_tree *mt,
                         struct intel_miptree_map *map,
                         unsigned int level,
                         unsigned int slice)
 {
-   struct gl_context *ctx = &intel->ctx;
+   struct gl_context *ctx = &brw->ctx;
 
-   intel_miptree_unmap_raw(intel, map->mt);
+   intel_miptree_unmap_raw(brw, map->mt);
 
    if (map->mode & GL_MAP_WRITE_BIT) {
-      bool ok = intel_miptree_blit(intel,
+      bool ok = intel_miptree_blit(brw,
                                    map->mt, 0, 0,
                                    0, 0, false,
                                    mt, level, slice,
@@ -1821,8 +1847,81 @@ intel_miptree_unmap_blit(struct intel_context *intel,
    intel_miptree_release(&map->mt);
 }
 
+#ifdef __SSE4_1__
+/**
+ * "Map" a buffer by copying it to an untiled temporary using MOVNTDQA.
+ *
+ * Only read-only mappings are supported (asserted below).  On success,
+ * map->buffer owns a malloc()ed temporary, map->ptr points at the copied
+ * image inside it and map->stride is the temporary's row stride.  If the
+ * miptree cannot be mapped or the temporary cannot be allocated, map->ptr
+ * and map->buffer are both left NULL.
+ */
 static void
-intel_miptree_map_s8(struct intel_context *intel,
+intel_miptree_map_movntdqa(struct brw_context *brw,
+                           struct intel_mipmap_tree *mt,
+                           struct intel_miptree_map *map,
+                           unsigned int level, unsigned int slice)
+{
+   assert(map->mode & GL_MAP_READ_BIT);
+   assert(!(map->mode & GL_MAP_WRITE_BIT));
+
+   /* Nothing is mapped until both the source and the temporary exist. */
+   map->buffer = NULL;
+   map->ptr = NULL;
+
+   /* Map the original image */
+   uint32_t image_x;
+   uint32_t image_y;
+   intel_miptree_get_image_offset(mt, level, slice, &image_x, &image_y);
+   image_x += map->x;
+   image_y += map->y;
+
+   void *src = intel_miptree_map_raw(brw, mt);
+   if (!src)
+      return;
+   src += image_y * mt->region->pitch;
+   src += image_x * mt->region->cpp;
+
+   /* Due to the pixel offsets for the particular image being mapped, our
+    * src pointer may not be 16-byte aligned.  However, if the pitch is
+    * divisible by 16, then the amount by which it's misaligned will remain
+    * consistent from row to row.
+    */
+   assert((mt->region->pitch % 16) == 0);
+   const int misalignment = ((uintptr_t) src) & 15;
+
+   /* Create an untiled temporary buffer for the mapping. */
+   const unsigned width_bytes = _mesa_format_row_stride(mt->format, map->w);
+
+   map->stride = ALIGN(misalignment + width_bytes, 16);
+
+   /* malloc() is not guaranteed to return 16-byte aligned memory (8-byte
+    * alignment is common on 32-bit systems), so over-allocate by 15 bytes
+    * and round the base up by hand.  map->buffer keeps the pointer that
+    * must eventually be passed to free().
+    */
+   map->buffer = malloc(map->stride * map->h + 15);
+   if (!map->buffer) {
+      /* Out of memory: drop the source mapping and report failure through
+       * the NULL map->ptr instead of copying through a bogus pointer.
+       */
+      intel_miptree_unmap_raw(brw, mt);
+      return;
+   }
+
+   const uintptr_t aligned_base =
+      ((uintptr_t) map->buffer + 15) & ~(uintptr_t) 15;
+
+   /* Offset the destination so it has the same misalignment as src. */
+   map->ptr = (void *) (aligned_base + misalignment);
+
+   assert((((uintptr_t) map->ptr) & 15) == misalignment);
+
+   for (uint32_t y = 0; y < map->h; y++) {
+      void *dst_ptr = map->ptr + y * map->stride;
+      void *src_ptr = src + y * mt->region->pitch;
+
+      _mesa_streaming_load_memcpy(dst_ptr, src_ptr, width_bytes);
+   }
+
+   intel_miptree_unmap_raw(brw, mt);
+
+   /* Logged last so that map->ptr and map->stride hold their final values. */
+   DBG("%s: %d,%d %dx%d from mt %p (%s) %d,%d = %p/%d\n", __FUNCTION__,
+       map->x, map->y, map->w, map->h,
+       mt, _mesa_get_format_name(mt->format),
+       level, slice, map->ptr, map->stride);
+}
+
+/**
+ * Tear down a mapping made by intel_miptree_map_movntdqa(): free the
+ * temporary held in map->buffer and clear both map->buffer and map->ptr.
+ * The \c brw, \c mt, \c level and \c slice arguments are not used.
+ */
+static void
+intel_miptree_unmap_movntdqa(struct brw_context *brw,
+                             struct intel_mipmap_tree *mt,
+                             struct intel_miptree_map *map,
+                             unsigned int level,
+                             unsigned int slice)
+{
+   /* map->ptr pointed into the temporary, so it goes away with it. */
+   map->ptr = NULL;
+
+   free(map->buffer);
+   map->buffer = NULL;
+}
+#endif
+
+static void
+intel_miptree_map_s8(struct brw_context *brw,
                     struct intel_mipmap_tree *mt,
                     struct intel_miptree_map *map,
                     unsigned int level, unsigned int slice)
@@ -1839,7 +1938,7 @@ intel_miptree_map_s8(struct intel_context *intel,
     */
    if (!(map->mode & GL_MAP_INVALIDATE_RANGE_BIT)) {
       uint8_t *untiled_s8_map = map->ptr;
-      uint8_t *tiled_s8_map = intel_miptree_map_raw(intel, mt);
+      uint8_t *tiled_s8_map = intel_miptree_map_raw(brw, mt);
       unsigned int image_x, image_y;
 
       intel_miptree_get_image_offset(mt, level, slice, &image_x, &image_y);
@@ -1849,12 +1948,12 @@ intel_miptree_map_s8(struct intel_context *intel,
            ptrdiff_t offset = intel_offset_S8(mt->region->pitch,
                                               x + image_x + map->x,
                                               y + image_y + map->y,
-                                              intel->has_swizzling);
+                                              brw->has_swizzling);
            untiled_s8_map[y * map->w + x] = tiled_s8_map[offset];
         }
       }
 
-      intel_miptree_unmap_raw(intel, mt);
+      intel_miptree_unmap_raw(brw, mt);
 
       DBG("%s: %d,%d %dx%d from mt %p %d,%d = %p/%d\n", __FUNCTION__,
          map->x, map->y, map->w, map->h,
@@ -1867,7 +1966,7 @@ intel_miptree_map_s8(struct intel_context *intel,
 }
 
 static void
-intel_miptree_unmap_s8(struct intel_context *intel,
+intel_miptree_unmap_s8(struct brw_context *brw,
                       struct intel_mipmap_tree *mt,
                       struct intel_miptree_map *map,
                       unsigned int level,
@@ -1876,7 +1975,7 @@ intel_miptree_unmap_s8(struct intel_context *intel,
    if (map->mode & GL_MAP_WRITE_BIT) {
       unsigned int image_x, image_y;
       uint8_t *untiled_s8_map = map->ptr;
-      uint8_t *tiled_s8_map = intel_miptree_map_raw(intel, mt);
+      uint8_t *tiled_s8_map = intel_miptree_map_raw(brw, mt);
 
       intel_miptree_get_image_offset(mt, level, slice, &image_x, &image_y);
 
@@ -1885,19 +1984,19 @@ intel_miptree_unmap_s8(struct intel_context *intel,
            ptrdiff_t offset = intel_offset_S8(mt->region->pitch,
                                               x + map->x,
                                               y + map->y,
-                                              intel->has_swizzling);
+                                              brw->has_swizzling);
            tiled_s8_map[offset] = untiled_s8_map[y * map->w + x];
         }
       }
 
-      intel_miptree_unmap_raw(intel, mt);
+      intel_miptree_unmap_raw(brw, mt);
    }
 
    free(map->buffer);
 }
 
 static void
-intel_miptree_map_etc(struct intel_context *intel,
+intel_miptree_map_etc(struct brw_context *brw,
                       struct intel_mipmap_tree *mt,
                       struct intel_miptree_map *map,
                       unsigned int level,
@@ -1918,7 +2017,7 @@ intel_miptree_map_etc(struct intel_context *intel,
 }
 
 static void
-intel_miptree_unmap_etc(struct intel_context *intel,
+intel_miptree_unmap_etc(struct brw_context *brw,
                         struct intel_mipmap_tree *mt,
                         struct intel_miptree_map *map,
                         unsigned int level,
@@ -1931,7 +2030,7 @@ intel_miptree_unmap_etc(struct intel_context *intel,
    image_x += map->x;
    image_y += map->y;
 
-   uint8_t *dst = intel_miptree_map_raw(intel, mt)
+   uint8_t *dst = intel_miptree_map_raw(brw, mt)
                 + image_y * mt->region->pitch
                 + image_x * mt->region->cpp;
 
@@ -1944,7 +2043,7 @@ intel_miptree_unmap_etc(struct intel_context *intel,
                                map->ptr, map->stride,
                                map->w, map->h, mt->etc_format);
 
-   intel_miptree_unmap_raw(intel, mt);
+   intel_miptree_unmap_raw(brw, mt);
    free(map->buffer);
 }
 
@@ -1960,7 +2059,7 @@ intel_miptree_unmap_etc(struct intel_context *intel,
  * copying the data between the actual backing store and the temporary.
  */
 static void
-intel_miptree_map_depthstencil(struct intel_context *intel,
+intel_miptree_map_depthstencil(struct brw_context *brw,
                               struct intel_mipmap_tree *mt,
                               struct intel_miptree_map *map,
                               unsigned int level, unsigned int slice)
@@ -1982,8 +2081,8 @@ intel_miptree_map_depthstencil(struct intel_context *intel,
     */
    if (!(map->mode & GL_MAP_INVALIDATE_RANGE_BIT)) {
       uint32_t *packed_map = map->ptr;
-      uint8_t *s_map = intel_miptree_map_raw(intel, s_mt);
-      uint32_t *z_map = intel_miptree_map_raw(intel, z_mt);
+      uint8_t *s_map = intel_miptree_map_raw(brw, s_mt);
+      uint32_t *z_map = intel_miptree_map_raw(brw, z_mt);
       unsigned int s_image_x, s_image_y;
       unsigned int z_image_x, z_image_y;
 
@@ -1998,7 +2097,7 @@ intel_miptree_map_depthstencil(struct intel_context *intel,
            ptrdiff_t s_offset = intel_offset_S8(s_mt->region->pitch,
                                                 map_x + s_image_x,
                                                 map_y + s_image_y,
-                                                intel->has_swizzling);
+                                                brw->has_swizzling);
            ptrdiff_t z_offset = ((map_y + z_image_y) *
                                   (z_mt->region->pitch / 4) +
                                  (map_x + z_image_x));
@@ -2014,8 +2113,8 @@ intel_miptree_map_depthstencil(struct intel_context *intel,
         }
       }
 
-      intel_miptree_unmap_raw(intel, s_mt);
-      intel_miptree_unmap_raw(intel, z_mt);
+      intel_miptree_unmap_raw(brw, s_mt);
+      intel_miptree_unmap_raw(brw, z_mt);
 
       DBG("%s: %d,%d %dx%d from z mt %p %d,%d, s mt %p %d,%d = %p/%d\n",
          __FUNCTION__,
@@ -2031,7 +2130,7 @@ intel_miptree_map_depthstencil(struct intel_context *intel,
 }
 
 static void
-intel_miptree_unmap_depthstencil(struct intel_context *intel,
+intel_miptree_unmap_depthstencil(struct brw_context *brw,
                                 struct intel_mipmap_tree *mt,
                                 struct intel_miptree_map *map,
                                 unsigned int level,
@@ -2043,8 +2142,8 @@ intel_miptree_unmap_depthstencil(struct intel_context *intel,
 
    if (map->mode & GL_MAP_WRITE_BIT) {
       uint32_t *packed_map = map->ptr;
-      uint8_t *s_map = intel_miptree_map_raw(intel, s_mt);
-      uint32_t *z_map = intel_miptree_map_raw(intel, z_mt);
+      uint8_t *s_map = intel_miptree_map_raw(brw, s_mt);
+      uint32_t *z_map = intel_miptree_map_raw(brw, z_mt);
       unsigned int s_image_x, s_image_y;
       unsigned int z_image_x, z_image_y;
 
@@ -2058,7 +2157,7 @@ intel_miptree_unmap_depthstencil(struct intel_context *intel,
            ptrdiff_t s_offset = intel_offset_S8(s_mt->region->pitch,
                                                 x + s_image_x + map->x,
                                                 y + s_image_y + map->y,
-                                                intel->has_swizzling);
+                                                brw->has_swizzling);
            ptrdiff_t z_offset = ((y + z_image_y) *
                                   (z_mt->region->pitch / 4) +
                                  (x + z_image_x));
@@ -2074,8 +2173,8 @@ intel_miptree_unmap_depthstencil(struct intel_context *intel,
         }
       }
 
-      intel_miptree_unmap_raw(intel, s_mt);
-      intel_miptree_unmap_raw(intel, z_mt);
+      intel_miptree_unmap_raw(brw, s_mt);
+      intel_miptree_unmap_raw(brw, z_mt);
 
       DBG("%s: %d,%d %dx%d from z mt %p (%s) %d,%d, s mt %p %d,%d = %p/%d\n",
          __FUNCTION__,
@@ -2135,8 +2234,24 @@ intel_miptree_release_map(struct intel_mipmap_tree *mt,
    *map = NULL;
 }
 
+/**
+ * Report whether the given slice of \c mt stays inside the limits that the
+ * mapping code requires before it uses the blit path.
+ *
+ * \return false if the slice's x or y image offset, or the region's pitch,
+ *         is 32768 or larger; true otherwise.
+ */
+static bool
+can_blit_slice(struct intel_mipmap_tree *mt,
+               unsigned int level, unsigned int slice)
+{
+   uint32_t x_offset, y_offset;
+
+   intel_miptree_get_image_offset(mt, level, slice, &x_offset, &y_offset);
+
+   /* All three quantities must stay below 32k. */
+   return x_offset < 32768 &&
+          y_offset < 32768 &&
+          mt->region->pitch < 32768;
+}
+
 static void
-intel_miptree_map_singlesample(struct intel_context *intel,
+intel_miptree_map_singlesample(struct brw_context *brw,
                                struct intel_mipmap_tree *mt,
                                unsigned int level,
                                unsigned int slice,
@@ -2159,33 +2274,37 @@ intel_miptree_map_singlesample(struct intel_context *intel,
       return;
    }
 
-   intel_miptree_slice_resolve_depth(intel, mt, level, slice);
+   intel_miptree_slice_resolve_depth(brw, mt, level, slice);
    if (map->mode & GL_MAP_WRITE_BIT) {
       intel_miptree_slice_set_needs_hiz_resolve(mt, level, slice);
    }
 
    if (mt->format == MESA_FORMAT_S8) {
-      intel_miptree_map_s8(intel, mt, map, level, slice);
+      intel_miptree_map_s8(brw, mt, map, level, slice);
    } else if (mt->etc_format != MESA_FORMAT_NONE &&
               !(mode & BRW_MAP_DIRECT_BIT)) {
-      intel_miptree_map_etc(intel, mt, map, level, slice);
+      intel_miptree_map_etc(brw, mt, map, level, slice);
    } else if (mt->stencil_mt && !(mode & BRW_MAP_DIRECT_BIT)) {
-      intel_miptree_map_depthstencil(intel, mt, map, level, slice);
+      intel_miptree_map_depthstencil(brw, mt, map, level, slice);
    }
    /* See intel_miptree_blit() for details on the 32k pitch limit. */
-   else if (intel->has_llc &&
+   else if (brw->has_llc &&
             !(mode & GL_MAP_WRITE_BIT) &&
             !mt->compressed &&
             (mt->region->tiling == I915_TILING_X ||
-             (intel->gen >= 6 && mt->region->tiling == I915_TILING_Y)) &&
-            mt->region->pitch < 32768) {
-      intel_miptree_map_blit(intel, mt, map, level, slice);
+             (brw->gen >= 6 && mt->region->tiling == I915_TILING_Y)) &&
+            can_blit_slice(mt, level, slice)) {
+      intel_miptree_map_blit(brw, mt, map, level, slice);
    } else if (mt->region->tiling != I915_TILING_NONE &&
-              mt->region->bo->size >= intel->max_gtt_map_object_size) {
-      assert(mt->region->pitch < 32768);
-      intel_miptree_map_blit(intel, mt, map, level, slice);
+              mt->region->bo->size >= brw->max_gtt_map_object_size) {
+      assert(can_blit_slice(mt, level, slice));
+      intel_miptree_map_blit(brw, mt, map, level, slice);
+#ifdef __SSE4_1__
+   } else if (!(mode & GL_MAP_WRITE_BIT) && !mt->compressed) {
+      intel_miptree_map_movntdqa(brw, mt, map, level, slice);
+#endif
    } else {
-      intel_miptree_map_gtt(intel, mt, map, level, slice);
+      intel_miptree_map_gtt(brw, mt, map, level, slice);
    }
 
    *out_ptr = map->ptr;
@@ -2196,7 +2315,7 @@ intel_miptree_map_singlesample(struct intel_context *intel,
 }
 
 static void
-intel_miptree_unmap_singlesample(struct intel_context *intel,
+intel_miptree_unmap_singlesample(struct brw_context *brw,
                                  struct intel_mipmap_tree *mt,
                                  unsigned int level,
                                  unsigned int slice)
@@ -2212,23 +2331,27 @@ intel_miptree_unmap_singlesample(struct intel_context *intel,
        mt, _mesa_get_format_name(mt->format), level, slice);
 
    if (mt->format == MESA_FORMAT_S8) {
-      intel_miptree_unmap_s8(intel, mt, map, level, slice);
+      intel_miptree_unmap_s8(brw, mt, map, level, slice);
    } else if (mt->etc_format != MESA_FORMAT_NONE &&
               !(map->mode & BRW_MAP_DIRECT_BIT)) {
-      intel_miptree_unmap_etc(intel, mt, map, level, slice);
+      intel_miptree_unmap_etc(brw, mt, map, level, slice);
    } else if (mt->stencil_mt && !(map->mode & BRW_MAP_DIRECT_BIT)) {
-      intel_miptree_unmap_depthstencil(intel, mt, map, level, slice);
+      intel_miptree_unmap_depthstencil(brw, mt, map, level, slice);
    } else if (map->mt) {
-      intel_miptree_unmap_blit(intel, mt, map, level, slice);
+      intel_miptree_unmap_blit(brw, mt, map, level, slice);
+#ifdef __SSE4_1__
+   } else if (map->buffer) {
+      intel_miptree_unmap_movntdqa(brw, mt, map, level, slice);
+#endif
    } else {
-      intel_miptree_unmap_gtt(intel, mt, map, level, slice);
+      intel_miptree_unmap_gtt(brw, mt, map, level, slice);
    }
 
    intel_miptree_release_map(mt, level, slice);
 }
 
 static void
-intel_miptree_map_multisample(struct intel_context *intel,
+intel_miptree_map_multisample(struct brw_context *brw,
                               struct intel_mipmap_tree *mt,
                               unsigned int level,
                               unsigned int slice,
@@ -2240,6 +2363,7 @@ intel_miptree_map_multisample(struct intel_context *intel,
                               void **out_ptr,
                               int *out_stride)
 {
+   struct gl_context *ctx = &brw->ctx;
    struct intel_miptree_map *map;
 
    assert(mt->num_samples > 1);
@@ -2248,7 +2372,7 @@ intel_miptree_map_multisample(struct intel_context *intel,
    if (mt->target != GL_TEXTURE_2D ||
        mt->first_level != 0 ||
        mt->last_level != 0) {
-      _mesa_problem(&intel->ctx, "attempt to map a multisample miptree for "
+      _mesa_problem(ctx, "attempt to map a multisample miptree for "
                     "which (target, first_level, last_level != "
                     "(GL_TEXTURE_2D, 0, 0)");
       goto fail;
@@ -2260,7 +2384,7 @@ intel_miptree_map_multisample(struct intel_context *intel,
 
    if (!mt->singlesample_mt) {
       mt->singlesample_mt =
-         intel_miptree_create_for_renderbuffer(intel,
+         intel_miptree_create_for_renderbuffer(brw,
                                                mt->format,
                                                mt->logical_width0,
                                                mt->logical_height0,
@@ -2272,8 +2396,8 @@ intel_miptree_map_multisample(struct intel_context *intel,
       mt->need_downsample = true;
    }
 
-   intel_miptree_downsample(intel, mt);
-   intel_miptree_map_singlesample(intel, mt->singlesample_mt,
+   intel_miptree_downsample(brw, mt);
+   intel_miptree_map_singlesample(brw, mt->singlesample_mt,
                                   level, slice,
                                   x, y, w, h,
                                   mode,
@@ -2287,7 +2411,7 @@ fail:
 }
 
 static void
-intel_miptree_unmap_multisample(struct intel_context *intel,
+intel_miptree_unmap_multisample(struct brw_context *brw,
                                 struct intel_mipmap_tree *mt,
                                 unsigned int level,
                                 unsigned int slice)
@@ -2299,11 +2423,11 @@ intel_miptree_unmap_multisample(struct intel_context *intel,
    if (!map)
       return;
 
-   intel_miptree_unmap_singlesample(intel, mt->singlesample_mt, level, slice);
+   intel_miptree_unmap_singlesample(brw, mt->singlesample_mt, level, slice);
 
    mt->need_downsample = false;
    if (map->mode & GL_MAP_WRITE_BIT)
-      intel_miptree_upsample(intel, mt);
+      intel_miptree_upsample(brw, mt);
 
    if (map->singlesample_mt_is_tmp)
       intel_miptree_release(&mt->singlesample_mt);
@@ -2312,7 +2436,7 @@ intel_miptree_unmap_multisample(struct intel_context *intel,
 }
 
 void
-intel_miptree_map(struct intel_context *intel,
+intel_miptree_map(struct brw_context *brw,
                  struct intel_mipmap_tree *mt,
                  unsigned int level,
                  unsigned int slice,
@@ -2325,13 +2449,13 @@ intel_miptree_map(struct intel_context *intel,
                  int *out_stride)
 {
    if (mt->num_samples <= 1)
-      intel_miptree_map_singlesample(intel, mt,
+      intel_miptree_map_singlesample(brw, mt,
                                      level, slice,
                                      x, y, w, h,
                                      mode,
                                      out_ptr, out_stride);
    else
-      intel_miptree_map_multisample(intel, mt,
+      intel_miptree_map_multisample(brw, mt,
                                     level, slice,
                                     x, y, w, h,
                                     mode,
@@ -2339,13 +2463,13 @@ intel_miptree_map(struct intel_context *intel,
 }
 
+/**
+ * Unmap one level/slice of \c mt.
+ *
+ * Dispatches on the sample count: miptrees with num_samples <= 1 go to
+ * intel_miptree_unmap_singlesample(), all others to
+ * intel_miptree_unmap_multisample().
+ */
 void
-intel_miptree_unmap(struct intel_context *intel,
+intel_miptree_unmap(struct brw_context *brw,
                     struct intel_mipmap_tree *mt,
                     unsigned int level,
                     unsigned int slice)
 {
    if (mt->num_samples <= 1)
-      intel_miptree_unmap_singlesample(intel, mt, level, slice);
+      intel_miptree_unmap_singlesample(brw, mt, level, slice);
    else
-      intel_miptree_unmap_multisample(intel, mt, level, slice);
+      intel_miptree_unmap_multisample(brw, mt, level, slice);
 }