[mesa.git] src/mesa/drivers/dri/i965/intel_batchbuffer.c
index c420ab9e96b326872412e0ad29c0cd50f67175de..ed659ed625eb7546ea992df368e1a09edc113d14 100644 (file)
 #include "intel_reg.h"
 #include "intel_bufmgr.h"
 #include "intel_buffers.h"
+#include "intel_fbo.h"
 #include "brw_context.h"
 
+#include <xf86drm.h>
+#include <i915_drm.h>
+
 static void
 intel_batchbuffer_reset(struct brw_context *brw);
 
-struct cached_batch_item {
-   struct cached_batch_item *next;
-   uint16_t header;
-   uint16_t size;
-};
-
-void
-intel_batchbuffer_clear_cache(struct brw_context *brw)
-{
-   struct cached_batch_item *item = brw->batch.cached_items;
-
-   while (item) {
-      struct cached_batch_item *next = item->next;
-      free(item);
-      item = next;
-   }
-
-   brw->batch.cached_items = NULL;
-}
-
 void
 intel_batchbuffer_init(struct brw_context *brw)
 {
@@ -70,8 +54,6 @@ intel_batchbuffer_init(struct brw_context *brw)
                                                      4096, 4096);
    }
 
-   brw->batch.need_workaround_flush = true;
-
    if (!brw->has_llc) {
       brw->batch.cpu_map = malloc(BATCH_SZ);
       brw->batch.map = brw->batch.cpu_map;
@@ -87,7 +69,7 @@ intel_batchbuffer_reset(struct brw_context *brw)
    }
    brw->batch.last_bo = brw->batch.bo;
 
-   intel_batchbuffer_clear_cache(brw);
+   brw_render_cache_set_clear(brw);
 
    brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
                                        BATCH_SZ, 4096);
@@ -100,6 +82,7 @@ intel_batchbuffer_reset(struct brw_context *brw)
    brw->batch.state_batch_offset = brw->batch.bo->size;
    brw->batch.used = 0;
    brw->batch.needs_sol_reset = false;
+   brw->batch.pipe_controls_since_last_cs_stall = 0;
 
    /* We don't know what ring the new batch will be sent to until we see the
     * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
@@ -123,11 +106,6 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw)
    brw->batch.used = brw->batch.saved.used;
    if (brw->batch.used == 0)
       brw->batch.ring = UNKNOWN_RING;
-
-   /* Cached batch state is dead, since we just cleared some unknown part of the
-    * batchbuffer.  Assume that the caller resets any other state necessary.
-    */
-   intel_batchbuffer_clear_cache(brw);
 }
 
 void
@@ -137,7 +115,6 @@ intel_batchbuffer_free(struct brw_context *brw)
    drm_intel_bo_unreference(brw->batch.last_bo);
    drm_intel_bo_unreference(brw->batch.bo);
    drm_intel_bo_unreference(brw->batch.workaround_bo);
-   intel_batchbuffer_clear_cache(brw);
 }
 
 static void
@@ -168,6 +145,7 @@ do_batch_dump(struct brw_context *brw)
                                         batch->used);
    }
 
+   drm_intel_decode_set_output_file(decode, stderr);
    drm_intel_decode(decode);
 
    drm_intel_decode_context_free(decode);
@@ -193,6 +171,7 @@ static void
 brw_new_batch(struct brw_context *brw)
 {
    /* Create a new batchbuffer and reset the associated state: */
+   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
    intel_batchbuffer_reset(brw);
 
    /* If the kernel supports hardware contexts, then most hardware state is
@@ -202,14 +181,9 @@ brw_new_batch(struct brw_context *brw)
     * purposes means everything).
     */
    if (brw->hw_ctx == NULL)
-      brw->state.dirty.brw |= BRW_NEW_CONTEXT;
+      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
 
-   brw->state.dirty.brw |= BRW_NEW_BATCH;
-
-   /* Assume that the last command before the start of our batch was a
-    * primitive, for safety.
-    */
-   brw->batch.need_workaround_flush = true;
+   brw->ctx.NewDriverState |= BRW_NEW_BATCH;
 
    brw->state_batch_count = 0;
 
@@ -248,12 +222,6 @@ brw_finish_batch(struct brw_context *brw)
    if (brw->batch.ring == RENDER_RING)
       brw_perf_monitor_finish_batch(brw);
 
-   if (brw->curbe.curbe_bo) {
-      drm_intel_gem_bo_unmap_gtt(brw->curbe.curbe_bo);
-      drm_intel_bo_unreference(brw->curbe.curbe_bo);
-      brw->curbe.curbe_bo = NULL;
-   }
-
    /* Mark that the current program cache BO has been used by the GPU.
     * It will be reallocated if we need to put new programs in for the
     * next batch.
@@ -261,6 +229,44 @@ brw_finish_batch(struct brw_context *brw)
    brw->cache.bo_used_by_gpu = true;
 }
 
+static void
+throttle(struct brw_context *brw)
+{
+   /* Wait for the swapbuffers before the one we just emitted, so we
+    * don't get too many swaps outstanding for apps that are GPU-heavy
+    * but not CPU-heavy.
+    *
+    * We're using intelDRI2Flush (called from the loader before
+    * swapbuffer) and glFlush (for front buffer rendering) as the
+    * indicator that a frame is done and then throttle when we get
+    * here as we prepare to render the next frame.  At this point for
+    * round trips for swap/copy and getting new buffers are done and
+    * we'll spend less time waiting on the GPU.
+    *
+    * Unfortunately, we don't have a handle to the batch containing
+    * the swap, and getting our hands on that doesn't seem worth it,
+    * so we just use the first batch we emitted after the last swap.
+    */
+   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
+      if (brw->throttle_batch[1]) {
+         if (!brw->disable_throttling)
+            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
+         drm_intel_bo_unreference(brw->throttle_batch[1]);
+      }
+      brw->throttle_batch[1] = brw->throttle_batch[0];
+      brw->throttle_batch[0] = NULL;
+      brw->need_swap_throttle = false;
+      /* Throttling here is more precise than the throttle ioctl, so skip it */
+      brw->need_flush_throttle = false;
+   }
+
+   if (brw->need_flush_throttle) {
+      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
+      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
+      brw->need_flush_throttle = false;
+   }
+}
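
A minimal standalone sketch of the two-slot window above (illustrative only: plain ints stand in for drm_intel_bo handles, and toy_frame() collapses the flush/throttle pair into one step):

#include <stdio.h>

static int throttle_batch[2];   /* [0] = this frame's first batch */

static void toy_frame(int batch_id)
{
   /* throttle(): wait on the first batch of the frame before last,
    * then slide the window down. */
   if (throttle_batch[0]) {
      if (throttle_batch[1])
         printf("frame %d: wait on batch %d\n", batch_id, throttle_batch[1]);
      throttle_batch[1] = throttle_batch[0];
      throttle_batch[0] = 0;
   }

   /* _intel_batchbuffer_flush(): remember this frame's first batch. */
   if (throttle_batch[0] == 0)
      throttle_batch[0] = batch_id;
}

int main(void)
{
   for (int i = 1; i <= 4; i++)
      toy_frame(i);   /* waits on batch 1 at frame 3, batch 2 at frame 4 */
   return 0;
}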
+
 /* TODO: Push this whole function into bufmgr.
  */
 static int
@@ -295,6 +301,7 @@ do_flush_locked(struct brw_context *brw)
       if (ret == 0) {
          if (unlikely(INTEL_DEBUG & DEBUG_AUB))
             brw_annotate_aub(brw);
+
         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
                                        flags);
@@ -303,6 +310,8 @@ do_flush_locked(struct brw_context *brw)
                                                4 * batch->used, flags);
         }
       }
+
+      throttle(brw);
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
@@ -325,9 +334,9 @@ _intel_batchbuffer_flush(struct brw_context *brw,
    if (brw->batch.used == 0)
       return 0;
 
-   if (brw->first_post_swapbuffers_batch == NULL) {
-      brw->first_post_swapbuffers_batch = brw->batch.bo;
-      drm_intel_bo_reference(brw->first_post_swapbuffers_batch);
+   if (brw->throttle_batch[0] == NULL) {
+      brw->throttle_batch[0] = brw->batch.bo;
+      drm_intel_bo_reference(brw->throttle_batch[0]);
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
@@ -387,26 +396,199 @@ intel_batchbuffer_emit_reloc(struct brw_context *brw,
    assert(ret == 0);
    (void)ret;
 
-   /*
-    * Using the old buffer offset, write in what the right data would be, in case
-    * the buffer doesn't move and we can short-circuit the relocation processing
-    * in the kernel
+   /* Using the old buffer offset, write in what the right data would be, in
+    * case the buffer doesn't move and we can short-circuit the relocation
+    * processing in the kernel
     */
    intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);
 
    return true;
 }
 
+bool
+intel_batchbuffer_emit_reloc64(struct brw_context *brw,
+                               drm_intel_bo *buffer,
+                               uint32_t read_domains, uint32_t write_domain,
+                               uint32_t delta)
+{
+   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+                                     buffer, delta,
+                                     read_domains, write_domain);
+   assert(ret == 0);
+   (void) ret;
+
+   /* Using the old buffer offset, write in what the right data would be, in
+    * case the buffer doesn't move and we can short-circuit the relocation
+    * processing in the kernel
+    */
+   uint64_t offset = buffer->offset64 + delta;
+   intel_batchbuffer_emit_dword(brw, offset);
+   intel_batchbuffer_emit_dword(brw, offset >> 32);
+
+   return true;
+}
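
The two emit_dword calls write the presumed 64-bit address low dword first; in isolation (illustrative values only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   uint64_t offset = 0x00000001ffff0000ull + 0x20;  /* offset64 + delta */
   uint32_t dw0 = (uint32_t) offset;          /* low dword, emitted first */
   uint32_t dw1 = (uint32_t)(offset >> 32);   /* high dword */
   assert(dw0 == 0xffff0020u && dw1 == 0x1u);
   return 0;
}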
+
+
 void
 intel_batchbuffer_data(struct brw_context *brw,
                        const void *data, GLuint bytes, enum brw_gpu_ring ring)
 {
    assert((bytes & 3) == 0);
    intel_batchbuffer_require_space(brw, bytes, ring);
-   __memcpy(brw->batch.map + brw->batch.used, data, bytes);
+   memcpy(brw->batch.map + brw->batch.used, data, bytes);
    brw->batch.used += bytes >> 2;
 }
 
+/**
+ * According to the latest documentation, any PIPE_CONTROL with the
+ * "Command Streamer Stall" bit set must also have another bit set,
+ * with five different options:
+ *
+ *  - Render Target Cache Flush
+ *  - Depth Cache Flush
+ *  - Stall at Pixel Scoreboard
+ *  - Post-Sync Operation
+ *  - Depth Stall
+ *
+ * I chose "Stall at Pixel Scoreboard" since we've used it effectively
+ * in the past, but the choice is fairly arbitrary.
+ */
+static void
+gen8_add_cs_stall_workaround_bits(uint32_t *flags)
+{
+   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                      PIPE_CONTROL_WRITE_IMMEDIATE |
+                      PIPE_CONTROL_WRITE_DEPTH_COUNT |
+                      PIPE_CONTROL_WRITE_TIMESTAMP |
+                      PIPE_CONTROL_STALL_AT_SCOREBOARD |
+                      PIPE_CONTROL_DEPTH_STALL;
+
+   /* If we're doing a CS stall, and don't already have one of the
+    * workaround bits set, add "Stall at Pixel Scoreboard."
+    */
+   if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
+      *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+}
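
The fix-up is easy to sanity-check in isolation; a sketch with mock bit values (the real PIPE_CONTROL_* defines live elsewhere in the driver):

#include <assert.h>
#include <stdint.h>

#define MOCK_CS_STALL      (1 << 20)   /* illustrative bit positions */
#define MOCK_SCOREBOARD    (1 << 1)
#define MOCK_DEPTH_STALL   (1 << 13)

static void add_wa_bits(uint32_t *flags)
{
   uint32_t wa_bits = MOCK_SCOREBOARD | MOCK_DEPTH_STALL;  /* abbreviated */
   if ((*flags & MOCK_CS_STALL) != 0 && (*flags & wa_bits) == 0)
      *flags |= MOCK_SCOREBOARD;
}

int main(void)
{
   uint32_t bare = MOCK_CS_STALL;                     /* gets fixed up */
   uint32_t ok   = MOCK_CS_STALL | MOCK_DEPTH_STALL;  /* already legal */
   add_wa_bits(&bare);
   add_wa_bits(&ok);
   assert(bare == (MOCK_CS_STALL | MOCK_SCOREBOARD));
   assert(ok == (MOCK_CS_STALL | MOCK_DEPTH_STALL));
   return 0;
}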
+
+/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
+ *
+ * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
+ *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
+ *
+ * Note that the kernel does CS stalls between batches, so we only need
+ * to count them within a batch.
+ */
+static uint32_t
+gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
+{
+   if (brw->gen == 7 && !brw->is_haswell) {
+      if (flags & PIPE_CONTROL_CS_STALL) {
+         /* If we're doing a CS stall, reset the counter and carry on. */
+         brw->batch.pipe_controls_since_last_cs_stall = 0;
+         return 0;
+      }
+
+      /* If this is the fourth pipe control without a CS stall, do one now. */
+      if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
+         brw->batch.pipe_controls_since_last_cs_stall = 0;
+         return PIPE_CONTROL_CS_STALL;
+      }
+   }
+   return 0;
+}
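
Simulated over a run of eight flag-only PIPE_CONTROLs, the counter injects a CS stall on every fourth one (standalone sketch with a mock bit value):

#include <stdio.h>
#include <stdint.h>

#define MOCK_CS_STALL (1 << 20)   /* illustrative bit position */

static int pipe_controls_since_last_cs_stall;

static uint32_t every_fourth(uint32_t flags)
{
   if (flags & MOCK_CS_STALL) {
      pipe_controls_since_last_cs_stall = 0;
      return 0;
   }
   if (++pipe_controls_since_last_cs_stall == 4) {
      pipe_controls_since_last_cs_stall = 0;
      return MOCK_CS_STALL;
   }
   return 0;
}

int main(void)
{
   for (int i = 1; i <= 8; i++)
      if (every_fourth(0) & MOCK_CS_STALL)
         printf("PIPE_CONTROL #%d gets a CS stall\n", i);  /* #4 and #8 */
   return 0;
}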
+
+/**
+ * Emit a PIPE_CONTROL with various flushing flags.
+ *
+ * The caller is responsible for deciding what flags are appropriate for the
+ * given generation.
+ */
+void
+brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
+{
+   if (brw->gen >= 8) {
+      gen8_add_cs_stall_workaround_bits(&flags);
+
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
+      OUT_BATCH(flags);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else if (brw->gen >= 6) {
+      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
+      BEGIN_BATCH(5);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
+      OUT_BATCH(flags);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+}
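
A typical call site would look like this (hypothetical caller; the flag combination depends on what the caller needs to flush):

/* e.g. flush the render target cache before sampling from it, pairing
 * the CS stall with a legal companion bit as required on Gen8+. */
brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                 PIPE_CONTROL_CS_STALL);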
+
+/**
+ * Emit a PIPE_CONTROL that writes to a buffer object.
+ *
+ * \p flags should contain one of the following items:
+ *  - PIPE_CONTROL_WRITE_IMMEDIATE
+ *  - PIPE_CONTROL_WRITE_TIMESTAMP
+ *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
+ */
+void
+brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
+                            drm_intel_bo *bo, uint32_t offset,
+                            uint32_t imm_lower, uint32_t imm_upper)
+{
+   if (brw->gen >= 8) {
+      gen8_add_cs_stall_workaround_bits(&flags);
+
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
+      OUT_BATCH(flags);
+      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  offset);
+      OUT_BATCH(imm_lower);
+      OUT_BATCH(imm_upper);
+      ADVANCE_BATCH();
+   } else if (brw->gen >= 6) {
+      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
+      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
+       * on later platforms.  We always use PPGTT on Gen7+.
+       */
+      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;
+
+      BEGIN_BATCH(5);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
+      OUT_BATCH(flags);
+      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                gen6_gtt | offset);
+      OUT_BATCH(imm_lower);
+      OUT_BATCH(imm_upper);
+      ADVANCE_BATCH();
+   } else {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
+      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
+      OUT_BATCH(imm_lower);
+      OUT_BATCH(imm_upper);
+      ADVANCE_BATCH();
+   }
+}
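
A hypothetical caller capturing a GPU timestamp (query_bo and the offset are illustrative, not part of this patch):

brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_TIMESTAMP,
                            query_bo, 0 /* offset */,
                            0, 0 /* immediate data unused */);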
+
 /**
  * Restriction [DevSNB, DevIVB]:
  *
@@ -422,28 +604,11 @@ intel_batchbuffer_data(struct brw_context *brw,
 void
 intel_emit_depth_stall_flushes(struct brw_context *brw)
 {
-   assert(brw->gen >= 6 && brw->gen <= 7);
-
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
-   OUT_BATCH(PIPE_CONTROL_DEPTH_STALL);
-   OUT_BATCH(0); /* address */
-   OUT_BATCH(0); /* write data */
-   ADVANCE_BATCH();
-
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
-   OUT_BATCH(PIPE_CONTROL_DEPTH_CACHE_FLUSH);
-   OUT_BATCH(0); /* address */
-   OUT_BATCH(0); /* write data */
-   ADVANCE_BATCH();
-
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
-   OUT_BATCH(PIPE_CONTROL_DEPTH_STALL);
-   OUT_BATCH(0); /* address */
-   OUT_BATCH(0); /* write data */
-   ADVANCE_BATCH();
+   assert(brw->gen >= 6 && brw->gen <= 9);
+
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
 }
 
 /**
@@ -458,14 +623,11 @@ void
 gen7_emit_vs_workaround_flush(struct brw_context *brw)
 {
    assert(brw->gen == 7);
-
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
-   OUT_BATCH(PIPE_CONTROL_DEPTH_STALL | PIPE_CONTROL_WRITE_IMMEDIATE);
-   OUT_RELOC(brw->batch.workaround_bo,
-            I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0);
-   OUT_BATCH(0); /* write data */
-   ADVANCE_BATCH();
+   brw_emit_pipe_control_write(brw,
+                               PIPE_CONTROL_WRITE_IMMEDIATE
+                               | PIPE_CONTROL_DEPTH_STALL,
+                               brw->batch.workaround_bo, 0,
+                               0, 0);
 }
 
 
@@ -475,26 +637,11 @@ gen7_emit_vs_workaround_flush(struct brw_context *brw)
 void
 gen7_emit_cs_stall_flush(struct brw_context *brw)
 {
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
-   /* From p61 of the Ivy Bridge PRM (1.10.4 PIPE_CONTROL Command: DW1[20]
-    * CS Stall):
-    *
-    *     One of the following must also be set:
-    *     - Render Target Cache Flush Enable ([12] of DW1)
-    *     - Depth Cache Flush Enable ([0] of DW1)
-    *     - Stall at Pixel Scoreboard ([1] of DW1)
-    *     - Depth Stall ([13] of DW1)
-    *     - Post-Sync Operation ([13] of DW1)
-    *
-    * We choose to do a Post-Sync Operation (Write Immediate Data), since
-    * it seems like it will incur the least additional performance penalty.
-    */
-   OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_WRITE_IMMEDIATE);
-   OUT_RELOC(brw->batch.workaround_bo,
-             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0);
-   OUT_BATCH(0);
-   ADVANCE_BATCH();
+   brw_emit_pipe_control_write(brw,
+                               PIPE_CONTROL_CS_STALL
+                               | PIPE_CONTROL_WRITE_IMMEDIATE,
+                               brw->batch.workaround_bo, 0,
+                               0, 0);
 }
 
 
@@ -538,26 +685,12 @@ gen7_emit_cs_stall_flush(struct brw_context *brw)
 void
 intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
 {
-   if (!brw->batch.need_workaround_flush)
-      return;
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_CS_STALL |
+                               PIPE_CONTROL_STALL_AT_SCOREBOARD);
 
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
-   OUT_BATCH(PIPE_CONTROL_CS_STALL |
-            PIPE_CONTROL_STALL_AT_SCOREBOARD);
-   OUT_BATCH(0); /* address */
-   OUT_BATCH(0); /* write data */
-   ADVANCE_BATCH();
-
-   BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
-   OUT_BATCH(PIPE_CONTROL_WRITE_IMMEDIATE);
-   OUT_RELOC(brw->batch.workaround_bo,
-            I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0);
-   OUT_BATCH(0); /* write data */
-   ADVANCE_BATCH();
-
-   brw->batch.need_workaround_flush = false;
+   brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
+                               brw->batch.workaround_bo, 0, 0, 0);
 }
 
 /* Emit a pipelined flush to either flush render and texture cache for
@@ -569,46 +702,95 @@ intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
 void
 intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
 {
-   if (brw->gen >= 6) {
-      if (brw->batch.ring == BLT_RING) {
-        BEGIN_BATCH_BLT(4);
-        OUT_BATCH(MI_FLUSH_DW);
-        OUT_BATCH(0);
-        OUT_BATCH(0);
-        OUT_BATCH(0);
-        ADVANCE_BATCH();
-      } else {
-        if (brw->gen == 6) {
-           /* Hardware workaround: SNB B-Spec says:
-            *
-            * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
-            * Flush Enable =1, a PIPE_CONTROL with any non-zero
-            * post-sync-op is required.
-            */
-           intel_emit_post_sync_nonzero_flush(brw);
-        }
+   if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
+      BEGIN_BATCH_BLT(4);
+      OUT_BATCH(MI_FLUSH_DW);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else {
+      int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
+      if (brw->gen >= 6) {
+         if (brw->gen == 9) {
+            /* Hardware workaround: SKL
+             *
+             * Emit Pipe Control with all bits set to zero before emitting
+             * a Pipe Control with VF Cache Invalidate set.
+             */
+            brw_emit_pipe_control_flush(brw, 0);
+         }
+
+         flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
+                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                  PIPE_CONTROL_CS_STALL;
+
+         if (brw->gen == 6) {
+            /* Hardware workaround: SNB B-Spec says:
+             *
+             * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
+             * Flush Enable =1, a PIPE_CONTROL with any non-zero
+             * post-sync-op is required.
+             */
+            intel_emit_post_sync_nonzero_flush(brw);
+         }
+      }
+      brw_emit_pipe_control_flush(brw, flags);
+   }
 
-        BEGIN_BATCH(4);
-        OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
-        OUT_BATCH(PIPE_CONTROL_INSTRUCTION_FLUSH |
-                  PIPE_CONTROL_WRITE_FLUSH |
-                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                   PIPE_CONTROL_VF_CACHE_INVALIDATE |
-                  PIPE_CONTROL_TC_FLUSH |
-                  PIPE_CONTROL_NO_WRITE |
-                   PIPE_CONTROL_CS_STALL);
-        OUT_BATCH(0); /* write address */
-        OUT_BATCH(0); /* write data */
-        ADVANCE_BATCH();
+   brw_render_cache_set_clear(brw);
+}
+
+static void
+load_sized_register_mem(struct brw_context *brw,
+                        uint32_t reg,
+                        drm_intel_bo *bo,
+                        uint32_t read_domains, uint32_t write_domain,
+                        uint32_t offset,
+                        int size)
+{
+   int i;
+
+   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
+   assert(brw->gen >= 7);
+
+   if (brw->gen >= 8) {
+      BEGIN_BATCH(4 * size);
+      for (i = 0; i < size; i++) {
+         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
+         OUT_BATCH(reg + i * 4);
+         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
       }
+      ADVANCE_BATCH();
    } else {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2) |
-               PIPE_CONTROL_WRITE_FLUSH |
-               PIPE_CONTROL_NO_WRITE);
-      OUT_BATCH(0); /* write address */
-      OUT_BATCH(0); /* write data */
-      OUT_BATCH(0); /* write data */
+      BEGIN_BATCH(3 * size);
+      for (i = 0; i < size; i++) {
+         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
+         OUT_BATCH(reg + i * 4);
+         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
+      }
       ADVANCE_BATCH();
    }
 }
+
+void
+brw_load_register_mem(struct brw_context *brw,
+                      uint32_t reg,
+                      drm_intel_bo *bo,
+                      uint32_t read_domains, uint32_t write_domain,
+                      uint32_t offset)
+{
+   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
+}
+
+void
+brw_load_register_mem64(struct brw_context *brw,
+                        uint32_t reg,
+                        drm_intel_bo *bo,
+                        uint32_t read_domains, uint32_t write_domain,
+                        uint32_t offset)
+{
+   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
+}
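
A hypothetical caller, in the style of the conditional-render code: load the 64-bit MI_PREDICATE source register pair from a query-result buffer (query_bo and the offset are illustrative):

brw_load_register_mem64(brw, MI_PREDICATE_SRC0, query_bo,
                        I915_GEM_DOMAIN_INSTRUCTION, 0 /* write domain */,
                        0 /* offset of the result in query_bo */);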