i965/state: Don't use brw->state.dirty.brw
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 818ac61cd5d3a8a2e9588b9c21a2d996798225e3..e522e4e9c1de942086f48d753c72d57c50f4616b 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -33,6 +33,9 @@
 #include "intel_fbo.h"
 #include "brw_context.h"
 
+#include <xf86drm.h>
+#include <i915_drm.h>
+
 static void
 intel_batchbuffer_reset(struct brw_context *brw);
 
@@ -51,8 +54,6 @@ intel_batchbuffer_init(struct brw_context *brw)
                                                      4096, 4096);
    }
 
-   brw->batch.need_workaround_flush = true;
-
    if (!brw->has_llc) {
       brw->batch.cpu_map = malloc(BATCH_SZ);
       brw->batch.map = brw->batch.cpu_map;
@@ -81,6 +82,7 @@ intel_batchbuffer_reset(struct brw_context *brw)
    brw->batch.state_batch_offset = brw->batch.bo->size;
    brw->batch.used = 0;
    brw->batch.needs_sol_reset = false;
+   brw->batch.pipe_controls_since_last_cs_stall = 0;
 
    /* We don't know what ring the new batch will be sent to until we see the
     * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
@@ -169,6 +171,7 @@ static void
 brw_new_batch(struct brw_context *brw)
 {
    /* Create a new batchbuffer and reset the associated state: */
+   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
    intel_batchbuffer_reset(brw);
 
    /* If the kernel supports hardware contexts, then most hardware state is
@@ -178,14 +181,9 @@ brw_new_batch(struct brw_context *brw)
     * purposes means everything).
     */
    if (brw->hw_ctx == NULL)
-      brw->state.dirty.brw |= BRW_NEW_CONTEXT;
+      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
 
-   brw->state.dirty.brw |= BRW_NEW_BATCH;
-
-   /* Assume that the last command before the start of our batch was a
-    * primitive, for safety.
-    */
-   brw->batch.need_workaround_flush = true;
+   brw->ctx.NewDriverState |= BRW_NEW_BATCH;
 
    brw->state_batch_count = 0;
 
@@ -224,12 +222,6 @@ brw_finish_batch(struct brw_context *brw)
    if (brw->batch.ring == RENDER_RING)
       brw_perf_monitor_finish_batch(brw);
 
-   if (brw->curbe.curbe_bo) {
-      drm_intel_gem_bo_unmap_gtt(brw->curbe.curbe_bo);
-      drm_intel_bo_unreference(brw->curbe.curbe_bo);
-      brw->curbe.curbe_bo = NULL;
-   }
-
    /* Mark that the current program cache BO has been used by the GPU.
     * It will be reallocated if we need to put new programs in for the
     * next batch.
@@ -237,6 +229,44 @@ brw_finish_batch(struct brw_context *brw)
    brw->cache.bo_used_by_gpu = true;
 }
 
+static void
+throttle(struct brw_context *brw)
+{
+   /* Wait for the swapbuffers before the one we just emitted, so we
+    * don't get too many swaps outstanding for apps that are GPU-heavy
+    * but not CPU-heavy.
+    *
+    * We're using intelDRI2Flush (called from the loader before
+    * swapbuffer) and glFlush (for front buffer rendering) as the
+    * indicator that a frame is done and then throttle when we get
+    * here as we prepare to render the next frame.  At this point, the
+    * round trips for swap/copy and getting new buffers are done and
+    * we'll spend less time waiting on the GPU.
+    *
+    * Unfortunately, we don't have a handle to the batch containing
+    * the swap, and getting our hands on that doesn't seem worth it,
+    * so we just use the first batch we emitted after the last swap.
+    */
+   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
+      if (brw->throttle_batch[1]) {
+         if (!brw->disable_throttling)
+            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
+         drm_intel_bo_unreference(brw->throttle_batch[1]);
+      }
+      brw->throttle_batch[1] = brw->throttle_batch[0];
+      brw->throttle_batch[0] = NULL;
+      brw->need_swap_throttle = false;
+      /* Throttling here is more precise than the throttle ioctl, so skip it */
+      brw->need_flush_throttle = false;
+   }
+
+   if (brw->need_flush_throttle) {
+      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
+      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
+      brw->need_flush_throttle = false;
+   }
+}
+
 /* TODO: Push this whole function into bufmgr.
  */
 static int
@@ -271,6 +301,7 @@ do_flush_locked(struct brw_context *brw)
       if (ret == 0) {
          if (unlikely(INTEL_DEBUG & DEBUG_AUB))
             brw_annotate_aub(brw);
+
         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
                                        flags);
@@ -279,6 +310,8 @@ do_flush_locked(struct brw_context *brw)
                                                4 * batch->used, flags);
         }
       }
+
+      throttle(brw);
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
@@ -301,9 +334,9 @@ _intel_batchbuffer_flush(struct brw_context *brw,
    if (brw->batch.used == 0)
       return 0;
 
-   if (brw->first_post_swapbuffers_batch == NULL) {
-      brw->first_post_swapbuffers_batch = brw->batch.bo;
-      drm_intel_bo_reference(brw->first_post_swapbuffers_batch);
+   if (brw->throttle_batch[0] == NULL) {
+      brw->throttle_batch[0] = brw->batch.bo;
+      drm_intel_bo_reference(brw->throttle_batch[0]);
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
@@ -363,10 +396,9 @@ intel_batchbuffer_emit_reloc(struct brw_context *brw,
    assert(ret == 0);
    (void)ret;
 
-   /*
-    * Using the old buffer offset, write in what the right data would be, in case
-    * the buffer doesn't move and we can short-circuit the relocation processing
-    * in the kernel
+   /* Using the old buffer offset, write in what the right data would be, in
+    * case the buffer doesn't move and we can short-circuit the relocation
+    * processing in the kernel
     */
    intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);
 
@@ -403,7 +435,7 @@ intel_batchbuffer_data(struct brw_context *brw,
 {
    assert((bytes & 3) == 0);
    intel_batchbuffer_require_space(brw, bytes, ring);
-   __memcpy(brw->batch.map + brw->batch.used, data, bytes);
+   memcpy(brw->batch.map + brw->batch.used, data, bytes);
    brw->batch.used += bytes >> 2;
 }
 
@@ -424,7 +456,7 @@ intel_batchbuffer_data(struct brw_context *brw,
 static void
 gen8_add_cs_stall_workaround_bits(uint32_t *flags)
 {
-   uint32_t wa_bits = PIPE_CONTROL_WRITE_FLUSH |
+   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                       PIPE_CONTROL_WRITE_IMMEDIATE |
                       PIPE_CONTROL_WRITE_DEPTH_COUNT |
@@ -439,6 +471,33 @@ gen8_add_cs_stall_workaround_bits(uint32_t *flags)
       *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
 }
 
+/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
+ *
+ * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
+ *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
+ *
+ * Note that the kernel does CS stalls between batches, so we only need
+ * to count them within a batch.
+ */
+static uint32_t
+gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
+{
+   if (brw->gen == 7 && !brw->is_haswell) {
+      if (flags & PIPE_CONTROL_CS_STALL) {
+         /* If we're doing a CS stall, reset the counter and carry on. */
+         brw->batch.pipe_controls_since_last_cs_stall = 0;
+         return 0;
+      }
+
+      /* If this is the fourth pipe control without a CS stall, do one now. */
+      if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
+         brw->batch.pipe_controls_since_last_cs_stall = 0;
+         return PIPE_CONTROL_CS_STALL;
+      }
+   }
+   return 0;
+}
+
 /**
  * Emit a PIPE_CONTROL with various flushing flags.
  *
@@ -460,6 +519,8 @@ brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
       OUT_BATCH(0);
       ADVANCE_BATCH();
    } else if (brw->gen >= 6) {
+      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
       BEGIN_BATCH(5);
       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
       OUT_BATCH(flags);
@@ -502,6 +563,8 @@ brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
       OUT_BATCH(imm_upper);
       ADVANCE_BATCH();
    } else if (brw->gen >= 6) {
+      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
       /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
        * on later platforms.  We always use PPGTT on Gen7+.
        */
@@ -541,7 +604,7 @@ brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
 void
 intel_emit_depth_stall_flushes(struct brw_context *brw)
 {
-   assert(brw->gen >= 6 && brw->gen <= 8);
+   assert(brw->gen >= 6 && brw->gen <= 9);
 
    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
@@ -622,17 +685,12 @@ gen7_emit_cs_stall_flush(struct brw_context *brw)
 void
 intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
 {
-   if (!brw->batch.need_workaround_flush)
-      return;
-
    brw_emit_pipe_control_flush(brw,
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
 
    brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
                                brw->batch.workaround_bo, 0, 0, 0);
-
-   brw->batch.need_workaround_flush = false;
 }
 
 /* Emit a pipelined flush to either flush render and texture cache for
@@ -652,12 +710,21 @@ intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
       OUT_BATCH(0);
       ADVANCE_BATCH();
    } else {
-      int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_WRITE_FLUSH;
+      int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
       if (brw->gen >= 6) {
-         flags |= PIPE_CONTROL_INSTRUCTION_FLUSH |
+         if (brw->gen == 9) {
+            /* Hardware workaround: SKL
+             *
+             * Emit Pipe Control with all bits set to zero before emitting
+             * a Pipe Control with VF Cache Invalidate set.
+             */
+            brw_emit_pipe_control_flush(brw, 0);
+         }
+
+         flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                   PIPE_CONTROL_VF_CACHE_INVALIDATE |
-                  PIPE_CONTROL_TC_FLUSH |
+                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                   PIPE_CONTROL_CS_STALL;
 
          if (brw->gen == 6) {