#include "intel_fbo.h"
#include "brw_context.h"
+#include <xf86drm.h>
+#include <i915_drm.h>
+
static void
intel_batchbuffer_reset(struct brw_context *brw);
4096, 4096);
}
- brw->batch.need_workaround_flush = true;
-
if (!brw->has_llc) {
brw->batch.cpu_map = malloc(BATCH_SZ);
brw->batch.map = brw->batch.cpu_map;
brw->batch.state_batch_offset = brw->batch.bo->size;
brw->batch.used = 0;
brw->batch.needs_sol_reset = false;
+ brw->batch.pipe_controls_since_last_cs_stall = 0;
/* We don't know what ring the new batch will be sent to until we see the
* first BEGIN_BATCH or BEGIN_BATCH_BLT. Mark it as unknown.
brw_new_batch(struct brw_context *brw)
{
/* Create a new batchbuffer and reset the associated state: */
+ drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
intel_batchbuffer_reset(brw);
/* If the kernel supports hardware contexts, then most hardware state is
brw->state.dirty.brw |= BRW_NEW_BATCH;
- /* Assume that the last command before the start of our batch was a
- * primitive, for safety.
- */
- brw->batch.need_workaround_flush = true;
-
brw->state_batch_count = 0;
brw->ib.type = -1;
brw->cache.bo_used_by_gpu = true;
}
+static void
+throttle(struct brw_context *brw)
+{
+ /* Wait for the swapbuffers before the one we just emitted, so we
+ * don't get too many swaps outstanding for apps that are GPU-heavy
+ * but not CPU-heavy.
+ *
+ * We're using intelDRI2Flush (called from the loader before
+ * swapbuffer) and glFlush (for front buffer rendering) as the
+ * indicator that a frame is done and then throttle when we get
+ * here as we prepare to render the next frame. At this point for
+ * round trips for swap/copy and getting new buffers are done and
+ * we'll spend less time waiting on the GPU.
+ *
+ * Unfortunately, we don't have a handle to the batch containing
+ * the swap, and getting our hands on that doesn't seem worth it,
+ * so we just use the first batch we emitted after the last swap.
+ */
+ if (brw->need_swap_throttle && brw->throttle_batch[0]) {
+ if (brw->throttle_batch[1]) {
+ if (!brw->disable_throttling)
+ drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
+ drm_intel_bo_unreference(brw->throttle_batch[1]);
+ }
+ brw->throttle_batch[1] = brw->throttle_batch[0];
+ brw->throttle_batch[0] = NULL;
+ brw->need_swap_throttle = false;
+ /* Throttling here is more precise than the throttle ioctl, so skip it */
+ brw->need_flush_throttle = false;
+ }
+
+ if (brw->need_flush_throttle) {
+ __DRIscreen *psp = brw->intelScreen->driScrnPriv;
+ drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
+ brw->need_flush_throttle = false;
+ }
+}
+
/* TODO: Push this whole function into bufmgr.
*/
static int
if (ret == 0) {
if (unlikely(INTEL_DEBUG & DEBUG_AUB))
brw_annotate_aub(brw);
+
if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
flags);
4 * batch->used, flags);
}
}
+
+ throttle(brw);
}
if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
if (brw->batch.used == 0)
return 0;
- if (brw->first_post_swapbuffers_batch == NULL) {
- brw->first_post_swapbuffers_batch = brw->batch.bo;
- drm_intel_bo_reference(brw->first_post_swapbuffers_batch);
+ if (brw->throttle_batch[0] == NULL) {
+ brw->throttle_batch[0] = brw->batch.bo;
+ drm_intel_bo_reference(brw->throttle_batch[0]);
}
if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
assert(ret == 0);
(void)ret;
- /*
- * Using the old buffer offset, write in what the right data would be, in case
- * the buffer doesn't move and we can short-circuit the relocation processing
- * in the kernel
+ /* Using the old buffer offset, write in what the right data would be, in
+ * case the buffer doesn't move and we can short-circuit the relocation
+ * processing in the kernel
*/
intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);
{
assert((bytes & 3) == 0);
intel_batchbuffer_require_space(brw, bytes, ring);
- __memcpy(brw->batch.map + brw->batch.used, data, bytes);
+ memcpy(brw->batch.map + brw->batch.used, data, bytes);
brw->batch.used += bytes >> 2;
}
static void
gen8_add_cs_stall_workaround_bits(uint32_t *flags)
{
- uint32_t wa_bits = PIPE_CONTROL_WRITE_FLUSH |
+ uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_DEPTH_CACHE_FLUSH |
PIPE_CONTROL_WRITE_IMMEDIATE |
PIPE_CONTROL_WRITE_DEPTH_COUNT |
*flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
}
+/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
+ *
+ * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
+ * only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
+ *
+ * Note that the kernel does CS stalls between batches, so we only need
+ * to count them within a batch.
+ */
+static uint32_t
+gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
+{
+ if (brw->gen == 7 && !brw->is_haswell) {
+ if (flags & PIPE_CONTROL_CS_STALL) {
+ /* If we're doing a CS stall, reset the counter and carry on. */
+ brw->batch.pipe_controls_since_last_cs_stall = 0;
+ return 0;
+ }
+
+ /* If this is the fourth pipe control without a CS stall, do one now. */
+ if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
+ brw->batch.pipe_controls_since_last_cs_stall = 0;
+ return PIPE_CONTROL_CS_STALL;
+ }
+ }
+ return 0;
+}
+
/**
* Emit a PIPE_CONTROL with various flushing flags.
*
OUT_BATCH(0);
ADVANCE_BATCH();
} else if (brw->gen >= 6) {
+ flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
BEGIN_BATCH(5);
OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
OUT_BATCH(flags);
OUT_BATCH(imm_upper);
ADVANCE_BATCH();
} else if (brw->gen >= 6) {
+ flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
/* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
* on later platforms. We always use PPGTT on Gen7+.
*/
void
intel_emit_depth_stall_flushes(struct brw_context *brw)
{
- assert(brw->gen >= 6 && brw->gen <= 8);
+ assert(brw->gen >= 6 && brw->gen <= 9);
brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
void
intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
{
- if (!brw->batch.need_workaround_flush)
- return;
-
brw_emit_pipe_control_flush(brw,
PIPE_CONTROL_CS_STALL |
PIPE_CONTROL_STALL_AT_SCOREBOARD);
brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
brw->batch.workaround_bo, 0, 0, 0);
-
- brw->batch.need_workaround_flush = false;
}
/* Emit a pipelined flush to either flush render and texture cache for
OUT_BATCH(0);
ADVANCE_BATCH();
} else {
- int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_WRITE_FLUSH;
+ int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
if (brw->gen >= 6) {
- flags |= PIPE_CONTROL_INSTRUCTION_FLUSH |
+ if (brw->gen == 9) {
+ /* Hardware workaround: SKL
+ *
+ * Emit Pipe Control with all bits set to zero before emitting
+ * a Pipe Control with VF Cache Invalidate set.
+ */
+ brw_emit_pipe_control_flush(brw, 0);
+ }
+
+ flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
PIPE_CONTROL_DEPTH_CACHE_FLUSH |
PIPE_CONTROL_VF_CACHE_INVALIDATE |
- PIPE_CONTROL_TC_FLUSH |
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
PIPE_CONTROL_CS_STALL;
if (brw->gen == 6) {