#include "intel_reg.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
+#include "intel_fbo.h"
#include "brw_context.h"
+#include <xf86drm.h>
+#include <i915_drm.h>
+
static void
intel_batchbuffer_reset(struct brw_context *brw);
-struct cached_batch_item {
- struct cached_batch_item *next;
- uint16_t header;
- uint16_t size;
-};
-
-void
-intel_batchbuffer_clear_cache(struct brw_context *brw)
-{
- struct cached_batch_item *item = brw->batch.cached_items;
-
- while (item) {
- struct cached_batch_item *next = item->next;
- free(item);
- item = next;
- }
-
- brw->batch.cached_items = NULL;
-}
-
void
intel_batchbuffer_init(struct brw_context *brw)
{
4096, 4096);
}
- brw->batch.need_workaround_flush = true;
-
if (!brw->has_llc) {
brw->batch.cpu_map = malloc(BATCH_SZ);
brw->batch.map = brw->batch.cpu_map;
}
brw->batch.last_bo = brw->batch.bo;
- intel_batchbuffer_clear_cache(brw);
+ brw_render_cache_set_clear(brw);
brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
BATCH_SZ, 4096);
brw->batch.state_batch_offset = brw->batch.bo->size;
brw->batch.used = 0;
brw->batch.needs_sol_reset = false;
+ brw->batch.pipe_controls_since_last_cs_stall = 0;
/* We don't know what ring the new batch will be sent to until we see the
* first BEGIN_BATCH or BEGIN_BATCH_BLT. Mark it as unknown.
brw->batch.used = brw->batch.saved.used;
if (brw->batch.used == 0)
brw->batch.ring = UNKNOWN_RING;
-
- /* Cached batch state is dead, since we just cleared some unknown part of the
- * batchbuffer. Assume that the caller resets any other state necessary.
- */
- intel_batchbuffer_clear_cache(brw);
}
void
drm_intel_bo_unreference(brw->batch.last_bo);
drm_intel_bo_unreference(brw->batch.bo);
drm_intel_bo_unreference(brw->batch.workaround_bo);
- intel_batchbuffer_clear_cache(brw);
}
static void
batch->used);
}
+ drm_intel_decode_set_output_file(decode, stderr);
drm_intel_decode(decode);
drm_intel_decode_context_free(decode);
brw_new_batch(struct brw_context *brw)
{
/* Create a new batchbuffer and reset the associated state: */
+ drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
intel_batchbuffer_reset(brw);
/* If the kernel supports hardware contexts, then most hardware state is
brw->state.dirty.brw |= BRW_NEW_BATCH;
- /* Assume that the last command before the start of our batch was a
- * primitive, for safety.
- */
- brw->batch.need_workaround_flush = true;
-
brw->state_batch_count = 0;
brw->ib.type = -1;
if (brw->batch.ring == RENDER_RING)
brw_perf_monitor_finish_batch(brw);
- if (brw->curbe.curbe_bo) {
- drm_intel_gem_bo_unmap_gtt(brw->curbe.curbe_bo);
- drm_intel_bo_unreference(brw->curbe.curbe_bo);
- brw->curbe.curbe_bo = NULL;
- }
-
/* Mark that the current program cache BO has been used by the GPU.
* It will be reallocated if we need to put new programs in for the
* next batch.
brw->cache.bo_used_by_gpu = true;
}
+static void
+throttle(struct brw_context *brw)
+{
+ /* Wait for the swapbuffers before the one we just emitted, so we
+ * don't get too many swaps outstanding for apps that are GPU-heavy
+ * but not CPU-heavy.
+ *
+ * We're using intelDRI2Flush (called from the loader before
+ * swapbuffer) and glFlush (for front buffer rendering) as the
+ * indicator that a frame is done and then throttle when we get
+ * here as we prepare to render the next frame. At this point, all
+ * round trips for swap/copy and getting new buffers are done, so
+ * we'll spend less time waiting on the GPU.
+ *
+ * Unfortunately, we don't have a handle to the batch containing
+ * the swap, and getting our hands on that doesn't seem worth it,
+ * so we just use the first batch we emitted after the last swap.
+ */
+ if (brw->need_swap_throttle && brw->throttle_batch[0]) {
+ if (brw->throttle_batch[1]) {
+ if (!brw->disable_throttling)
+ drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
+ drm_intel_bo_unreference(brw->throttle_batch[1]);
+ }
+ brw->throttle_batch[1] = brw->throttle_batch[0];
+ brw->throttle_batch[0] = NULL;
+ brw->need_swap_throttle = false;
+ /* Throttling here is more precise than the throttle ioctl, so skip it */
+ brw->need_flush_throttle = false;
+ }
+
+ if (brw->need_flush_throttle) {
+ __DRIscreen *psp = brw->intelScreen->driScrnPriv;
+ drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
+ brw->need_flush_throttle = false;
+ }
+}
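+
+/* A sketch of the assumed flag protocol (an assumption, not part of this
+ * patch): the loader's DRI2 flush callback is expected to set
+ * brw->need_swap_throttle once per frame, while front-buffer glFlush
+ * paths set brw->need_flush_throttle; throttle() then consumes whichever
+ * flag is pending when the next batch is submitted.
+ */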
+
/* TODO: Push this whole function into bufmgr.
*/
static int
if (ret == 0) {
if (unlikely(INTEL_DEBUG & DEBUG_AUB))
brw_annotate_aub(brw);
+
if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
flags);
4 * batch->used, flags);
}
}
+
+ throttle(brw);
}
if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
if (brw->batch.used == 0)
return 0;
- if (brw->first_post_swapbuffers_batch == NULL) {
- brw->first_post_swapbuffers_batch = brw->batch.bo;
- drm_intel_bo_reference(brw->first_post_swapbuffers_batch);
+ if (brw->throttle_batch[0] == NULL) {
+ brw->throttle_batch[0] = brw->batch.bo;
+ drm_intel_bo_reference(brw->throttle_batch[0]);
}
if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
assert(ret == 0);
(void)ret;
- /*
- * Using the old buffer offset, write in what the right data would be, in case
- * the buffer doesn't move and we can short-circuit the relocation processing
- * in the kernel
+ /* Using the old buffer offset, write in what the right data would be, in
+ * case the buffer doesn't move and we can short-circuit the relocation
+ * processing in the kernel
*/
intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);
{
assert((bytes & 3) == 0);
intel_batchbuffer_require_space(brw, bytes, ring);
- __memcpy(brw->batch.map + brw->batch.used, data, bytes);
+ memcpy(brw->batch.map + brw->batch.used, data, bytes);
brw->batch.used += bytes >> 2;
}
+/**
+ * According to the latest documentation, any PIPE_CONTROL with the
+ * "Command Streamer Stall" bit set must also have another bit set,
+ * with five different options:
+ *
+ * - Render Target Cache Flush
+ * - Depth Cache Flush
+ * - Stall at Pixel Scoreboard
+ * - Post-Sync Operation
+ * - Depth Stall
+ *
+ * I chose "Stall at Pixel Scoreboard" since we've used it effectively
+ * in the past, but the choice is fairly arbitrary.
+ */
+static void
+gen8_add_cs_stall_workaround_bits(uint32_t *flags)
+{
+ uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_WRITE_IMMEDIATE |
+ PIPE_CONTROL_WRITE_DEPTH_COUNT |
+ PIPE_CONTROL_WRITE_TIMESTAMP |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD |
+ PIPE_CONTROL_DEPTH_STALL;
+
+ /* If we're doing a CS stall, and don't already have one of the
+ * workaround bits set, add "Stall at Pixel Scoreboard."
+ */
+ if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
+ *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+}
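+
+/* For example (illustrative): a caller passing only PIPE_CONTROL_CS_STALL
+ * has its flags rewritten to PIPE_CONTROL_CS_STALL |
+ * PIPE_CONTROL_STALL_AT_SCOREBOARD, while a caller that already set one
+ * of the bits above (say, PIPE_CONTROL_DEPTH_STALL) is left untouched.
+ */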
+
+/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
+ *
+ * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
+ * only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
+ *
+ * Note that the kernel does CS stalls between batches, so we only need
+ * to count them within a batch.
+ */
+static uint32_t
+gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
+{
+ if (brw->gen == 7 && !brw->is_haswell) {
+ if (flags & PIPE_CONTROL_CS_STALL) {
+ /* If we're doing a CS stall, reset the counter and carry on. */
+ brw->batch.pipe_controls_since_last_cs_stall = 0;
+ return 0;
+ }
+
+ /* If this is the fourth pipe control without a CS stall, do one now. */
+ if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
+ brw->batch.pipe_controls_since_last_cs_stall = 0;
+ return PIPE_CONTROL_CS_STALL;
+ }
+ }
+ return 0;
+}
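+
+/* For example (illustrative): three consecutive pipe controls without
+ * PIPE_CONTROL_CS_STALL advance the counter to 3; the fourth call gets
+ * PIPE_CONTROL_CS_STALL OR'd into its flags and the counter resets to 0.
+ */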
+
/**
* Emit a PIPE_CONTROL with various flushing flags.
*
brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
{
if (brw->gen >= 8) {
+ gen8_add_cs_stall_workaround_bits(&flags);
+
BEGIN_BATCH(6);
OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
OUT_BATCH(flags);
OUT_BATCH(0);
ADVANCE_BATCH();
} else if (brw->gen >= 6) {
+ flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
BEGIN_BATCH(5);
OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
OUT_BATCH(flags);
uint32_t imm_lower, uint32_t imm_upper)
{
if (brw->gen >= 8) {
+ gen8_add_cs_stall_workaround_bits(&flags);
+
BEGIN_BATCH(6);
OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
OUT_BATCH(flags);
OUT_BATCH(imm_upper);
ADVANCE_BATCH();
} else if (brw->gen >= 6) {
+ flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
/* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
* on later platforms. We always use PPGTT on Gen7+.
*/
void
intel_emit_depth_stall_flushes(struct brw_context *brw)
{
- assert(brw->gen >= 6 && brw->gen <= 7);
+ assert(brw->gen >= 6 && brw->gen <= 9);
brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
void
intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
{
- if (!brw->batch.need_workaround_flush)
- return;
-
brw_emit_pipe_control_flush(brw,
PIPE_CONTROL_CS_STALL |
PIPE_CONTROL_STALL_AT_SCOREBOARD);
brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
brw->batch.workaround_bo, 0, 0, 0);
-
- brw->batch.need_workaround_flush = false;
}
/* Emit a pipelined flush to either flush render and texture cache for
OUT_BATCH(0);
ADVANCE_BATCH();
} else {
- int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_WRITE_FLUSH;
+ int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
if (brw->gen >= 6) {
- flags |= PIPE_CONTROL_INSTRUCTION_FLUSH |
+ if (brw->gen == 9) {
+ /* Hardware workaround: SKL
+ *
+ * Emit Pipe Control with all bits set to zero before emitting
+ * a Pipe Control with VF Cache Invalidate set.
+ */
+ brw_emit_pipe_control_flush(brw, 0);
+ }
+
+ flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
PIPE_CONTROL_DEPTH_CACHE_FLUSH |
PIPE_CONTROL_VF_CACHE_INVALIDATE |
- PIPE_CONTROL_TC_FLUSH |
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
PIPE_CONTROL_CS_STALL;
if (brw->gen == 6) {
}
brw_emit_pipe_control_flush(brw, flags);
}
+
+ brw_render_cache_set_clear(brw);
+}
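+
+/* Illustrative consequence of the SKL workaround above: on gen9 a flush
+ * that invalidates the VF cache is emitted as two PIPE_CONTROLs, a null
+ * one followed by the real one.
+ */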
+
+void
+brw_load_register_mem(struct brw_context *brw,
+ uint32_t reg,
+ drm_intel_bo *bo,
+ uint32_t read_domains, uint32_t write_domain,
+ uint32_t offset)
+{
+ /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
+ assert(brw->gen >= 7);
+
+ if (brw->gen >= 8) {
+ BEGIN_BATCH(4);
+ OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
+ OUT_BATCH(reg);
+ OUT_RELOC64(bo, read_domains, write_domain, offset);
+ ADVANCE_BATCH();
+ } else {
+ BEGIN_BATCH(3);
+ OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
+ OUT_BATCH(reg);
+ OUT_RELOC(bo, read_domains, write_domain, offset);
+ ADVANCE_BATCH();
+ }
}