struct brw_bo *bo;
/** Last BO submitted to the hardware. Used for glFinish(). */
struct brw_bo *last_bo;
+ /** Current statebuffer being queued up. */
+ struct brw_bo *state_bo;
#ifdef DEBUG
uint16_t emit, total;
uint16_t reserved_space;
uint32_t *map_next;
uint32_t *map;
- uint32_t *cpu_map;
+ uint32_t *batch_cpu_map;
+ uint32_t *state_cpu_map;
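+ /** CPU-side pointer to the statebuffer contents: state_cpu_map when
+ * there is no LLC, otherwise a mapping of state_bo.
+ */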
+ uint32_t *state_map;
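+ /** Bytes of statebuffer space used so far; the next allocation starts
+ * at ALIGN(state_used, alignment).
+ */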
+ uint32_t state_used;
- uint32_t state_batch_offset;
enum brw_gpu_ring ring;
bool use_batch_first;
bool needs_sol_reset;
bool state_base_address_emitted;
struct brw_reloc_list batch_relocs;
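+ /** Relocations to be applied to the statebuffer (see brw_state_reloc). */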
+ struct brw_reloc_list state_relocs;
unsigned int valid_reloc_flags;
/** The validation list */
struct {
uint32_t *map_next;
int batch_reloc_count;
+ int state_reloc_count;
int exec_count;
} saved;
BEGIN_BATCH(7);
OUT_BATCH(_3DSTATE_PIPELINED_POINTERS << 16 | (7 - 2));
- OUT_RELOC(brw->batch.bo, 0, brw->vs.base.state_offset);
+ OUT_RELOC(brw->batch.state_bo, 0, brw->vs.base.state_offset);
if (brw->ff_gs.prog_active)
- OUT_RELOC(brw->batch.bo, 0, brw->ff_gs.state_offset | 1);
+ OUT_RELOC(brw->batch.state_bo, 0, brw->ff_gs.state_offset | 1);
else
OUT_BATCH(0);
- OUT_RELOC(brw->batch.bo, 0, brw->clip.state_offset | 1);
- OUT_RELOC(brw->batch.bo, 0, brw->sf.state_offset);
- OUT_RELOC(brw->batch.bo, 0, brw->wm.base.state_offset);
- OUT_RELOC(brw->batch.bo, 0, brw->cc.state_offset);
+ OUT_RELOC(brw->batch.state_bo, 0, brw->clip.state_offset | 1);
+ OUT_RELOC(brw->batch.state_bo, 0, brw->sf.state_offset);
+ OUT_RELOC(brw->batch.state_bo, 0, brw->wm.base.state_offset);
+ OUT_RELOC(brw->batch.state_bo, 0, brw->cc.state_offset);
ADVANCE_BATCH();
brw->ctx.NewDriverState |= BRW_NEW_PSP;
OUT_BATCH(0);
OUT_BATCH(mocs_wb << 16);
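+ /* Surface state and dynamic state both live in the statebuffer now, so
+ * both base addresses point at state_bo.
+ */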
/* Surface state base address: */
- OUT_RELOC64(brw->batch.bo, 0, mocs_wb << 4 | 1);
+ OUT_RELOC64(brw->batch.state_bo, 0, mocs_wb << 4 | 1);
/* Dynamic state base address: */
- OUT_RELOC64(brw->batch.bo, 0, mocs_wb << 4 | 1);
+ OUT_RELOC64(brw->batch.state_bo, 0, mocs_wb << 4 | 1);
/* Indirect object base address: MEDIA_OBJECT data */
OUT_BATCH(mocs_wb << 4 | 1);
OUT_BATCH(0);
/* General state buffer size */
OUT_BATCH(0xfffff001);
/* Dynamic state buffer size */
- OUT_BATCH(ALIGN(brw->batch.bo->size, 4096) | 1);
+ OUT_BATCH(ALIGN(brw->batch.state_bo->size, 4096) | 1);
/* Indirect object upper bound */
OUT_BATCH(0xfffff001);
/* Instruction access upper bound */
* BINDING_TABLE_STATE
* SURFACE_STATE
*/
- OUT_RELOC(brw->batch.bo, 0, 1);
+ OUT_RELOC(brw->batch.state_bo, 0, 1);
/* Dynamic state base address:
* SAMPLER_STATE
* SAMPLER_BORDER_COLOR_STATE
* Push constants (when INSTPM: CONSTANT_BUFFER Address Offset
* Disable is clear, which we rely on)
*/
- OUT_RELOC(brw->batch.bo, 0, 1);
+ OUT_RELOC(brw->batch.state_bo, 0, 1);
OUT_BATCH(1); /* Indirect object base address: MEDIA_OBJECT data */
BEGIN_BATCH(8);
OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
OUT_BATCH(1); /* General state base address */
- OUT_RELOC(brw->batch.bo, 0, 1); /* Surface state base address */
+ OUT_RELOC(brw->batch.state_bo, 0, 1); /* Surface state base address */
OUT_BATCH(1); /* Indirect object base address */
OUT_RELOC(brw->cache.bo, 0, 1); /* Instruction base address */
OUT_BATCH(0xfffff001); /* General state upper bound */
BEGIN_BATCH(6);
OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
OUT_BATCH(1); /* General state base address */
- OUT_RELOC(brw->batch.bo, 0, 1); /* Surface state base address */
+ OUT_RELOC(brw->batch.state_bo, 0, 1); /* Surface state base address */
OUT_BATCH(1); /* Indirect object base address */
OUT_BATCH(1); /* General state upper bound */
OUT_BATCH(1); /* Indirect object upper bound */
struct brw_context *brw = batch->driver_batch;
return (struct blorp_address) {
- .buffer = brw->batch.bo,
+ .buffer = brw->batch.state_bo,
.offset = offset,
};
}
{
assert(batch->blorp->driver_ctx == batch->driver_batch);
struct brw_context *brw = batch->driver_batch;
+ uint32_t offset;
+
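+ /* On Gen4-5 the location being patched may live in the statebuffer
+ * rather than the batch; if so, record the relocation in state_relocs.
+ */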
+ if (GEN_GEN < 6 && brw_ptr_in_state_buffer(&brw->batch, location)) {
+ offset = (char *)location - (char *)brw->batch.state_map;
+ return brw_state_reloc(&brw->batch, offset,
+ address.buffer, address.offset + delta,
+ address.reloc_flags);
+ }
+
+ assert(!brw_ptr_in_state_buffer(&brw->batch, location));
- uint32_t offset = (char *)location - (char *)brw->batch.map;
+ offset = (char *)location - (char *)brw->batch.map;
return brw_batch_reloc(&brw->batch, offset,
address.buffer, address.offset + delta,
address.reloc_flags);
brw_state_reloc(&brw->batch, ss_offset, bo, address.offset + delta,
address.reloc_flags);
- void *reloc_ptr = (void *)brw->batch.map + ss_offset;
+ void *reloc_ptr = (void *)brw->batch.state_map + ss_offset;
#if GEN_GEN >= 8
*(uint64_t *)reloc_ptr = reloc_val;
#else
void *data = brw_state_batch(brw, size, 64, &offset);
*addr = (struct blorp_address) {
- .buffer = brw->batch.bo,
+ .buffer = brw->batch.state_bo,
.offset = offset,
};
intel_batchbuffer_save_state(brw);
struct brw_bo *saved_bo = brw->batch.bo;
uint32_t saved_used = USED_BATCH(brw->batch);
- uint32_t saved_state_batch_offset = brw->batch.state_batch_offset;
+ uint32_t saved_state_used = brw->batch.state_used;
#if GEN_GEN == 6
/* Emit workaround flushes when we switch from drawing to blorping. */
*/
assert(brw->batch.bo == saved_bo);
assert((USED_BATCH(brw->batch) - saved_used) * 4 +
- (saved_state_batch_offset - brw->batch.state_batch_offset) <
+ (brw->batch.state_used - saved_state_used) <
estimated_max_batch_usage);
/* Shut up compiler warnings on release build */
(void)saved_bo;
(void)saved_used;
- (void)saved_state_batch_offset;
+ (void)saved_state_used;
/* Check if the blorp op we just did would make our batch likely to fail to
* map all the BOs into the GPU at batch exec time later. If so, flush the
__gen_combine_address(struct brw_context *brw, void *location,
struct brw_address address, uint32_t delta)
{
+ struct intel_batchbuffer *batch = &brw->batch;
+ uint32_t offset;
+
if (address.bo == NULL) {
return address.offset + delta;
} else {
- uint32_t offset = (char *) location - (char *) brw->batch.map;
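+ /* On Gen4-5, state packed into the statebuffer contains pointers that
+ * need relocation; check which buffer the destination lives in and use
+ * the matching relocation list.
+ */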
+ if (GEN_GEN < 6 && brw_ptr_in_state_buffer(batch, location)) {
+ offset = (char *) location - (char *) brw->batch.state_map;
+ return brw_state_reloc(batch, offset, address.bo,
+ address.offset + delta,
+ address.reloc_flags);
+ }
+
+ assert(!brw_ptr_in_state_buffer(batch, location));
- /* TODO: Use brw_state_reloc for some things on Gen4-5 */
- return brw_batch_reloc(&brw->batch, offset, address.bo,
+ offset = (char *) location - (char *) brw->batch.map;
+ return brw_batch_reloc(batch, offset, address.bo,
address.offset + delta,
address.reloc_flags);
}
clip.GuardbandClipTestEnable = true;
clip.ClipperViewportStatePointer =
- ro_bo(brw->batch.bo, brw->clip.vp_offset);
+ ro_bo(brw->batch.state_bo, brw->clip.vp_offset);
clip.ScreenSpaceViewportXMin = -1;
clip.ScreenSpaceViewportXMax = 1;
* something loaded through the GPE (L2 ISC), so it's INSTRUCTION
* domain.
*/
- sf.SetupViewportStateOffset = ro_bo(brw->batch.bo, brw->sf.vp_offset);
+ sf.SetupViewportStateOffset =
+ ro_bo(brw->batch.state_bo, brw->sf.vp_offset);
sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
if (stage_state->sampler_count)
wm.SamplerStatePointer =
- ro_bo(brw->batch.bo, stage_state->sampler_offset);
+ ro_bo(brw->batch.state_bo, stage_state->sampler_offset);
#if GEN_GEN == 5
if (wm_prog_data->prog_offset_2)
wm.GRFRegisterCount2 = wm_prog_data->reg_blocks_2;
vs.StatisticsEnable = false;
vs.SamplerStatePointer =
- ro_bo(brw->batch.bo, stage_state->sampler_offset);
+ ro_bo(brw->batch.state_bo, stage_state->sampler_offset);
#endif
#if GEN_GEN == 5
cc.StatisticsEnable = brw->stats_wm;
- cc.CCViewportStatePointer = ro_bo(brw->batch.bo, brw->cc.vp_offset);
+ cc.CCViewportStatePointer =
+ ro_bo(brw->batch.state_bo, brw->cc.vp_offset);
#else
/* _NEW_COLOR */
cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
&border_color_offset);
}
#if GEN_GEN < 6
- samp_st.BorderColorPointer = ro_bo(brw->batch.bo, border_color_offset);
+ samp_st.BorderColorPointer =
+ ro_bo(brw->batch.state_bo, border_color_offset);
#else
samp_st.BorderColorPointer = border_color_offset;
#endif
#define FILE_DEBUG_FLAG DEBUG_BUFMGR
#define BATCH_SZ (8192*sizeof(uint32_t))
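+/* The statebuffer starts out the same size as the batchbuffer; for now
+ * the flushing checks still behave as if the two shared a single BO.
+ */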
+#define STATE_SZ (8192*sizeof(uint32_t))
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
const struct gen_device_info *devinfo = &screen->devinfo;
if (!devinfo->has_llc) {
- batch->cpu_map = malloc(BATCH_SZ);
- batch->map = batch->cpu_map;
- batch->map_next = batch->cpu_map;
+ batch->batch_cpu_map = malloc(BATCH_SZ);
+ batch->map = batch->batch_cpu_map;
+ batch->map_next = batch->map;
+ batch->state_cpu_map = malloc(STATE_SZ);
+ batch->state_map = batch->state_cpu_map;
}
init_reloc_list(&batch->batch_relocs, 250);
+ init_reloc_list(&batch->state_relocs, 250);
batch->exec_count = 0;
batch->exec_array_size = 100;
batch->last_bo = batch->bo;
batch->bo = brw_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
- if (devinfo->has_llc) {
+ if (!batch->batch_cpu_map) {
batch->map = brw_bo_map(NULL, batch->bo, MAP_READ | MAP_WRITE);
}
batch->map_next = batch->map;
+ batch->state_bo = brw_bo_alloc(bufmgr, "statebuffer", STATE_SZ, 4096);
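+ /* Ask the kernel to capture the statebuffer contents in GPU error
+ * states (EXEC_OBJECT_CAPTURE), when supported.
+ */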
+ batch->state_bo->kflags =
+ can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
+ if (!batch->state_cpu_map) {
+ batch->state_map =
+ brw_bo_map(NULL, batch->state_bo, MAP_READ | MAP_WRITE);
+ }
+
+ /* Avoid making 0 a valid state offset; otherwise the decoder will try
+ * to decode data when we use offset 0 as a null pointer.
+ */
+ batch->state_used = 1;
+
add_exec_bo(batch, batch->bo);
assert(batch->bo->index == 0);
batch->reserved_space = BATCH_RESERVED;
- batch->state_batch_offset = batch->bo->size;
batch->needs_sol_reset = false;
batch->state_base_address_emitted = false;
{
brw->batch.saved.map_next = brw->batch.map_next;
brw->batch.saved.batch_reloc_count = brw->batch.batch_relocs.reloc_count;
+ brw->batch.saved.state_reloc_count = brw->batch.state_relocs.reloc_count;
brw->batch.saved.exec_count = brw->batch.exec_count;
}
brw_bo_unreference(brw->batch.exec_bos[i]);
}
brw->batch.batch_relocs.reloc_count = brw->batch.saved.batch_reloc_count;
+ brw->batch.state_relocs.reloc_count = brw->batch.saved.state_reloc_count;
brw->batch.exec_count = brw->batch.saved.exec_count;
brw->batch.map_next = brw->batch.saved.map_next;
void
intel_batchbuffer_free(struct intel_batchbuffer *batch)
{
- free(batch->cpu_map);
+ free(batch->batch_cpu_map);
+ free(batch->state_cpu_map);
for (int i = 0; i < batch->exec_count; i++) {
brw_bo_unreference(batch->exec_bos[i]);
}
free(batch->batch_relocs.relocs);
+ free(batch->state_relocs.relocs);
free(batch->exec_bos);
free(batch->validation_list);
brw_bo_unreference(batch->last_bo);
brw_bo_unreference(batch->bo);
+ brw_bo_unreference(batch->state_bo);
if (batch->state_batch_sizes)
_mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
}
enum brw_gpu_ring ring)
{
const struct gen_device_info *devinfo = &brw->screen->devinfo;
+ struct intel_batchbuffer *batch = &brw->batch;
/* If we're switching rings, implicitly flush the batch. */
if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
intel_batchbuffer_flush(brw);
}
- if (intel_batchbuffer_space(&brw->batch) < sz)
+ /* For now, flush as if the batch and state buffers still shared a BO */
+ if (USED_BATCH(*batch) * 4 + sz >=
+ BATCH_SZ - batch->reserved_space - batch->state_used)
intel_batchbuffer_flush(brw);
/* The intel_batchbuffer_flush() calls above might have changed
return;
uint32_t *batch_data = brw_bo_map(brw, batch->bo, MAP_READ);
- uint32_t *state = batch_data;
+ uint32_t *state = brw_bo_map(brw, batch->state_bo, MAP_READ);
- if (batch == NULL || state == NULL) {
+ if (batch_data == NULL || state == NULL) {
fprintf(stderr, "WARNING: failed to map batchbuffer/statebuffer\n");
return;
uint32_t *end = batch_data + USED_BATCH(*batch);
uint32_t batch_gtt_offset = batch->bo->gtt_offset;
- uint32_t state_gtt_offset = batch->bo->gtt_offset;
+ uint32_t state_gtt_offset = batch->state_bo->gtt_offset;
int length;
bool color = INTEL_DEBUG & DEBUG_COLOR;
}
brw_bo_unmap(batch->bo);
+ brw_bo_unmap(batch->state_bo);
}
#else
static void do_batch_dump(struct brw_context *brw) { }
brw->batch.exec_bos[i] = NULL;
}
brw->batch.batch_relocs.reloc_count = 0;
+ brw->batch.state_relocs.reloc_count = 0;
brw->batch.exec_count = 0;
brw->batch.aperture_space = 0;
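+ /* Drop the old statebuffer; the reset below allocates a fresh one. */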
+ brw_bo_unreference(brw->batch.state_bo);
+
/* Create a new batchbuffer and reset the associated state: */
intel_batchbuffer_reset_and_clear_render_cache(brw);
struct intel_batchbuffer *batch = &brw->batch;
int ret = 0;
- if (batch->cpu_map) {
+ if (batch->batch_cpu_map) {
void *bo_map = brw_bo_map(brw, batch->bo, MAP_WRITE);
- memcpy(bo_map, batch->cpu_map, 4 * USED_BATCH(*batch));
- memcpy(bo_map + batch->state_batch_offset,
- (char *) batch->cpu_map + batch->state_batch_offset,
- batch->bo->size - batch->state_batch_offset);
+ memcpy(bo_map, batch->batch_cpu_map, 4 * USED_BATCH(*batch));
+ }
+
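+ /* Without LLC, state was accumulated in a malloc'd shadow copy; upload
+ * it to the statebuffer BO before execution, as with the batch above.
+ */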
+ if (batch->state_cpu_map) {
+ void *bo_map = brw_bo_map(brw, batch->state_bo, MAP_WRITE);
+ memcpy(bo_map, batch->state_cpu_map, batch->state_used);
}
brw_bo_unmap(batch->bo);
+ brw_bo_unmap(batch->state_bo);
if (!brw->screen->no_hw) {
/* The requirements for using I915_EXEC_NO_RELOC are:
uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;
+ /* Set statebuffer relocations */
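+ /* The statebuffer only appears in the validation list if some
+ * relocation targeted it this batch, so check before filling this in.
+ */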
+ const unsigned state_index = batch->state_bo->index;
+ if (state_index < batch->exec_count &&
+ batch->exec_bos[state_index] == batch->state_bo) {
+ struct drm_i915_gem_exec_object2 *entry =
+ &batch->validation_list[state_index];
+ assert(entry->handle == batch->state_bo->gem_handle);
+ entry->relocation_count = batch->state_relocs.reloc_count;
+ entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
+ }
+
+ /* Set batchbuffer relocations */
struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
assert(entry->handle == batch->bo->gem_handle);
entry->relocation_count = batch->batch_relocs.reloc_count;
if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT))) {
int bytes_for_commands = 4 * USED_BATCH(brw->batch);
- int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
- int total_bytes = bytes_for_commands + bytes_for_state;
- fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
- "%4db (state) = %4db (%0.1f%%)\n", file, line,
- bytes_for_commands, bytes_for_state,
- total_bytes,
- 100.0f * total_bytes / BATCH_SZ);
+ int bytes_for_state = brw->batch.state_used;
+ fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (%0.1f%%) (pkt) + "
+ "%4db (%0.1f%%) (state)\n", file, line,
+ bytes_for_commands, 100.0f * bytes_for_commands / BATCH_SZ,
+ bytes_for_state, 100.0f * bytes_for_state / STATE_SZ);
}
brw->batch.reserved_space = 0;
struct brw_bo *target, uint32_t target_offset,
unsigned int reloc_flags)
{
- assert(state_offset <= batch->bo->size - sizeof(uint32_t));
+ assert(state_offset <= batch->state_bo->size - sizeof(uint32_t));
- return emit_reloc(batch, &batch->batch_relocs, state_offset,
+ return emit_reloc(batch, &batch->state_relocs, state_offset,
target, target_offset, reloc_flags);
}
uint32_t *out_offset)
{
struct intel_batchbuffer *batch = &brw->batch;
- uint32_t offset;
- assert(size < batch->bo->size);
+ assert(size < batch->state_bo->size);
- offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
- /* If allocating from the top would wrap below the batchbuffer, or
- * if the batch's used space (plus the reserved pad) collides with our
- * space, then flush and try again.
- */
- if (batch->state_batch_offset < size ||
- offset < 4 * USED_BATCH(*batch) + batch->reserved_space) {
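+ /* State now grows upward from the start of its own buffer instead of
+ * downward from the end of the batchbuffer.
+ */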
+ uint32_t offset = ALIGN(batch->state_used, alignment);
+
+ /* For now, follow the old flushing behavior. */
+ int batch_space = batch->reserved_space + USED_BATCH(*batch) * 4;
+
+ if (offset + size >= STATE_SZ - batch_space) {
intel_batchbuffer_flush(brw);
- offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
+ offset = ALIGN(batch->state_used, alignment);
}
- batch->state_batch_offset = offset;
-
if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
_mesa_hash_table_insert(batch->state_batch_sizes,
(void *) (uintptr_t) offset,
(void *) (uintptr_t) size);
}
+ batch->state_used = offset + size;
+
*out_offset = offset;
- return batch->map + (offset>>2);
+ return batch->state_map + (offset >> 2);
}
void
return fi.d;
}
-/* Inline functions - might actually be better off with these
- * non-inlined. Certainly better off switching all command packets to
- * be passed as structs rather than dwords, but that's a little bit of
- * work...
- */
-static inline unsigned
-intel_batchbuffer_space(struct intel_batchbuffer *batch)
-{
- return (batch->state_batch_offset - batch->reserved_space)
- - USED_BATCH(*batch) * 4;
-}
-
-
static inline void
intel_batchbuffer_emit_dword(struct intel_batchbuffer *batch, GLuint dword)
{
-#ifdef DEBUG
- assert(intel_batchbuffer_space(batch) >= 4);
-#endif
*batch->map_next++ = dword;
assert(batch->ring != UNKNOWN_RING);
}
#endif
}
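+/* Check whether a CPU pointer falls inside the statebuffer mapping, so
+ * callers can route relocations to state_relocs rather than batch_relocs.
+ */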
+static inline bool
+brw_ptr_in_state_buffer(struct intel_batchbuffer *batch, void *p)
+{
+ return (char *) p >= (char *) batch->state_map &&
+ (char *) p < (char *) batch->state_map + batch->state_bo->size;
+}
+
#define BEGIN_BATCH(n) do { \
intel_batchbuffer_begin(brw, (n), RENDER_RING); \
uint32_t *__map = brw->batch.map_next; \