From: Eric Anholt Date: Fri, 15 Apr 2011 02:36:28 +0000 (-0700) Subject: i965/fs: Add gen6 register spilling support. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=59c6b775a6aacfe03c84dae62c2fd45d4af9d70b;p=mesa.git i965/fs: Add gen6 register spilling support. Most of this is code movement to get the scratch space allocated in a shared location. Other than that, the only real changes are that the old oword block messages now operate on oword-aligned areas (with new messages for unaligned access, which we don't do), and that the caching control is in the SFID part of the descriptor instead of message control. Fixes glsl-fs-convolution-1. --- diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 57313a59c08..2d654e71432 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -496,6 +496,8 @@ static void brw_set_dp_write_message( struct brw_context *brw, insn->bits3.dp_render_cache.response_length = response_length; insn->bits3.dp_render_cache.msg_length = msg_length; insn->bits3.dp_render_cache.end_of_thread = end_of_thread; + + /* We always use the render cache for write messages */ insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_WRITE; /* XXX really need below? */ insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE; @@ -539,6 +541,13 @@ brw_set_dp_read_message(struct brw_context *brw, brw_set_src1(insn, brw_imm_d(0)); if (intel->gen >= 6) { + uint32_t target_function; + + if (target_cache == BRW_DATAPORT_READ_TARGET_DATA_CACHE) + target_function = BRW_MESSAGE_TARGET_DATAPORT_READ; /* data cache */ + else + target_function = BRW_MESSAGE_TARGET_DATAPORT_WRITE; /* render cache */ + insn->bits3.dp_render_cache.binding_table_index = binding_table_index; insn->bits3.dp_render_cache.msg_control = msg_control; insn->bits3.dp_render_cache.pixel_scoreboard_clear = 0; @@ -548,9 +557,9 @@ brw_set_dp_read_message(struct brw_context *brw, insn->bits3.dp_render_cache.response_length = response_length; insn->bits3.dp_render_cache.msg_length = msg_length; insn->bits3.dp_render_cache.end_of_thread = 0; - insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_READ; + insn->header.destreg__conditionalmod = target_function; /* XXX really need below? */ - insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ; + insn->bits2.send_gen5.sfid = target_function; insn->bits2.send_gen5.end_of_thread = 0; } else if (intel->gen == 5) { insn->bits3.dp_read_gen5.binding_table_index = binding_table_index; @@ -1486,9 +1495,12 @@ void brw_oword_block_write_scratch(struct brw_compile *p, GLuint offset) { struct intel_context *intel = &p->brw->intel; - uint32_t msg_control; + uint32_t msg_control, msg_type; int mlen; + if (intel->gen >= 6) + offset /= 16; + mrf = retype(mrf, BRW_REGISTER_TYPE_UD); if (num_regs == 1) { @@ -1554,13 +1566,22 @@ void brw_oword_block_write_scratch(struct brw_compile *p, } brw_set_dest(p, insn, dest); - brw_set_src0(insn, brw_null_reg()); + if (intel->gen >= 6) { + brw_set_src0(insn, mrf); + } else { + brw_set_src0(insn, brw_null_reg()); + } + + if (intel->gen >= 6) + msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE; + else + msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE; brw_set_dp_write_message(p->brw, insn, 255, /* binding table index (255=stateless) */ msg_control, - BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */ + msg_type, mlen, GL_TRUE, /* header_present */ 0, /* pixel scoreboard */ @@ -1585,9 +1606,13 @@ brw_oword_block_read_scratch(struct brw_compile *p, int num_regs, GLuint offset) { + struct intel_context *intel = &p->brw->intel; uint32_t msg_control; int rlen; + if (intel->gen >= 6) + offset /= 16; + mrf = retype(mrf, BRW_REGISTER_TYPE_UD); dest = retype(dest, BRW_REGISTER_TYPE_UW); @@ -1624,14 +1649,18 @@ brw_oword_block_read_scratch(struct brw_compile *p, insn->header.destreg__conditionalmod = mrf.nr; brw_set_dest(p, insn, dest); /* UW? */ - brw_set_src0(insn, brw_null_reg()); + if (intel->gen >= 6) { + brw_set_src0(insn, mrf); + } else { + brw_set_src0(insn, brw_null_reg()); + } brw_set_dp_read_message(p->brw, insn, 255, /* binding table index (255=stateless) */ msg_control, BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ - 1, /* target cache (render/scratch) */ + BRW_DATAPORT_READ_TARGET_RENDER_CACHE, 1, /* msg_length */ rlen); } @@ -1839,7 +1868,7 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p, bind_table_index, BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, msg_type, - 0, /* source cache = data cache */ + BRW_DATAPORT_READ_TARGET_DATA_CACHE, 2, /* msg_length */ 1); /* response_length */ } diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 479a91436a7..67f29ce1816 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -228,8 +228,6 @@ fs_visitor::assign_regs() if (reg == -1) { fail("no register to spill\n"); - } else if (intel->gen >= 6) { - fail("no spilling support on gen6 yet\n"); } else { spill_reg(reg); } diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 2dd28fd1c58..ab731a807a7 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -185,6 +185,7 @@ static void do_wm_prog( struct brw_context *brw, struct brw_fragment_program *fp, struct brw_wm_prog_key *key) { + struct intel_context *intel = &brw->intel; struct brw_wm_compile *c; const GLuint *program; GLuint program_size; @@ -238,12 +239,26 @@ static void do_wm_prog( struct brw_context *brw, /* Scratch space is used for register spilling */ if (c->last_scratch) { + uint32_t total_scratch; + /* Per-thread scratch space is power-of-two sized. */ for (c->prog_data.total_scratch = 1024; c->prog_data.total_scratch <= c->last_scratch; c->prog_data.total_scratch *= 2) { /* empty */ } + total_scratch = c->prog_data.total_scratch * brw->wm_max_threads; + + if (brw->wm.scratch_bo && total_scratch > brw->wm.scratch_bo->size) { + drm_intel_bo_unreference(brw->wm.scratch_bo); + brw->wm.scratch_bo = NULL; + } + if (brw->wm.scratch_bo == NULL) { + brw->wm.scratch_bo = drm_intel_bo_alloc(intel->bufmgr, + "wm scratch", + total_scratch, + 4096); + } } else { c->prog_data.total_scratch = 0; diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index 5b5afc4626b..be4b260a5ff 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -278,30 +278,10 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, static void upload_wm_unit( struct brw_context *brw ) { - struct intel_context *intel = &brw->intel; struct brw_wm_unit_key key; drm_intel_bo *reloc_bufs[3]; wm_unit_populate_key(brw, &key); - /* Allocate the necessary scratch space if we haven't already. Don't - * bother reducing the allocation later, since we use scratch so - * rarely. - */ - if (key.total_scratch) { - GLuint total = key.total_scratch * brw->wm_max_threads; - - if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) { - drm_intel_bo_unreference(brw->wm.scratch_bo); - brw->wm.scratch_bo = NULL; - } - if (brw->wm.scratch_bo == NULL) { - brw->wm.scratch_bo = drm_intel_bo_alloc(intel->bufmgr, - "wm scratch", - total, - 4096); - } - } - reloc_bufs[0] = brw->wm.prog_bo; reloc_bufs[1] = brw->wm.scratch_bo; reloc_bufs[2] = brw->wm.sampler_bo; diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index f4f04750aeb..8215cb15a9c 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -184,7 +184,12 @@ upload_wm_state(struct brw_context *brw) OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2)); OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); OUT_BATCH(dw2); - OUT_BATCH(0); /* scratch space base offset */ + if (brw->wm.prog_data->total_scratch) { + OUT_RELOC(brw->wm.scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, + ffs(brw->wm.prog_data->total_scratch) - 11); + } else { + OUT_BATCH(0); + } OUT_BATCH(dw4); OUT_BATCH(dw5); OUT_BATCH(dw6);