From f50645d05c6dffa6463856ded0b8461ac9d24535 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Tue, 15 Sep 2015 16:00:26 +0200 Subject: [PATCH] i965: Turn BRW_MAX_MRF into a macro that accepts a hardware generation There are some bug reports about shaders failing to compile in gen6 because MRF 14 is used when we need to spill. For example: https://bugs.freedesktop.org/show_bug.cgi?id=86469 https://bugs.freedesktop.org/show_bug.cgi?id=90631 Discussion in bugzilla pointed to the fact that gen6 might actually have 24 MRF registers available instead of 16, so we could use other MRF registers and avoid these conflicts (we still need to investigate why some shaders need up to MRF 14 anyway, since this is not expected). Notice that the hardware docs are not clear about this fact: SNB PRM Vol4 Part2's "Table 5-4. MRF Registers Available in Device Hardware" says "Number per Thread" - "24 registers" However, SNB PRM Vol4 Part1, 1.6.1 Message Register File (MRF) says: "Normal threads should construct their messages in m1..m15. (...) Regardless of actual hardware implementation, the thread should not assume th at MRF addresses above m15 wrap to legal MRF registers." Therefore experimentation was necessary to evaluate if we had these extra MRF registers available or not. This was tested in gen6 using MRF registers 21..23 for spilling and doing a full piglit run (all.py) forcing spilling of everything on the FS backend. It was also tested by doing spilling of everything on both the FS and the VS backends with a piglit run of shader.py. In both cases no regressions were observed. In fact, many of these tests where helped in the cases where we forced spilling, since that triggered the same underlying problem described in the bug reports. Here are some results using INTEL_DEBUG=spill_fs,spill_vec4 for a shader.py run on gen6 hardware: Using MRFs 13..15 for spilling: crash: 2, fail: 113, pass: 6621, skip: 5461 Using MRFs 21..23 for spilling: crash: 2, fail: 12, pass: 6722, skip: 5461 This patch sets the ground for later patches to implement spilling using MRF registers 21..23 in gen6. Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_eu_emit.c | 6 +++--- src/mesa/drivers/dri/i965/brw_fs.cpp | 4 ++-- src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 12 ++++++------ .../drivers/dri/i965/brw_fs_reg_allocate.cpp | 16 ++++++++-------- src/mesa/drivers/dri/i965/brw_ir_vec4.h | 2 +- src/mesa/drivers/dri/i965/brw_reg.h | 2 +- .../dri/i965/brw_schedule_instructions.cpp | 4 ++-- src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 10 +++++----- 8 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 23a120ea72d..6a4e316b43c 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -147,7 +147,7 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest) const struct brw_device_info *devinfo = p->devinfo; if (dest.file == BRW_MESSAGE_REGISTER_FILE) - assert(dest.nr < BRW_MAX_MRF); + assert(dest.nr < BRW_MAX_MRF(devinfo->gen)); else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE) assert(dest.nr < 128); @@ -311,7 +311,7 @@ brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) const struct brw_device_info *devinfo = p->devinfo; if (reg.file == BRW_MESSAGE_REGISTER_FILE) - assert(reg.nr < BRW_MAX_MRF); + assert(reg.nr < BRW_MAX_MRF(devinfo->gen)); else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE) assert(reg.nr < 128); @@ -2485,7 +2485,7 @@ void brw_urb_WRITE(struct brw_codegen *p, insn = next_insn(p, BRW_OPCODE_SEND); - assert(msg_length < BRW_MAX_MRF); + assert(msg_length < BRW_MAX_MRF(devinfo->gen)); brw_set_dest(p, insn, dest); brw_set_src0(p, insn, src0); diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index b4d05674260..225a3122c79 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2795,7 +2795,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block, { int write_len = inst->regs_written; int first_write_grf = inst->dst.reg; - bool needs_dep[BRW_MAX_MRF]; + bool needs_dep[BRW_MAX_MRF(devinfo->gen)]; assert(write_len < (int)sizeof(needs_dep) - 1); memset(needs_dep, false, sizeof(needs_dep)); @@ -2866,7 +2866,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins { int write_len = inst->regs_written; int first_write_grf = inst->dst.reg; - bool needs_dep[BRW_MAX_MRF]; + bool needs_dep[BRW_MAX_MRF(devinfo->gen)]; assert(write_len < (int)sizeof(needs_dep) - 1); memset(needs_dep, false, sizeof(needs_dep)); diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index b974e9c4626..c65084d0608 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -48,13 +48,13 @@ static uint32_t brw_file_from_reg(fs_reg *reg) } static struct brw_reg -brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg) +brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen) { struct brw_reg brw_reg; switch (reg->file) { case MRF: - assert((reg->reg & ~(1 << 7)) < BRW_MAX_MRF); + assert((reg->reg & ~(1 << 7)) < BRW_MAX_MRF(gen)); /* Fallthrough */ case GRF: if (reg->stride == 0) { @@ -420,7 +420,7 @@ fs_generator::generate_blorp_fb_write(fs_inst *inst) brw_fb_WRITE(p, 16 /* dispatch_width */, brw_message_reg(inst->base_mrf), - brw_reg_from_fs_reg(inst, &inst->src[0]), + brw_reg_from_fs_reg(inst, &inst->src[0], devinfo->gen), BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, inst->target, inst->mlen, @@ -1538,7 +1538,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset); for (unsigned int i = 0; i < inst->sources; i++) { - src[i] = brw_reg_from_fs_reg(inst, &inst->src[i]); + src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen); /* The accumulator result appears to get used for the * conditional modifier generation. When negating a UD @@ -1550,7 +1550,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) inst->src[i].type != BRW_REGISTER_TYPE_UD || !inst->src[i].negate); } - dst = brw_reg_from_fs_reg(inst, &inst->dst); + dst = brw_reg_from_fs_reg(inst, &inst->dst, devinfo->gen); brw_set_default_predicate_control(p, inst->predicate); brw_set_default_predicate_inverse(p, inst->predicate_inverse); @@ -1560,7 +1560,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) brw_set_default_acc_write_control(p, inst->writes_accumulator); brw_set_default_exec_size(p, cvt(inst->exec_size) - 1); - assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF); + assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen)); assert(inst->mlen <= BRW_MAX_MSG_LENGTH); switch (inst->exec_size) { diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 570b4fedffe..21fb3de104a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -478,7 +478,7 @@ get_used_mrfs(fs_visitor *v, bool *mrf_used) { int reg_width = v->dispatch_width / 8; - memset(mrf_used, 0, BRW_MAX_MRF * sizeof(bool)); + memset(mrf_used, 0, BRW_MAX_MRF(v->devinfo->gen) * sizeof(bool)); foreach_block_and_inst(block, fs_inst, inst, v->cfg) { if (inst->dst.file == MRF) { @@ -509,11 +509,11 @@ static void setup_mrf_hack_interference(fs_visitor *v, struct ra_graph *g, int first_mrf_node, int *first_used_mrf) { - bool mrf_used[BRW_MAX_MRF]; + bool mrf_used[BRW_MAX_MRF(v->devinfo->gen)]; get_used_mrfs(v, mrf_used); - *first_used_mrf = BRW_MAX_MRF; - for (int i = 0; i < BRW_MAX_MRF; i++) { + *first_used_mrf = BRW_MAX_MRF(v->devinfo->gen); + for (int i = 0; i < BRW_MAX_MRF(v->devinfo->gen); i++) { /* Mark each MRF reg node as being allocated to its physical register. * * The alternative would be to have per-physical-register classes, which @@ -593,7 +593,7 @@ fs_visitor::assign_regs(bool allow_spilling) setup_payload_interference(g, payload_node_count, first_payload_node); if (devinfo->gen >= 7) { - int first_used_mrf = BRW_MAX_MRF; + int first_used_mrf = BRW_MAX_MRF(devinfo->gen); setup_mrf_hack_interference(this, g, first_mrf_hack_node, &first_used_mrf); @@ -616,7 +616,7 @@ fs_visitor::assign_regs(bool allow_spilling) * register early enough in the register file that we don't * conflict with any used MRF hack registers. */ - reg -= BRW_MAX_MRF - first_used_mrf; + reg -= BRW_MAX_MRF(devinfo->gen) - first_used_mrf; ra_set_node_reg(g, inst->src[0].reg, reg); break; @@ -853,10 +853,10 @@ fs_visitor::spill_reg(int spill_reg) * SIMD16 mode, because we'd stomp the FB writes. */ if (!spilled_any_registers) { - bool mrf_used[BRW_MAX_MRF]; + bool mrf_used[BRW_MAX_MRF(devinfo->gen)]; get_used_mrfs(this, mrf_used); - for (int i = spill_base_mrf; i < BRW_MAX_MRF; i++) { + for (int i = spill_base_mrf; i < BRW_MAX_MRF(devinfo->gen); i++) { if (mrf_used[i]) { fail("Register spilling not supported with m%d used", i); return; diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h index 966a410a15d..6e8b16139d3 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h @@ -161,7 +161,7 @@ public: const src_reg &src1 = src_reg(), const src_reg &src2 = src_reg()); - struct brw_reg get_dst(void); + struct brw_reg get_dst(unsigned gen); struct brw_reg get_src(const struct brw_vue_prog_data *prog_data, int i); dst_reg dst; diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h index 06d9269f4d8..87e7e011541 100644 --- a/src/mesa/drivers/dri/i965/brw_reg.h +++ b/src/mesa/drivers/dri/i965/brw_reg.h @@ -70,7 +70,7 @@ struct brw_device_info; #define GEN7_MRF_HACK_START 112 /** Number of message register file registers */ -#define BRW_MAX_MRF 16 +#define BRW_MAX_MRF(gen) (gen == 6 ? 24 : 16) #define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6)) #define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3) diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index b49961fff68..4e43e5ccdbd 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -762,7 +762,7 @@ fs_instruction_scheduler::calculate_deps() * GRF registers. */ schedule_node *last_grf_write[grf_count * 16]; - schedule_node *last_mrf_write[BRW_MAX_MRF]; + schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)]; schedule_node *last_conditional_mod[2] = { NULL, NULL }; schedule_node *last_accumulator_write = NULL; /* Fixed HW registers are assumed to be separate from the virtual @@ -1035,7 +1035,7 @@ void vec4_instruction_scheduler::calculate_deps() { schedule_node *last_grf_write[grf_count]; - schedule_node *last_mrf_write[BRW_MAX_MRF]; + schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)]; schedule_node *last_conditional_mod = NULL; schedule_node *last_accumulator_write = NULL; /* Fixed HW registers are assumed to be separate from the virtual diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index 6618275e391..05f20441adb 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -34,7 +34,7 @@ extern "C" { namespace brw { struct brw_reg -vec4_instruction::get_dst(void) +vec4_instruction::get_dst(unsigned gen) { struct brw_reg brw_reg; @@ -46,7 +46,7 @@ vec4_instruction::get_dst(void) break; case MRF: - assert(((dst.reg + dst.reg_offset) & ~(1 << 7)) < BRW_MAX_MRF); + assert(((dst.reg + dst.reg_offset) & ~(1 << 7)) < BRW_MAX_MRF(gen)); brw_reg = brw_message_reg(dst.reg + dst.reg_offset); brw_reg = retype(brw_reg, dst.type); brw_reg.dw1.bits.writemask = dst.writemask; @@ -490,7 +490,7 @@ vec4_generator::generate_gs_urb_write_allocate(vec4_instruction *inst) brw_push_insn_state(p); brw_set_default_access_mode(p, BRW_ALIGN_1); brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, get_element_ud(inst->get_dst(), 0), + brw_MOV(p, get_element_ud(inst->get_dst(devinfo->gen), 0), get_element_ud(inst->get_src(this->prog_data, 0), 0)); brw_set_default_access_mode(p, BRW_ALIGN_16); brw_pop_insn_state(p); @@ -1126,7 +1126,7 @@ vec4_generator::generate_code(const cfg_t *cfg) for (unsigned int i = 0; i < 3; i++) { src[i] = inst->get_src(this->prog_data, i); } - dst = inst->get_dst(); + dst = inst->get_dst(devinfo->gen); brw_set_default_predicate_control(p, inst->predicate); brw_set_default_predicate_inverse(p, inst->predicate_inverse); @@ -1135,7 +1135,7 @@ vec4_generator::generate_code(const cfg_t *cfg) brw_set_default_mask_control(p, inst->force_writemask_all); brw_set_default_acc_write_control(p, inst->writes_accumulator); - assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF); + assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen)); assert(inst->mlen <= BRW_MAX_MSG_LENGTH); unsigned pre_emit_nr_insn = p->nr_insn; -- 2.30.2