X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_fs_generator.cpp;h=74c6cd3047411a7f9a113622e1acb0063e16f954;hp=f05468e73550099e7ebcaa42857a752b478f246b;hb=90b6745bc80cf6dabb8f736dbf12d47c2a6602f5;hpb=6634ede7aac30ac8d21b9acc9a67010927ec93eb diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index f05468e7355..74c6cd30474 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -186,14 +186,12 @@ brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst, fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, struct brw_stage_prog_data *prog_data, - struct shader_stats shader_stats, bool runtime_check_aads_emit, gl_shader_stage stage) : compiler(compiler), log_data(log_data), devinfo(compiler->devinfo), prog_data(prog_data), - shader_stats(shader_stats), runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false), stage(stage), mem_ctx(mem_ctx) { @@ -226,25 +224,27 @@ public: bool fs_generator::patch_discard_jumps_to_fb_writes() { - if (devinfo->gen < 6 || this->discard_halt_patches.is_empty()) + if (this->discard_halt_patches.is_empty()) return false; int scale = brw_jump_scale(p->devinfo); - /* There is a somewhat strange undocumented requirement of using - * HALT, according to the simulator. If some channel has HALTed to - * a particular UIP, then by the end of the program, every channel - * must have HALTed to that UIP. Furthermore, the tracking is a - * stack, so you can't do the final halt of a UIP after starting - * halting to a new UIP. - * - * Symptoms of not emitting this instruction on actual hardware - * included GPU hangs and sparkly rendering on the piglit discard - * tests. - */ - brw_inst *last_halt = gen6_HALT(p); - brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); - brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); + if (devinfo->gen >= 6) { + /* There is a somewhat strange undocumented requirement of using + * HALT, according to the simulator. If some channel has HALTed to + * a particular UIP, then by the end of the program, every channel + * must have HALTed to that UIP. Furthermore, the tracking is a + * stack, so you can't do the final halt of a UIP after starting + * halting to a new UIP. + * + * Symptoms of not emitting this instruction on actual hardware + * included GPU hangs and sparkly rendering on the piglit discard + * tests. + */ + brw_inst *last_halt = brw_HALT(p); + brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); + brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); + } int ip = p->nr_insn; @@ -252,11 +252,67 @@ fs_generator::patch_discard_jumps_to_fb_writes() brw_inst *patch = &p->store[patch_ip->ip]; assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT); - /* HALT takes a half-instruction distance from the pre-incremented IP. */ - brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); + if (devinfo->gen >= 6) { + /* HALT takes a half-instruction distance from the pre-incremented IP. */ + brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); + } else { + brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale)); + } } this->discard_halt_patches.make_empty(); + + if (devinfo->gen < 6) { + /* From the g965 PRM: + * + * "As DMask is not automatically reloaded into AMask upon completion + * of this instruction, software has to manually restore AMask upon + * completion." 
+ * + * DMask lives in the bottom 16 bits of sr0.1. + */ + brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK), + retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW)); + brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1); + brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE); + brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE); + brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH); + } + + if (devinfo->gen == 4 && !devinfo->is_g4x) { + /* From the g965 PRM: + * + * "[DevBW, DevCL] Erratum: The subfields in mask stack register are + * reset to zero during graphics reset, however, they are not + * initialized at thread dispatch. These subfields will retain the + * values from the previous thread. Software should make sure the + * mask stack is empty (reset to zero) before terminating the thread. + * In case that this is not practical, software may have to reset the + * mask stack at the beginning of each kernel, which will impact the + * performance." + * + * Luckily we can rely on: + * + * "[DevBW, DevCL] This register access restriction is not + * applicable, hardware does ensure execution pipeline coherency, + * when a mask stack register is used as an explicit source and/or + * destination." + */ + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + + brw_set_default_exec_size(p, BRW_EXECUTE_2); + brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0)); + + brw_set_default_exec_size(p, BRW_EXECUTE_16); + /* Reset the if stack. */ + brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW), + brw_imm_uw(0)); + + brw_pop_insn_state(p); + } + return true; } @@ -285,7 +341,8 @@ fs_generator::generate_send(fs_inst *inst, desc, desc_imm, ex_desc, ex_desc_imm, inst->eot); if (inst->check_tdr) - brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDSC); + brw_inst_set_opcode(p->devinfo, brw_last_inst, + devinfo->gen >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC); } else { brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm, inst->eot); @@ -411,7 +468,15 @@ fs_generator::generate_mov_indirect(fs_inst *inst, reg.nr = imm_byte_offset / REG_SIZE; reg.subnr = imm_byte_offset % REG_SIZE; - brw_MOV(p, dst, reg); + if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) { + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), + subscript(reg, BRW_REGISTER_TYPE_D, 0)); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), + subscript(reg, BRW_REGISTER_TYPE_D, 1)); + } else { + brw_MOV(p, dst, reg); + } } else { /* Prior to Broadwell, there are only 8 address registers. */ assert(inst->exec_size <= 8 || devinfo->gen >= 8); @@ -419,6 +484,13 @@ fs_generator::generate_mov_indirect(fs_inst *inst, /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ struct brw_reg addr = vec8(brw_address_reg(0)); + /* Whether we can use destination dependency control without running the + * risk of a hang if an instruction gets shot down. + */ + const bool use_dep_ctrl = !inst->predicate && + inst->exec_size == dispatch_width; + brw_inst *insn; + /* The destination stride of an instruction (in bytes) must be greater * than or equal to the size of the rest of the instruction. Since the * address register is of type UW, we can't use a D-type instruction. 
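/* Editor's sketch, not part of the patch: the hunk above splits 64-bit moves
 * into two dword moves when the platform lacks native 64-bit float support
 * (!devinfo->has_64bit_float).  A minimal illustration of that idiom,
 * assuming only the Mesa helpers visible in this hunk (brw_MOV, subscript,
 * brw_set_default_swsb, tgl_swsb_null); the function name and parameters
 * here are hypothetical.
 */
static void
emit_mov64_as_dword_pair(struct brw_codegen *p,
                         struct brw_reg dst, struct brw_reg src)
{
   /* Copy the low dword of every 64-bit channel. */
   brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
           subscript(src, BRW_REGISTER_TYPE_D, 0));

   /* The SWSB annotation inherited from the logical instruction must be
    * applied only once, so the default is reset to the null token before
    * emitting the second half (a no-op prior to Gen12).
    */
   brw_set_default_swsb(p, tgl_swsb_null());

   /* Copy the high dword of every 64-bit channel. */
   brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
           subscript(src, BRW_REGISTER_TYPE_D, 1));
}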
@@ -451,13 +523,34 @@ fs_generator::generate_mov_indirect(fs_inst *inst, * In the end, while base_offset is nice to look at in the generated * code, using it saves us 0 instructions and would require quite a bit * of case-by-case work. It's just not worth it. + * + * Due to a hardware bug some platforms (particularly Gen11+) seem to + * require the address components of all channels to be valid whether or + * not they're active, which causes issues if we use VxH addressing + * under non-uniform control-flow. We can easily work around that by + * initializing the whole address register with a pipelined NoMask MOV + * instruction. */ - brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); + if (devinfo->gen >= 7) { + insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset)); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); + brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); + if (devinfo->gen >= 12) + brw_set_default_swsb(p, tgl_swsb_null()); + else + brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl); + } + + insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); + if (devinfo->gen >= 12) + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + else if (devinfo->gen >= 7) + brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl); if (type_sz(reg.type) > 4 && ((devinfo->gen == 7 && !devinfo->is_haswell) || devinfo->is_cherryview || gen_device_info_is_9lp(devinfo) || - !devinfo->has_64bit_types)) { + !devinfo->has_64bit_float)) { /* IVB has an issue (which we found empirically) where it reads two * address register components per channel for indirectly addressed * 64-bit sources. @@ -475,6 +568,7 @@ fs_generator::generate_mov_indirect(fs_inst *inst, */ brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D)); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D)); } else { @@ -559,10 +653,11 @@ fs_generator::generate_shuffle(fs_inst *inst, /* Take into account the component size and horizontal stride. 
*/ assert(src.vstride == src.hstride + src.width); brw_SHL(p, addr, group_idx, - brw_imm_uw(_mesa_logbase2(type_sz(src.type)) + + brw_imm_uw(util_logbase2(type_sz(src.type)) + src.hstride - 1)); /* Add on the register start offset */ + brw_set_default_swsb(p, tgl_swsb_regdist(1)); brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr)); if (type_sz(src.type) > 4 && @@ -590,6 +685,7 @@ fs_generator::generate_shuffle(fs_inst *inst, assert(dst.hstride == 1); brw_MOV(p, dst_d, retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D)); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, byte_offset(dst_d, 4), retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D)); } else { @@ -597,6 +693,8 @@ fs_generator::generate_shuffle(fs_inst *inst, retype(brw_VxH_indirect(0, 0), src.type)); } } + + brw_set_default_swsb(p, tgl_swsb_null()); } } @@ -657,8 +755,12 @@ fs_generator::generate_quad_swizzle(const fs_inst *inst, 4 * inst->dst.stride, 1, 4 * inst->dst.stride), stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0)); - brw_inst_set_no_dd_clear(devinfo, insn, c < 3); - brw_inst_set_no_dd_check(devinfo, insn, c > 0); + if (devinfo->gen < 12) { + brw_inst_set_no_dd_clear(devinfo, insn, c < 3); + brw_inst_set_no_dd_check(devinfo, insn, c > 0); + } + + brw_set_default_swsb(p, tgl_swsb_null()); } break; @@ -757,13 +859,16 @@ fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) brw_inst_set_header_present(devinfo, insn, false); brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */ - brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ - /* Note that even though the thread has a URB resource associated with it, - * we set the "do not dereference URB" bit, because the URB resource is - * managed by the fixed-function unit, so it will free it automatically. - */ - brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ + if (devinfo->gen < 11) { + brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ + + /* Note that even though the thread has a URB resource associated with it, + * we set the "do not dereference URB" bit, because the URB resource is + * managed by the fixed-function unit, so it will free it automatically. + */ + brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ + } brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); } @@ -772,7 +877,12 @@ void fs_generator::generate_barrier(fs_inst *, struct brw_reg src) { brw_barrier(p, src); - brw_WAIT(p); + if (devinfo->gen >= 12) { + brw_set_default_swsb(p, tgl_swsb_null()); + brw_SYNC(p, TGL_SYNC_BAR); + } else { + brw_WAIT(p); + } } bool @@ -1106,15 +1216,18 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, /* Set up an implied move from g0 to the MRF. */ src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); } else { + const tgl_swsb swsb = brw_get_default_swsb(p); assert(inst->base_mrf != -1); struct brw_reg header_reg = brw_message_reg(inst->base_mrf); brw_push_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); brw_set_default_exec_size(p, BRW_EXECUTE_8); brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); /* Explicitly set up the message header by copying g0 to the MRF. 
*/ brw_MOV(p, header_reg, brw_vec8_grf(0, 0)); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); brw_set_default_exec_size(p, BRW_EXECUTE_1); if (inst->offset) { @@ -1124,6 +1237,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, } brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); } } @@ -1270,6 +1384,7 @@ fs_generator::generate_ddy(const fs_inst *inst, brw_ADD(p, byte_offset(dst, g * type_size), negate(byte_offset(src, g * type_size)), byte_offset(src, (g + 2) * type_size)); + brw_set_default_swsb(p, tgl_swsb_null()); } brw_pop_insn_state(p); } else { @@ -1313,14 +1428,12 @@ fs_generator::generate_ddy(const fs_inst *inst, void fs_generator::generate_discard_jump(fs_inst *) { - assert(devinfo->gen >= 6); - /* This HALT will be patched up at FB write time to point UIP at the end of * the program, and at brw_uip_jip() JIP will be set to the end of the * current block (or the program). */ this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn)); - gen6_HALT(p); + brw_HALT(p); } void @@ -1334,6 +1447,7 @@ fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src) const unsigned lower_size = inst->force_writemask_all ? inst->exec_size : MIN2(16, inst->exec_size); const unsigned block_size = 4 * lower_size / REG_SIZE; + const tgl_swsb swsb = brw_get_default_swsb(p); assert(inst->mlen != 0); brw_push_insn_state(p); @@ -1343,9 +1457,17 @@ fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src) for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { brw_set_default_group(p, inst->group + lower_size * i); + if (i > 0) { + assert(swsb.mode & TGL_SBID_SET); + brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid)); + } else { + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); + } + brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0), retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD)); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), block_size, inst->offset + block_size * REG_SIZE * i); @@ -1423,12 +1545,14 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, BRW_DATAPORT_READ_TARGET_DATA_CACHE)); } else { + const tgl_swsb swsb = brw_get_default_swsb(p); struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); brw_push_insn_state(p); brw_set_default_mask_control(p, BRW_MASK_DISABLE); /* a0.0 = surf_index & 0xff */ + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1); brw_set_dest(p, insn_and, addr); @@ -1436,6 +1560,7 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); /* dst = send(payload, a0.0 | ) */ + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); brw_send_indirect_message( p, GEN6_SFID_DATAPORT_CONSTANT_CACHE, retype(dst, BRW_REGISTER_TYPE_UD), @@ -1559,6 +1684,7 @@ fs_generator::generate_set_sample_id(fs_inst *inst, brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1); brw_inst_set_group(devinfo, insn, inst->group + lower_size * i); brw_inst_set_compression(devinfo, insn, lower_size > 8); + brw_set_default_swsb(p, tgl_swsb_null()); } } @@ -1593,6 +1719,7 @@ fs_generator::generate_pack_half_2x16_split(fs_inst *, /* Now the form: * 0xhhhh0000 */ + brw_set_default_swsb(p, tgl_swsb_regdist(1)); brw_SHL(p, dst, dst, brw_imm_ud(16u)); /* And, finally the form of packHalf2x16's output: @@ 
-1607,9 +1734,12 @@ fs_generator::generate_shader_time_add(fs_inst *, struct brw_reg offset, struct brw_reg value) { + const tgl_swsb swsb = brw_get_default_swsb(p); + assert(devinfo->gen >= 7); brw_push_insn_state(p); brw_set_default_mask_control(p, true); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); assert(payload.file == BRW_GENERAL_REGISTER_FILE); struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0), @@ -1631,7 +1761,9 @@ fs_generator::generate_shader_time_add(fs_inst *, * out of this path, so we just emit the MOVs from here. */ brw_MOV(p, payload_offset, offset); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, payload_value, value); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); brw_shader_time_add(p, payload, prog_data->binding_table.shader_time_start); brw_pop_insn_state(p); @@ -1646,17 +1778,26 @@ fs_generator::enable_debug(const char *shader_name) int fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, + struct shader_stats shader_stats, + const brw::performance &perf, struct brw_compile_stats *stats) { /* align to 64 byte boundary. */ - while (p->next_insn_offset % 64) - brw_NOP(p); + brw_realign(p, 64); this->dispatch_width = dispatch_width; int start_offset = p->next_insn_offset; + + /* `send_count` explicitly does not include spills or fills, as we'd + * like to use it as a metric for intentional memory access or other + * shared function use. Otherwise, subtle changes to scheduling or + * register allocation could cause it to fluctuate wildly - and that + * effect is already counted in spill/fill counts. + */ int spill_count = 0, fill_count = 0; - int loop_count = 0; + int loop_count = 0, send_count = 0, nop_count = 0; + bool is_accum_used = false; struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg); @@ -1685,6 +1826,29 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, inst->dst.component_size(inst->exec_size) > REG_SIZE) { brw_NOP(p); last_insn_offset = p->next_insn_offset; + + /* In order to avoid spurious instruction count differences when the + * instruction schedule changes, keep track of the number of inserted + * NOPs. + */ + nop_count++; + } + + /* GEN:BUG:14010017096: + * + * Clear accumulator register before end of thread. 
+ */ + if (inst->eot && is_accum_used && devinfo->gen >= 12) { + brw_set_default_exec_size(p, BRW_EXECUTE_16); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f)); + last_insn_offset = p->next_insn_offset; + } + + if (!is_accum_used && !inst->eot) { + is_accum_used = inst->writes_accumulator_implicitly(devinfo) || + inst->dst.is_accumulator(); } if (unlikely(debug_flag)) @@ -1737,6 +1901,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, brw_set_default_saturate(p, inst->saturate); brw_set_default_mask_control(p, inst->force_writemask_all); brw_set_default_acc_write_control(p, inst->writes_accumulator); + brw_set_default_swsb(p, inst->sched); unsigned exec_size = inst->exec_size; if (devinfo->gen == 7 && !devinfo->is_haswell && @@ -1752,6 +1917,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, assert(inst->mlen <= BRW_MAX_MSG_LENGTH); switch (inst->opcode) { + case BRW_OPCODE_SYNC: + assert(src[0].file == BRW_IMMEDIATE_VALUE); + brw_SYNC(p, tgl_sync_function(src[0].ud)); + break; case BRW_OPCODE_MOV: brw_MOV(p, dst, src[0]); break; @@ -1965,6 +2134,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, brw_math_function(inst->opcode), inst->base_mrf, src[0], BRW_MATH_PRECISION_FULL); + send_count++; } break; case SHADER_OPCODE_INT_QUOTIENT: @@ -1982,6 +2152,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, gen4_math(p, dst, brw_math_function(inst->opcode), inst->base_mrf, src[0], BRW_MATH_PRECISION_FULL); + send_count++; } break; case FS_OPCODE_LINTERP: @@ -2001,10 +2172,20 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, case SHADER_OPCODE_SEND: generate_send(inst, dst, src[0], src[1], src[2], inst->ex_mlen > 0 ? 
src[3] : brw_null_reg()); + if ((inst->desc & 0xff) == BRW_BTI_STATELESS || + (inst->desc & 0xff) == GEN8_BTI_STATELESS_NON_COHERENT) { + if (inst->size_written) + fill_count++; + else + spill_count++; + } else { + send_count++; + } break; case SHADER_OPCODE_GET_BUFFER_SIZE: generate_get_buffer_size(inst, dst, src[0], src[1]); + send_count++; break; case SHADER_OPCODE_TEX: case FS_OPCODE_TXB: @@ -2018,6 +2199,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, case SHADER_OPCODE_SAMPLEINFO: assert(inst->src[0].file == BAD_FILE); generate_tex(inst, dst, src[1], src[2]); + send_count++; break; case FS_OPCODE_DDX_COARSE: @@ -2051,6 +2233,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, case SHADER_OPCODE_URB_READ_SIMD8: case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: generate_urb_read(inst, dst, src[0]); + send_count++; break; case SHADER_OPCODE_URB_WRITE_SIMD8: @@ -2058,29 +2241,35 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: generate_urb_write(inst, src[0]); + send_count++; break; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: assert(inst->force_writemask_all); generate_uniform_pull_constant_load(inst, dst, src[0], src[1]); + send_count++; break; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: assert(inst->force_writemask_all); generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]); + send_count++; break; case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: generate_varying_pull_constant_load_gen4(inst, dst, src[0]); + send_count++; break; case FS_OPCODE_REP_FB_WRITE: case FS_OPCODE_FB_WRITE: generate_fb_write(inst, src[0]); + send_count++; break; case FS_OPCODE_FB_READ: generate_fb_read(inst, dst, src[0]); + send_count++; break; case FS_OPCODE_DISCARD_JUMP: @@ -2091,16 +2280,50 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, generate_shader_time_add(inst, src[0], src[1], src[2]); break; - case SHADER_OPCODE_MEMORY_FENCE: + case SHADER_OPCODE_INTERLOCK: + case SHADER_OPCODE_MEMORY_FENCE: { assert(src[1].file == BRW_IMMEDIATE_VALUE); assert(src[2].file == BRW_IMMEDIATE_VALUE); - brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud, src[2].ud); + + const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ? + BRW_OPCODE_SENDC : BRW_OPCODE_SEND; + + brw_memory_fence(p, dst, src[0], send_op, + brw_message_target(inst->sfid), + /* commit_enable */ src[1].ud, + /* bti */ src[2].ud); + send_count++; break; + } + + case FS_OPCODE_SCHEDULING_FENCE: + if (inst->sources == 0 && inst->sched.regdist == 0 && + inst->sched.mode == TGL_SBID_NULL) { + if (unlikely(debug_flag)) + disasm_info->use_tail = true; + break; + } + + if (devinfo->gen >= 12) { + /* Use the available SWSB information to stall. A single SYNC is + * sufficient since if there were multiple dependencies, the + * scoreboard algorithm already injected other SYNCs before this + * instruction. + */ + brw_SYNC(p, TGL_SYNC_NOP); + } else { + for (unsigned i = 0; i < inst->sources; i++) { + /* Emit a MOV to force a stall until the instruction producing the + * registers finishes. 
+ */ + brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), + retype(src[i], BRW_REGISTER_TYPE_UW)); + } + + if (inst->sources > 1) + multiple_instructions_emitted = true; + } - case SHADER_OPCODE_INTERLOCK: - assert(devinfo->gen >= 9); - /* The interlock is basically a memory fence issued via sendc */ - brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0); break; case SHADER_OPCODE_FIND_LIVE_CHANNEL: { @@ -2112,7 +2335,16 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, brw_find_live_channel(p, dst, mask); break; } - + case FS_OPCODE_LOAD_LIVE_CHANNELS: { + assert(devinfo->gen >= 8); + assert(inst->force_writemask_all && inst->group == 0); + assert(inst->dst.file == BAD_FILE); + brw_set_default_exec_size(p, BRW_EXECUTE_1); + brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg), + BRW_REGISTER_TYPE_UD), + retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); + break; + } case SHADER_OPCODE_BROADCAST: assert(inst->force_writemask_all); brw_broadcast(p, dst, src[0], src[1]); @@ -2127,6 +2359,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_MOV(p, dst, src[1]); brw_set_default_mask_control(p, BRW_MASK_ENABLE); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, dst, src[0]); break; @@ -2176,6 +2409,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, assert(src[0].type == dst.type); brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), subscript(strided, BRW_REGISTER_TYPE_D, 0)); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), subscript(strided, BRW_REGISTER_TYPE_D, 1)); } else { @@ -2206,24 +2440,29 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, case FS_OPCODE_INTERPOLATE_AT_SAMPLE: generate_pixel_interpolator_query(inst, dst, src[0], src[1], GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE); + send_count++; break; case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: generate_pixel_interpolator_query(inst, dst, src[0], src[1], GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET); + send_count++; break; case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: generate_pixel_interpolator_query(inst, dst, src[0], src[1], GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET); + send_count++; break; case CS_OPCODE_CS_TERMINATE: generate_cs_terminate(inst, src[0]); + send_count++; break; case SHADER_OPCODE_BARRIER: generate_barrier(inst, src[0]); + send_count++; break; case BRW_OPCODE_DIM: @@ -2270,8 +2509,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, if (inst->conditional_mod) brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); - brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); - brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); + if (devinfo->gen < 12) { + brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); + brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); + } } } @@ -2304,14 +2545,14 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, fprintf(stderr, "Native code for %s (sha1 %s)\n" "SIMD%d shader: %d instructions. %d loops. %u cycles. " - "%d:%d spills:fills. " + "%d:%d spills:fills, %u sends, " "scheduled with mode %s. " "Promoted %u constants. 
" "Compacted %d to %d bytes (%.0f%%)\n", shader_name, sha1buf, dispatch_width, before_size / 16, - loop_count, cfg->cycle_count, - spill_count, fill_count, + loop_count, perf.latency, + spill_count, fill_count, send_count, shader_stats.scheduler_mode, shader_stats.promoted_constants, before_size, after_size, @@ -2319,7 +2560,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, /* overriding the shader makes disasm_info invalid */ if (!brw_try_override_assembly(p, start_offset, sha1buf)) { - dump_assembly(p->store, disasm_info); + dump_assembly(p->store, start_offset, p->next_insn_offset, + disasm_info, perf.block_latency); } else { fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf); } @@ -2329,22 +2571,23 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, compiler->shader_debug_log(log_data, "%s SIMD%d shader: %d inst, %d loops, %u cycles, " - "%d:%d spills:fills, " + "%d:%d spills:fills, %u sends, " "scheduled with mode %s, " "Promoted %u constants, " "compacted %d to %d bytes.", _mesa_shader_stage_to_abbrev(stage), - dispatch_width, before_size / 16, - loop_count, cfg->cycle_count, - spill_count, fill_count, + dispatch_width, before_size / 16 - nop_count, + loop_count, perf.latency, + spill_count, fill_count, send_count, shader_stats.scheduler_mode, shader_stats.promoted_constants, before_size, after_size); if (stats) { stats->dispatch_width = dispatch_width; - stats->instructions = before_size / 16; + stats->instructions = before_size / 16 - nop_count; + stats->sends = send_count; stats->loops = loop_count; - stats->cycles = cfg->cycle_count; + stats->cycles = perf.latency; stats->spills = spill_count; stats->fills = fill_count; } @@ -2352,6 +2595,16 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, return start_offset; } +void +fs_generator::add_const_data(void *data, unsigned size) +{ + assert(prog_data->const_data_size == 0); + if (size > 0) { + prog_data->const_data_size = size; + prog_data->const_data_offset = brw_append_data(p, data, size, 32); + } +} + const unsigned * fs_generator::get_assembly() {