X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_fs_generator.cpp;h=74c6cd3047411a7f9a113622e1acb0063e16f954;hb=90b6745bc80cf6dabb8f736dbf12d47c2a6602f5;hp=b50f03142b54456571c5fb4906847da48eeddf34;hpb=c76f2292b556502018ecc591f3388516c8ded469;p=mesa.git diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index b50f03142b5..74c6cd30474 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -224,25 +224,27 @@ public: bool fs_generator::patch_discard_jumps_to_fb_writes() { - if (devinfo->gen < 6 || this->discard_halt_patches.is_empty()) + if (this->discard_halt_patches.is_empty()) return false; int scale = brw_jump_scale(p->devinfo); - /* There is a somewhat strange undocumented requirement of using - * HALT, according to the simulator. If some channel has HALTed to - * a particular UIP, then by the end of the program, every channel - * must have HALTed to that UIP. Furthermore, the tracking is a - * stack, so you can't do the final halt of a UIP after starting - * halting to a new UIP. - * - * Symptoms of not emitting this instruction on actual hardware - * included GPU hangs and sparkly rendering on the piglit discard - * tests. - */ - brw_inst *last_halt = gen6_HALT(p); - brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); - brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); + if (devinfo->gen >= 6) { + /* There is a somewhat strange undocumented requirement of using + * HALT, according to the simulator. If some channel has HALTed to + * a particular UIP, then by the end of the program, every channel + * must have HALTed to that UIP. Furthermore, the tracking is a + * stack, so you can't do the final halt of a UIP after starting + * halting to a new UIP. + * + * Symptoms of not emitting this instruction on actual hardware + * included GPU hangs and sparkly rendering on the piglit discard + * tests. + */ + brw_inst *last_halt = brw_HALT(p); + brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); + brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); + } int ip = p->nr_insn; @@ -250,11 +252,67 @@ fs_generator::patch_discard_jumps_to_fb_writes() brw_inst *patch = &p->store[patch_ip->ip]; assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT); - /* HALT takes a half-instruction distance from the pre-incremented IP. */ - brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); + if (devinfo->gen >= 6) { + /* HALT takes a half-instruction distance from the pre-incremented IP. */ + brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); + } else { + brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale)); + } } this->discard_halt_patches.make_empty(); + + if (devinfo->gen < 6) { + /* From the g965 PRM: + * + * "As DMask is not automatically reloaded into AMask upon completion + * of this instruction, software has to manually restore AMask upon + * completion." + * + * DMask lives in the bottom 16 bits of sr0.1. + */ + brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK), + retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW)); + brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1); + brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE); + brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE); + brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH); + } + + if (devinfo->gen == 4 && !devinfo->is_g4x) { + /* From the g965 PRM: + * + * "[DevBW, DevCL] Erratum: The subfields in mask stack register are + * reset to zero during graphics reset, however, they are not + * initialized at thread dispatch. These subfields will retain the + * values from the previous thread. Software should make sure the + * mask stack is empty (reset to zero) before terminating the thread. + * In case that this is not practical, software may have to reset the + * mask stack at the beginning of each kernel, which will impact the + * performance." + * + * Luckily we can rely on: + * + * "[DevBW, DevCL] This register access restriction is not + * applicable, hardware does ensure execution pipeline coherency, + * when a mask stack register is used as an explicit source and/or + * destination." + */ + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + + brw_set_default_exec_size(p, BRW_EXECUTE_2); + brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0)); + + brw_set_default_exec_size(p, BRW_EXECUTE_16); + /* Reset the if stack. */ + brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW), + brw_imm_uw(0)); + + brw_pop_insn_state(p); + } + return true; } @@ -410,7 +468,15 @@ fs_generator::generate_mov_indirect(fs_inst *inst, reg.nr = imm_byte_offset / REG_SIZE; reg.subnr = imm_byte_offset % REG_SIZE; - brw_MOV(p, dst, reg); + if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) { + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), + subscript(reg, BRW_REGISTER_TYPE_D, 0)); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), + subscript(reg, BRW_REGISTER_TYPE_D, 1)); + } else { + brw_MOV(p, dst, reg); + } } else { /* Prior to Broadwell, there are only 8 address registers. */ assert(inst->exec_size <= 8 || devinfo->gen >= 8); @@ -587,7 +653,7 @@ fs_generator::generate_shuffle(fs_inst *inst, /* Take into account the component size and horizontal stride. */ assert(src.vstride == src.hstride + src.width); brw_SHL(p, addr, group_idx, - brw_imm_uw(_mesa_logbase2(type_sz(src.type)) + + brw_imm_uw(util_logbase2(type_sz(src.type)) + src.hstride - 1)); /* Add on the register start offset */ @@ -1362,14 +1428,12 @@ fs_generator::generate_ddy(const fs_inst *inst, void fs_generator::generate_discard_jump(fs_inst *) { - assert(devinfo->gen >= 6); - /* This HALT will be patched up at FB write time to point UIP at the end of * the program, and at brw_uip_jip() JIP will be set to the end of the * current block (or the program). */ this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn)); - gen6_HALT(p); + brw_HALT(p); } void @@ -1715,11 +1779,11 @@ fs_generator::enable_debug(const char *shader_name) int fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, struct shader_stats shader_stats, + const brw::performance &perf, struct brw_compile_stats *stats) { /* align to 64 byte boundary. */ - while (p->next_insn_offset % 64) - brw_NOP(p); + brw_realign(p, 64); this->dispatch_width = dispatch_width; @@ -2216,25 +2280,50 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, generate_shader_time_add(inst, src[0], src[1], src[2]); break; + case SHADER_OPCODE_INTERLOCK: case SHADER_OPCODE_MEMORY_FENCE: { assert(src[1].file == BRW_IMMEDIATE_VALUE); assert(src[2].file == BRW_IMMEDIATE_VALUE); - const unsigned sends = - brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud, - src[2].ud); - send_count += sends; + + const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ? + BRW_OPCODE_SENDC : BRW_OPCODE_SEND; + + brw_memory_fence(p, dst, src[0], send_op, + brw_message_target(inst->sfid), + /* commit_enable */ src[1].ud, + /* bti */ src[2].ud); + send_count++; break; } case FS_OPCODE_SCHEDULING_FENCE: - if (unlikely(debug_flag)) - disasm_info->use_tail = true; - break; + if (inst->sources == 0 && inst->sched.regdist == 0 && + inst->sched.mode == TGL_SBID_NULL) { + if (unlikely(debug_flag)) + disasm_info->use_tail = true; + break; + } + + if (devinfo->gen >= 12) { + /* Use the available SWSB information to stall. A single SYNC is + * sufficient since if there were multiple dependencies, the + * scoreboard algorithm already injected other SYNCs before this + * instruction. + */ + brw_SYNC(p, TGL_SYNC_NOP); + } else { + for (unsigned i = 0; i < inst->sources; i++) { + /* Emit a MOV to force a stall until the instruction producing the + * registers finishes. + */ + brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), + retype(src[i], BRW_REGISTER_TYPE_UW)); + } + + if (inst->sources > 1) + multiple_instructions_emitted = true; + } - case SHADER_OPCODE_INTERLOCK: - assert(devinfo->gen >= 9); - /* The interlock is basically a memory fence issued via sendc */ - brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0); break; case SHADER_OPCODE_FIND_LIVE_CHANNEL: { @@ -2462,7 +2551,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, "Compacted %d to %d bytes (%.0f%%)\n", shader_name, sha1buf, dispatch_width, before_size / 16, - loop_count, cfg->cycle_count, + loop_count, perf.latency, spill_count, fill_count, send_count, shader_stats.scheduler_mode, shader_stats.promoted_constants, @@ -2471,7 +2560,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, /* overriding the shader makes disasm_info invalid */ if (!brw_try_override_assembly(p, start_offset, sha1buf)) { - dump_assembly(p->store, disasm_info); + dump_assembly(p->store, start_offset, p->next_insn_offset, + disasm_info, perf.block_latency); } else { fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf); } @@ -2487,7 +2577,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, "compacted %d to %d bytes.", _mesa_shader_stage_to_abbrev(stage), dispatch_width, before_size / 16 - nop_count, - loop_count, cfg->cycle_count, + loop_count, perf.latency, spill_count, fill_count, send_count, shader_stats.scheduler_mode, shader_stats.promoted_constants, @@ -2497,7 +2587,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, stats->instructions = before_size / 16 - nop_count; stats->sends = send_count; stats->loops = loop_count; - stats->cycles = cfg->cycle_count; + stats->cycles = perf.latency; stats->spills = spill_count; stats->fills = fill_count; } @@ -2505,6 +2595,16 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, return start_offset; } +void +fs_generator::add_const_data(void *data, unsigned size) +{ + assert(prog_data->const_data_size == 0); + if (size > 0) { + prog_data->const_data_size = size; + prog_data->const_data_offset = brw_append_data(p, data, size, 32); + } +} + const unsigned * fs_generator::get_assembly() {