X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_fs_reg_allocate.cpp;h=d5c4f032182d4101cf6008ffc34381ff8099d59c;hb=0778748ebadca7991c541ec674024c2bec5f08f5;hp=ec8e116cb384d6c4756b2cb4a9dabc1a27871c2e;hpb=58324389be7bc7c5e10093b9cc0a8efa9b4c93a9;p=mesa.git

diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp
index ec8e116cb38..d5c4f032182 100644
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -548,6 +548,9 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
    int first_mrf_hack_node = node_count;
    if (devinfo->gen >= 7)
       node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START;
+   int grf127_send_hack_node = node_count;
+   if (devinfo->gen >= 8)
+      node_count ++;
    struct ra_graph *g =
       ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);
 
@@ -614,7 +617,9 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
           * highest register that works.
           */
          if (inst->eot) {
-            int size = alloc.sizes[inst->src[0].nr];
+            const int vgrf = inst->opcode == SHADER_OPCODE_SEND ?
+               inst->src[2].nr : inst->src[0].nr;
+            int size = alloc.sizes[vgrf];
             int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
 
             /* If something happened to spill, we want to push the EOT send
@@ -623,31 +628,94 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
              */
             reg -= BRW_MAX_MRF(devinfo->gen) - first_used_mrf;
 
-            ra_set_node_reg(g, inst->src[0].nr, reg);
+            ra_set_node_reg(g, vgrf, reg);
             break;
          }
       }
    }
 
-   if (dispatch_width > 8) {
-      /* In 16-wide dispatch we have an issue where a compressed
-       * instruction is actually two instructions executed simultaneiously.
-       * It's actually ok to have the source and destination registers be
-       * the same.  In this case, each instruction over-writes its own
-       * source and there's no problem.  The real problem here is if the
-       * source and destination registers are off by one.  Then you can end
-       * up in a scenario where the first instruction over-writes the
-       * source of the second instruction.  Since the compiler doesn't know
-       * about this level of granularity, we simply make the source and
-       * destination interfere.
+   /* In 16-wide instructions we have an issue where a compressed
+    * instruction is actually two instructions executed simultaneously.
+    * It's actually ok to have the source and destination registers be
+    * the same.  In this case, each instruction over-writes its own
+    * source and there's no problem.  The real problem here is if the
+    * source and destination registers are off by one.  Then you can end
+    * up in a scenario where the first instruction over-writes the
+    * source of the second instruction.  Since the compiler doesn't know
+    * about this level of granularity, we simply make the source and
+    * destination interfere.
+    */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (inst->exec_size < 16 || inst->dst.file != VGRF)
+         continue;
+
+      for (int i = 0; i < inst->sources; ++i) {
+         if (inst->src[i].file == VGRF) {
+            ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
+         }
+      }
+   }
+
+   if (devinfo->gen >= 8) {
+      /* From the Intel Broadwell PRM, vol 07, section "Instruction Set
+       * Reference", subsection "EUISA Instructions", Send Message (page 990):
+       *
+       * "r127 must not be used for return address when there is a src and
+       * dest overlap in send instruction."
+       *
+       * We avoid using grf127 as part of the destination of send
+       * messages by adding a node interference to the
+       * grf127_send_hack_node, which has a fixed assignment to grf127.
+       *
+       * We don't apply it to SIMD16 instructions because the previous code
+       * already avoids any register overlap between sources and destination.
        */
+      ra_set_node_reg(g, grf127_send_hack_node, 127);
       foreach_block_and_inst(block, fs_inst, inst, cfg) {
-         if (inst->dst.file != VGRF)
-            continue;
+         if (inst->exec_size < 16 && inst->is_send_from_grf() &&
+             inst->dst.file == VGRF)
+            ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
+      }
 
-         for (int i = 0; i < inst->sources; ++i) {
-            if (inst->src[i].file == VGRF) {
-               ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
+      if (spilled_any_registers) {
+         foreach_block_and_inst(block, fs_inst, inst, cfg) {
+            /* Spill instructions are generated as SEND messages from MRF,
+             * but since Gen7+ supports sending from GRF, the driver maps
+             * these MRF registers to GRFs.  The implementation reuses the
+             * destination of the send message as its source, so the two
+             * are guaranteed to overlap and we add an interference between
+             * the destination and grf127.
+             */
+            if ((inst->opcode == SHADER_OPCODE_GEN7_SCRATCH_READ ||
+                 inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ) &&
+                inst->dst.file == VGRF)
+               ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
+         }
+      }
+   }
+
+   /* From the Skylake PRM Vol. 2a docs for sends:
+    *
+    *    "It is required that the second block of GRFs does not overlap with
+    *    the first block."
+    *
+    * Normally, this is taken care of by fixup_sends_duplicate_payload() but
+    * in the case where one of the registers is an undefined value, the
+    * register allocator may decide that they don't interfere even though
+    * they're used as sources in the same instruction.  We also need to add
+    * interference here.
+    */
+   if (devinfo->gen >= 9) {
+      foreach_block_and_inst(block, fs_inst, inst, cfg) {
+         if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
+             inst->src[2].file == VGRF &&
+             inst->src[3].file == VGRF &&
+             inst->src[2].nr != inst->src[3].nr) {
+            for (unsigned i = 0; i < inst->mlen; i++) {
+               for (unsigned j = 0; j < inst->ex_mlen; j++) {
+                  ra_add_node_interference(g, inst->src[2].nr + i,
+                                           inst->src[3].nr + j);
+               }
             }
          }
      }
@@ -864,15 +932,27 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
    }
 
    for (unsigned i = 0; i < this->alloc.count; i++) {
+      int live_length = virtual_grf_end[i] - virtual_grf_start[i];
+      if (live_length <= 0)
+         continue;
+
+      /* Divide the cost (in number of spills/fills) by the log of the length
+       * of the live range of the register.  This will encourage spill logic
+       * to spill long-living things before spilling short-lived things where
+       * spilling is less likely to actually do us any good.  We use the log
+       * of the length because it will fall off very quickly and not cause us
+       * to spill medium length registers with more uses.
+       */
+      float adjusted_cost = spill_costs[i] / logf(live_length);
       if (!no_spill[i])
-         ra_set_node_spill_cost(g, i, spill_costs[i]);
+         ra_set_node_spill_cost(g, i, adjusted_cost);
    }
 
    return ra_get_best_spill_node(g);
 }
 
 void
-fs_visitor::spill_reg(int spill_reg)
+fs_visitor::spill_reg(unsigned spill_reg)
 {
    int size = alloc.sizes[spill_reg];
    unsigned int spill_offset = last_scratch;
@@ -986,7 +1066,7 @@ fs_visitor::spill_reg(int spill_reg)
        * write, there should be no need for the unspill since the
        * instruction will be overwriting the whole destination in any case.
        */
-      if (inst->is_partial_write() ||
+      if (inst->is_partial_reg_write() ||
           (!inst->force_writemask_all && !per_channel))
          emit_unspill(ubld, spill_src, subset_spill_offset,
                       regs_written(inst));
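
The grf127 workaround in the first hunk leans on a basic property of graph-coloring allocation: a node that is pre-assigned ("pinned") to a physical register can never share that register with any node it interferes with, so adding an edge to the pinned node is an indirect way of forbidding that register. The standalone C++ sketch below illustrates the idea with a toy model; ToyGraph and its methods are invented for this example and are not Mesa's ra_* API.

#include <algorithm>
#include <cstdio>
#include <set>
#include <utility>
#include <vector>

/* Toy interference graph.  A real allocator colors the graph; here we only
 * model the one consequence used by the patch: a node that interferes with a
 * node pinned to a physical register can never receive that register. */
struct ToyGraph {
   std::vector<int> pinned;                  /* -1 = not pinned */
   std::set<std::pair<int, int>> edges;

   explicit ToyGraph(int nodes) : pinned(nodes, -1) {}

   void pin(int node, int phys_reg) { pinned[node] = phys_reg; }

   void add_interference(int a, int b) {
      edges.insert({std::min(a, b), std::max(a, b)});
   }

   bool reg_allowed(int node, int phys_reg) const {
      for (const auto &e : edges) {
         const int other = (e.first == node) ? e.second :
                           (e.second == node) ? e.first : -1;
         if (other >= 0 && pinned[other] == phys_reg)
            return false;
      }
      return true;
   }
};

int main()
{
   const int grf127 = 127;
   ToyGraph g(3);            /* nodes 0,1 = send destinations, node 2 = hack node */
   g.pin(2, grf127);         /* analogous to ra_set_node_reg(g, hack_node, 127)   */
   g.add_interference(0, 2); /* analogous to ra_add_node_interference(...)        */

   printf("node 0 may use r127: %s\n", g.reg_allowed(0, grf127) ? "yes" : "no");
   printf("node 1 may use r127: %s\n", g.reg_allowed(1, grf127) ? "yes" : "no");
   return 0;
}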
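The Gen9 hunk enforces the split-send rule quoted from the Skylake PRM by adding an interference edge between every register of the first payload block and every register of the second, so the allocator can never overlap the two blocks even when liveness information says one of them is undefined. The sketch below reproduces just that pairwise loop over an invented payload layout; the EdgeSet type and the node numbers are assumptions for the example, not Mesa code.

#include <cstdio>
#include <set>
#include <utility>

/* Toy stand-in for the allocator's interference edges: an edge between two
 * payload GRF nodes means they must not end up in the same physical register. */
using EdgeSet = std::set<std::pair<unsigned, unsigned>>;

/* Mirror of the patch's nested loops: every GRF of the first payload block
 * (mlen registers starting at src2) interferes with every GRF of the second
 * block (ex_mlen registers starting at src3). */
static void add_split_send_interference(EdgeSet &edges,
                                        unsigned src2, unsigned mlen,
                                        unsigned src3, unsigned ex_mlen)
{
   for (unsigned i = 0; i < mlen; i++)
      for (unsigned j = 0; j < ex_mlen; j++)
         edges.insert({src2 + i, src3 + j});
}

int main()
{
   EdgeSet edges;
   /* Hypothetical payload: 4-GRF first block at node 10, 2-GRF second block
    * at node 20. */
   add_split_send_interference(edges, 10, 4, 20, 2);

   printf("%zu interference edges added\n", edges.size());   /* 4 * 2 = 8 */
   printf("edge between nodes 11 and 21: %s\n",
          edges.count({11, 21}) ? "yes" : "no");
   return 0;
}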
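The choose_spill_reg() change divides each candidate's raw spill cost by logf() of its live-range length before handing it to the allocator, so long-lived registers look comparatively cheap to spill while short-lived ones keep their full cost. The standalone example below shows the effect of that scaling on two invented candidates; it ignores the benefit term that ra_get_best_spill_node() also weighs, so it only illustrates the cost side of the heuristic.

#include <cmath>
#include <cstdio>

/* Invented spill candidates: spill_cost is the estimated number of spill/fill
 * instructions, live_length mirrors virtual_grf_end[i] - virtual_grf_start[i]. */
struct Candidate {
   const char *name;
   float spill_cost;
   int live_length;
};

int main()
{
   const Candidate regs[] = {
      { "short_lived", 6.0f,   8 },   /* few fills, tiny live range        */
      { "long_lived",  9.0f, 400 },   /* more fills, but live much longer  */
   };

   int best = -1;
   float best_cost = 0.0f;
   for (int i = 0; i < 2; i++) {
      if (regs[i].live_length <= 0)
         continue;
      /* Same scaling as the patch: cost / log(live range length). */
      const float adjusted = regs[i].spill_cost / logf((float)regs[i].live_length);
      printf("%-12s raw=%4.1f adjusted=%5.2f\n",
             regs[i].name, regs[i].spill_cost, adjusted);
      if (best < 0 || adjusted < best_cost) {
         best = i;
         best_cost = adjusted;
      }
   }
   /* Lower adjusted cost means a more attractive spill choice. */
   if (best >= 0)
      printf("preferred spill candidate: %s\n", regs[best].name);
   return 0;
}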