X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_fs.cpp;h=ea10e522b00de48e71548e9b1d45511a22ab05b2;hb=HEAD;hp=0d3d6137058f66dc4fb5e4a23e154f9a07dd40f0;hpb=d1c4e64a69e49c64148529024ecb700d18d3c1c8;p=mesa.git diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 0d3d6137058..ea10e522b00 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -31,6 +31,7 @@ #include "main/macros.h" #include "brw_eu.h" #include "brw_fs.h" +#include "brw_fs_live_variables.h" #include "brw_nir.h" #include "brw_vec4_gs_visitor.h" #include "brw_cfg.h" @@ -207,7 +208,7 @@ fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf) * dependencies, and to avoid having to deal with aligning its regs to 2. */ const fs_builder ubld = bld.annotate("send dependency resolve") - .half(0); + .quarter(0); ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F)); } @@ -427,34 +428,6 @@ fs_inst::has_source_and_destination_hazard() const } } -bool -fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const -{ - if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD) - return false; - - fs_reg reg = this->src[0]; - if (reg.file != VGRF || reg.offset != 0 || reg.stride != 1) - return false; - - if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written) - return false; - - for (int i = 0; i < this->sources; i++) { - reg.type = this->src[i].type; - if (!this->src[i].equals(reg)) - return false; - - if (i < this->header_size) { - reg.offset += REG_SIZE; - } else { - reg = horiz_offset(reg, this->exec_size); - } - } - - return true; -} - bool fs_inst::can_do_source_mods(const struct gen_device_info *devinfo) const { @@ -564,7 +537,22 @@ fs_reg::negative_equals(const fs_reg &r) const bool fs_reg::is_contiguous() const { - return stride == 1; + switch (file) { + case ARF: + case FIXED_GRF: + return hstride == BRW_HORIZONTAL_STRIDE_1 && + vstride == width + hstride; + case MRF: + case VGRF: + case ATTR: + return stride == 1; + case UNIFORM: + case IMM: + case BAD_FILE: + return true; + } + + unreachable("Invalid register file"); } unsigned @@ -576,57 +564,6 @@ fs_reg::component_size(unsigned width) const return MAX2(width * stride, 1) * type_sz(type); } -extern "C" int -type_size_scalar(const struct glsl_type *type, bool bindless) -{ - unsigned int size, i; - - switch (type->base_type) { - case GLSL_TYPE_UINT: - case GLSL_TYPE_INT: - case GLSL_TYPE_FLOAT: - case GLSL_TYPE_BOOL: - return type->components(); - case GLSL_TYPE_UINT16: - case GLSL_TYPE_INT16: - case GLSL_TYPE_FLOAT16: - return DIV_ROUND_UP(type->components(), 2); - case GLSL_TYPE_UINT8: - case GLSL_TYPE_INT8: - return DIV_ROUND_UP(type->components(), 4); - case GLSL_TYPE_DOUBLE: - case GLSL_TYPE_UINT64: - case GLSL_TYPE_INT64: - return type->components() * 2; - case GLSL_TYPE_ARRAY: - return type_size_scalar(type->fields.array, bindless) * type->length; - case GLSL_TYPE_STRUCT: - case GLSL_TYPE_INTERFACE: - size = 0; - for (i = 0; i < type->length; i++) { - size += type_size_scalar(type->fields.structure[i].type, bindless); - } - return size; - case GLSL_TYPE_SAMPLER: - case GLSL_TYPE_IMAGE: - if (bindless) - return type->components() * 2; - case GLSL_TYPE_ATOMIC_UINT: - /* Samplers, atomics, and images take up no register space, since - * they're baked in at link time. 
- */ - return 0; - case GLSL_TYPE_SUBROUTINE: - return 1; - case GLSL_TYPE_VOID: - case GLSL_TYPE_ERROR: - case GLSL_TYPE_FUNCTION: - unreachable("not reached"); - } - - return 0; -} - /** * Create a MOV to read the timestamp register. */ @@ -738,7 +675,8 @@ fs_visitor::vfail(const char *format, va_list va) failed = true; msg = ralloc_vasprintf(mem_ctx, format, va); - msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg); + msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n", + dispatch_width, stage_abbrev, msg); this->fail_msg = msg; @@ -1150,9 +1088,11 @@ fs_inst::flags_written() const opcode != BRW_OPCODE_CSEL && opcode != BRW_OPCODE_IF && opcode != BRW_OPCODE_WHILE)) || - opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL || opcode == FS_OPCODE_FB_WRITE) { return flag_mask(this, 1); + } else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL || + opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) { + return flag_mask(this, 32); } else { return flag_mask(dst, size_written); } @@ -1164,16 +1104,16 @@ fs_inst::flags_written() const * Note that this is not the 0 or 1 implied writes in an actual gen * instruction -- the FS opcodes often generate MOVs in addition. */ -int -fs_visitor::implied_mrf_writes(const fs_inst *inst) const +unsigned +fs_inst::implied_mrf_writes() const { - if (inst->mlen == 0) + if (mlen == 0) return 0; - if (inst->base_mrf == -1) + if (base_mrf == -1) return 0; - switch (inst->opcode) { + switch (opcode) { case SHADER_OPCODE_RCP: case SHADER_OPCODE_RSQ: case SHADER_OPCODE_SQRT: @@ -1181,11 +1121,11 @@ fs_visitor::implied_mrf_writes(const fs_inst *inst) const case SHADER_OPCODE_LOG2: case SHADER_OPCODE_SIN: case SHADER_OPCODE_COS: - return 1 * dispatch_width / 8; + return 1 * exec_size / 8; case SHADER_OPCODE_POW: case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: - return 2 * dispatch_width / 8; + return 2 * exec_size / 8; case SHADER_OPCODE_TEX: case FS_OPCODE_TXB: case SHADER_OPCODE_TXD: @@ -1201,14 +1141,14 @@ fs_visitor::implied_mrf_writes(const fs_inst *inst) const return 1; case FS_OPCODE_FB_WRITE: case FS_OPCODE_REP_FB_WRITE: - return inst->src[0].file == BAD_FILE ? 0 : 2; + return src[0].file == BAD_FILE ? 0 : 2; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: case SHADER_OPCODE_GEN4_SCRATCH_READ: return 1; case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: - return inst->mlen; + return mlen; case SHADER_OPCODE_GEN4_SCRATCH_WRITE: - return inst->mlen; + return mlen; default: unreachable("not reached"); } @@ -1219,7 +1159,7 @@ fs_visitor::vgrf(const glsl_type *const type) { int reg_width = dispatch_width / 8; return fs_reg(VGRF, - alloc.allocate(type_size_scalar(type, false) * reg_width), + alloc.allocate(glsl_count_dword_slots(type, false) * reg_width), brw_type_for_base_type(type)); } @@ -1251,6 +1191,8 @@ fs_visitor::import_uniforms(fs_visitor *v) this->pull_constant_loc = v->pull_constant_loc; this->uniforms = v->uniforms; this->subgroup_id = v->subgroup_id; + for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++) + this->group_size[i] = v->group_size[i]; } void @@ -1593,7 +1535,7 @@ fs_visitor::emit_discard_jump() * shader if all relevant channels have been discarded. 
*/ fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP); - discard_jump->flag_subreg = 1; + discard_jump->flag_subreg = sample_mask_flag_subreg(this); discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H; discard_jump->predicate_inverse = true; @@ -1663,6 +1605,8 @@ fs_visitor::assign_curb_setup() prog_data->curb_read_length = uniform_push_length + ubo_push_length; + uint64_t used = 0; + /* Map the offsets in the UNIFORM file to fixed HW regs. */ foreach_block_and_inst(block, fs_inst, inst, cfg) { for (unsigned int i = 0; i < inst->sources; i++) { @@ -1684,6 +1628,9 @@ fs_visitor::assign_curb_setup() constant_nr = 0; } + assert(constant_nr / 8 < 64); + used |= BITFIELD64_BIT(constant_nr / 8); + struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs + constant_nr / 8, constant_nr % 8); @@ -1698,10 +1645,68 @@ fs_visitor::assign_curb_setup() } } + uint64_t want_zero = used & stage_prog_data->zero_push_reg; + if (want_zero) { + assert(!compiler->compact_params); + fs_builder ubld = bld.exec_all().group(8, 0).at( + cfg->first_block(), cfg->first_block()->start()); + + /* push_reg_mask_param is in 32-bit units */ + unsigned mask_param = stage_prog_data->push_reg_mask_param; + struct brw_reg mask = brw_vec1_grf(payload.num_regs + mask_param / 8, + mask_param % 8); + + fs_reg b32; + for (unsigned i = 0; i < 64; i++) { + if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) { + fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2); + ubld.SHL(horiz_offset(shifted, 8), + byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8), + brw_imm_v(0x01234567)); + ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8)); + + fs_builder ubld16 = ubld.group(16, 0); + b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D); + ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15)); + } + + if (want_zero & BITFIELD64_BIT(i)) { + assert(i < prog_data->curb_read_length); + struct brw_reg push_reg = + retype(brw_vec8_grf(payload.num_regs + i, 0), + BRW_REGISTER_TYPE_D); + + ubld.AND(push_reg, push_reg, component(b32, i % 16)); + } + } + + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + } + /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */ this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length; } +/* + * Build up an array of indices into the urb_setup array that + * references the active entries of the urb_setup array. + * Used to accelerate walking the active entries of the urb_setup array + * on each upload. 
+ */ +void +brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data) +{ + /* Make sure uint8_t is sufficient */ + STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff); + uint8_t index = 0; + for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) { + if (wm_prog_data->urb_setup[attr] >= 0) { + wm_prog_data->urb_setup_attribs[index++] = attr; + } + } + wm_prog_data->urb_setup_attribs_count = index; +} + static void calculate_urb_setup(const struct gen_device_info *devinfo, const struct brw_wm_prog_key *key, @@ -1740,7 +1745,7 @@ calculate_urb_setup(const struct gen_device_info *devinfo, struct brw_vue_map prev_stage_vue_map; brw_compute_vue_map(devinfo, &prev_stage_vue_map, key->input_slots_valid, - nir->info.separate_shader); + nir->info.separate_shader, 1); int first_slot = brw_compute_first_urb_slot_required(nir->info.inputs_read, @@ -1789,6 +1794,9 @@ calculate_urb_setup(const struct gen_device_info *devinfo, } prog_data->num_varying_inputs = urb_next; + prog_data->inputs = nir->info.inputs_read; + + brw_compute_urb_setup_index(prog_data); } void @@ -2090,7 +2098,7 @@ fs_visitor::split_virtual_grfs() } } } - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES); delete[] split_points; delete[] new_virtual_grf; @@ -2098,7 +2106,7 @@ fs_visitor::split_virtual_grfs() } /** - * Remove unused virtual GRFs and compact the virtual_grf_* arrays. + * Remove unused virtual GRFs and compact the vgrf_* arrays. * * During code generation, we create tons of temporary variables, many of * which get immediately killed and are never used again. Yet, in later @@ -2135,7 +2143,7 @@ fs_visitor::compact_virtual_grfs() } else { remap_table[i] = new_index; alloc.sizes[new_index] = alloc.sizes[i]; - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES); ++new_index; } } @@ -2308,7 +2316,7 @@ fs_visitor::assign_constant_locations() } if (compiler->compact_params) { - struct uniform_slot_info slots[uniforms]; + struct uniform_slot_info slots[uniforms + 1]; memset(slots, 0, sizeof(slots)); foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { @@ -2598,7 +2606,7 @@ fs_visitor::lower_constant_loads() inst->remove(block); } } - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); } bool @@ -2609,7 +2617,8 @@ fs_visitor::opt_algebraic() foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { switch (inst->opcode) { case BRW_OPCODE_MOV: - if (!devinfo->has_64bit_types && + if (!devinfo->has_64bit_float && + !devinfo->has_64bit_int && (inst->dst.type == BRW_REGISTER_TYPE_DF || inst->dst.type == BRW_REGISTER_TYPE_UQ || inst->dst.type == BRW_REGISTER_TYPE_Q)) { @@ -2690,18 +2699,18 @@ fs_visitor::opt_algebraic() break; } - if (inst->src[0].file == IMM) { - assert(inst->src[0].type == BRW_REGISTER_TYPE_F); + break; + case BRW_OPCODE_ADD: + if (inst->src[1].file != IMM) + continue; + + if (brw_reg_type_is_integer(inst->src[1].type) && + inst->src[1].is_zero()) { inst->opcode = BRW_OPCODE_MOV; - inst->src[0].f *= inst->src[1].f; inst->src[1] = reg_undef; progress = true; break; } - break; - case BRW_OPCODE_ADD: - if (inst->src[1].file != IMM) - continue; if (inst->src[0].file == IMM) { assert(inst->src[0].type == BRW_REGISTER_TYPE_F); @@ -2742,7 +2751,8 @@ fs_visitor::opt_algebraic() } break; case BRW_OPCODE_SEL: - if (!devinfo->has_64bit_types && + if (!devinfo->has_64bit_float && + !devinfo->has_64bit_int && (inst->dst.type == BRW_REGISTER_TYPE_DF || inst->dst.type == BRW_REGISTER_TYPE_UQ || 
inst->dst.type == BRW_REGISTER_TYPE_Q)) { @@ -2874,6 +2884,11 @@ fs_visitor::opt_algebraic() } } } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW | + DEPENDENCY_INSTRUCTION_DETAIL); + return progress; } @@ -2923,107 +2938,11 @@ fs_visitor::opt_zero_samples() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); return progress; } -/** - * Optimize sample messages which are followed by the final RT write. - * - * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its - * results sent directly to the framebuffer, bypassing the EU. Recognize the - * final texturing results copied to the framebuffer write payload and modify - * them to write to the framebuffer directly. - */ -bool -fs_visitor::opt_sampler_eot() -{ - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; - - if (stage != MESA_SHADER_FRAGMENT || dispatch_width > 16) - return false; - - if (devinfo->gen != 9 && !devinfo->is_cherryview) - return false; - - /* FINISHME: It should be possible to implement this optimization when there - * are multiple drawbuffers. - */ - if (key->nr_color_regions != 1) - return false; - - /* Requires emitting a bunch of saturating MOV instructions during logical - * send lowering to clamp the color payload, which the sampler unit isn't - * going to do for us. - */ - if (key->clamp_fragment_color) - return false; - - /* Look for a texturing instruction immediately before the final FB_WRITE. */ - bblock_t *block = cfg->blocks[cfg->num_blocks - 1]; - fs_inst *fb_write = (fs_inst *)block->end(); - assert(fb_write->eot); - assert(fb_write->opcode == FS_OPCODE_FB_WRITE_LOGICAL); - - /* There wasn't one; nothing to do. */ - if (unlikely(fb_write->prev->is_head_sentinel())) - return false; - - fs_inst *tex_inst = (fs_inst *) fb_write->prev; - - /* 3D Sampler » Messages » Message Format - * - * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler - * messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*” - */ - if (tex_inst->opcode != SHADER_OPCODE_TEX_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXD_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXF_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXL_LOGICAL && - tex_inst->opcode != FS_OPCODE_TXB_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXF_CMS_W_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXF_UMS_LOGICAL) - return false; - - /* XXX - This shouldn't be necessary. */ - if (tex_inst->prev->is_head_sentinel()) - return false; - - /* Check that the FB write sources are fully initialized by the single - * texturing instruction. - */ - for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) { - if (i == FB_WRITE_LOGICAL_SRC_COLOR0) { - if (!fb_write->src[i].equals(tex_inst->dst) || - fb_write->size_read(i) != tex_inst->size_written) - return false; - } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) { - if (fb_write->src[i].file != BAD_FILE) - return false; - } - } - - assert(!tex_inst->eot); /* We can't get here twice */ - assert((tex_inst->offset & (0xff << 24)) == 0); - - const fs_builder ibld(this, block, tex_inst); - - tex_inst->offset |= fb_write->target << 24; - tex_inst->eot = true; - tex_inst->dst = ibld.null_reg_ud(); - tex_inst->size_written = 0; - fb_write->remove(cfg->blocks[cfg->num_blocks - 1]); - - /* Marking EOT is sufficient, lower_logical_sends() will notice the EOT - * flag and submit a header together with the sampler message as required - * by the hardware. 
- */ - invalidate_live_intervals(); - return true; -} - bool fs_visitor::opt_register_renaming() { @@ -3073,7 +2992,8 @@ fs_visitor::opt_register_renaming() } if (progress) { - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | + DEPENDENCY_VARIABLES); for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != ~0u) { @@ -3121,7 +3041,7 @@ fs_visitor::opt_redundant_discard_jumps() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -3142,107 +3062,6 @@ mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds) return ((1 << n) - 1) << shift; } -bool -fs_visitor::opt_peephole_csel() -{ - if (devinfo->gen < 8) - return false; - - bool progress = false; - - foreach_block_reverse(block, cfg) { - int ip = block->end_ip + 1; - - foreach_inst_in_block_reverse_safe(fs_inst, inst, block) { - ip--; - - if (inst->opcode != BRW_OPCODE_SEL || - inst->predicate != BRW_PREDICATE_NORMAL || - (inst->dst.type != BRW_REGISTER_TYPE_F && - inst->dst.type != BRW_REGISTER_TYPE_D && - inst->dst.type != BRW_REGISTER_TYPE_UD)) - continue; - - /* Because it is a 3-src instruction, CSEL cannot have an immediate - * value as a source, but we can sometimes handle zero. - */ - if ((inst->src[0].file != VGRF && inst->src[0].file != ATTR && - inst->src[0].file != UNIFORM) || - (inst->src[1].file != VGRF && inst->src[1].file != ATTR && - inst->src[1].file != UNIFORM && !inst->src[1].is_zero())) - continue; - - foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { - if (!scan_inst->flags_written()) - continue; - - if ((scan_inst->opcode != BRW_OPCODE_CMP && - scan_inst->opcode != BRW_OPCODE_MOV) || - scan_inst->predicate != BRW_PREDICATE_NONE || - (scan_inst->src[0].file != VGRF && - scan_inst->src[0].file != ATTR && - scan_inst->src[0].file != UNIFORM) || - scan_inst->src[0].type != BRW_REGISTER_TYPE_F) - break; - - if (scan_inst->opcode == BRW_OPCODE_CMP && !scan_inst->src[1].is_zero()) - break; - - const brw::fs_builder ibld(this, block, inst); - - const enum brw_conditional_mod cond = - inst->predicate_inverse - ? brw_negate_cmod(scan_inst->conditional_mod) - : scan_inst->conditional_mod; - - fs_inst *csel_inst = NULL; - - if (inst->src[1].file != IMM) { - csel_inst = ibld.CSEL(inst->dst, - inst->src[0], - inst->src[1], - scan_inst->src[0], - cond); - } else if (cond == BRW_CONDITIONAL_NZ) { - /* Consider the sequence - * - * cmp.nz.f0 null<1>F g3<8,8,1>F 0F - * (+f0) sel g124<1>UD g2<8,8,1>UD 0x00000000UD - * - * The sel will pick the immediate value 0 if r0 is ±0.0. - * Therefore, this sequence is equivalent: - * - * cmp.nz.f0 null<1>F g3<8,8,1>F 0F - * (+f0) sel g124<1>F g2<8,8,1>F (abs)g3<8,8,1>F - * - * The abs is ensures that the result is 0UD when g3 is -0.0F. 
- * By normal cmp-sel merging, this is also equivalent: - * - * csel.nz g124<1>F g2<4,4,1>F (abs)g3<4,4,1>F g3<4,4,1>F - */ - csel_inst = ibld.CSEL(inst->dst, - inst->src[0], - scan_inst->src[0], - scan_inst->src[0], - cond); - - csel_inst->src[1].abs = true; - } - - if (csel_inst != NULL) { - progress = true; - csel_inst->saturate = inst->saturate; - inst->remove(block); - } - - break; - } - } - } - - return progress; -} - bool fs_visitor::compute_to_mrf() { @@ -3253,7 +3072,7 @@ fs_visitor::compute_to_mrf() if (devinfo->gen >= 7) return false; - calculate_live_intervals(); + const fs_live_variables &live = live_analysis.require(); foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { int ip = next_ip; @@ -3271,7 +3090,7 @@ fs_visitor::compute_to_mrf() /* Can't compute-to-MRF this GRF if someone else was going to * read it later. */ - if (this->virtual_grf_end[inst->src[0].nr] > ip) + if (live.vgrf_end[inst->src[0].nr] > ip) continue; /* Found a move of a GRF to a MRF. Let's see if we can go rewrite the @@ -3416,7 +3235,7 @@ fs_visitor::compute_to_mrf() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -3473,6 +3292,9 @@ fs_visitor::eliminate_find_live_channel() } } + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); + return progress; } @@ -3595,7 +3417,7 @@ fs_visitor::remove_duplicate_mrf_writes() /* Found a SEND instruction, which will include two or fewer * implied MRF writes. We could do better here. */ - for (int i = 0; i < implied_mrf_writes(inst); i++) { + for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { last_mrf_move[inst->base_mrf + i] = NULL; } } @@ -3619,7 +3441,7 @@ fs_visitor::remove_duplicate_mrf_writes() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -3668,7 +3490,7 @@ fs_visitor::remove_extra_rounding_modes() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -3849,7 +3671,7 @@ fs_visitor::insert_gen4_send_dependency_workarounds() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); } /** @@ -3889,7 +3711,7 @@ fs_visitor::lower_uniform_pull_constant_loads() inst->header_size = 1; inst->mlen = 1; - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); } else { /* Before register allocation, we didn't tell the scheduler about the * MRF we use. We know it's safe to use this MRF because nothing @@ -3920,15 +3742,23 @@ fs_visitor::lower_load_payload() dst.nr = dst.nr & ~BRW_MRF_COMPR4; const fs_builder ibld(this, block, inst); - const fs_builder hbld = ibld.exec_all().group(8, 0); + const fs_builder ubld = ibld.exec_all(); - for (uint8_t i = 0; i < inst->header_size; i++) { - if (inst->src[i].file != BAD_FILE) { - fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD); - fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD); - hbld.MOV(mov_dst, mov_src); - } - dst = offset(dst, hbld, 1); + for (uint8_t i = 0; i < inst->header_size;) { + /* Number of header GRFs to initialize at once with a single MOV + * instruction. + */ + const unsigned n = + (i + 1 < inst->header_size && inst->src[i].stride == 1 && + inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ? 
+ 2 : 1; + + if (inst->src[i].file != BAD_FILE) + ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD), + retype(inst->src[i], BRW_REGISTER_TYPE_UD)); + + dst = byte_offset(dst, n * REG_SIZE); + i += n; } if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) && @@ -3960,9 +3790,9 @@ fs_visitor::lower_load_payload() } else { /* Platform doesn't have COMPR4. We have to fake it */ fs_reg mov_dst = retype(dst, inst->src[i].type); - ibld.half(0).MOV(mov_dst, half(inst->src[i], 0)); + ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0)); mov_dst.nr += 4; - ibld.half(1).MOV(mov_dst, half(inst->src[i], 1)); + ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1)); } } @@ -3999,7 +3829,7 @@ fs_visitor::lower_load_payload() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -4009,7 +3839,10 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) { const fs_builder ibld(this, block, inst); - if (inst->src[1].file == IMM && inst->src[1].ud < (1 << 16)) { + const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD); + if (inst->src[1].file == IMM && + (( ud && inst->src[1].ud <= UINT16_MAX) || + (!ud && inst->src[1].d <= INT16_MAX && inst->src[1].d >= INT16_MIN))) { /* The MUL instruction isn't commutative. On Gen <= 6, only the low * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of * src1 are used. @@ -4022,7 +3855,6 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) ibld.MOV(imm, inst->src[1]); ibld.MUL(inst->dst, imm, inst->src[0]); } else { - const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD); ibld.MUL(inst->dst, inst->src[0], ud ? brw_imm_uw(inst->src[1].ud) : brw_imm_w(inst->src[1].d)); @@ -4099,7 +3931,20 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) high.offset = inst->dst.offset % REG_SIZE; if (devinfo->gen >= 7) { - if (inst->src[1].abs) + /* From GEN:BUG:1604601757: + * + * "When multiplying a DW and any lower precision integer, source modifier + * is not supported." + * + * An unsupported negate modifier on src[1] would ordinarily be + * lowered by the subsequent lower_regioning pass. In this case that + * pass would spawn another dword multiply. Instead, lower the + * modifier first. + */ + const bool source_mods_unsupported = (devinfo->gen >= 12); + + if (inst->src[1].abs || (inst->src[1].negate && + source_mods_unsupported)) lower_src_modifiers(this, block, inst, 1); if (inst->src[1].file == IMM) { @@ -4260,6 +4105,17 @@ fs_visitor::lower_integer_multiplication() foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { if (inst->opcode == BRW_OPCODE_MUL) { + /* If the instruction is already in a form that does not need lowering, + * return early. 
+ */ + if (devinfo->gen >= 7) { + if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4) + continue; + } else { + if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4) + continue; + } + if ((inst->dst.type == BRW_REGISTER_TYPE_Q || inst->dst.type == BRW_REGISTER_TYPE_UQ) && (inst->src[0].type == BRW_REGISTER_TYPE_Q || @@ -4286,7 +4142,7 @@ fs_visitor::lower_integer_multiplication() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); return progress; } @@ -4316,11 +4172,123 @@ fs_visitor::lower_minmax() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +bool +fs_visitor::lower_sub_sat() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + if (inst->opcode == SHADER_OPCODE_USUB_SAT || + inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* The fundamental problem is the hardware performs source negation + * at the bit width of the source. If the source is 0x80000000D, the + * negation is 0x80000000D. As a result, subtractSaturate(0, + * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There + * are at least three ways to resolve this: + * + * 1. Use the accumulator for the negated source. The accumulator is + * 33 bits, so our source 0x80000000 is sign-extended to + * 0x1800000000. The negation of which is 0x080000000. This + * doesn't help for 64-bit integers (which are already bigger than + * 33 bits). There are also only 8 accumulators, so SIMD16 or + * SIMD32 instructions would have to be split into multiple SIMD8 + * instructions. + * + * 2. Use slightly different math. For any n-bit value x, we know (x + * >> 1) != -(x >> 1). We can use this fact to only do + * subtractions involving (x >> 1). subtractSaturate(a, b) == + * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)). + * + * 3. For unsigned sources, it is sufficient to replace the + * subtractSaturate with (a > b) ? a - b : 0. + * + * It may also be possible to use the SUBB instruction. This + * implicitly writes the accumulator, so it could only be used in the + * same situations as #1 above. It is further limited by only + * allowing UD sources. + */ + if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q && + inst->src[0].type != BRW_REGISTER_TYPE_UQ) { + fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type); + + ibld.MOV(acc, inst->src[1]); + fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]); + add->saturate = true; + add->src[0].negate = true; + } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* tmp = src1 >> 1; + * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp)); + */ + fs_reg tmp1 = ibld.vgrf(inst->src[0].type); + fs_reg tmp2 = ibld.vgrf(inst->src[0].type); + fs_reg tmp3 = ibld.vgrf(inst->src[0].type); + fs_inst *add; + + ibld.SHR(tmp1, inst->src[1], brw_imm_d(1)); + + add = ibld.ADD(tmp2, inst->src[1], tmp1); + add->src[1].negate = true; + + add = ibld.ADD(tmp3, inst->src[0], tmp1); + add->src[1].negate = true; + add->saturate = true; + + add = ibld.ADD(inst->dst, tmp3, tmp2); + add->src[1].negate = true; + add->saturate = true; + } else { + /* a > b ? 
a - b : 0 */ + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + BRW_CONDITIONAL_G); + + fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]); + add->src[1].negate = !add->src[1].negate; + + ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0)) + ->predicate = BRW_PREDICATE_NORMAL; + } + + inst->remove(block); + progress = true; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); return progress; } +/** + * Get the mask of SIMD channels enabled during dispatch and not yet disabled + * by discard. Due to the layout of the sample mask in the fragment shader + * thread payload, \p bld is required to have a dispatch_width() not greater + * than 16 for fragment shaders. + */ +static fs_reg +sample_mask_reg(const fs_builder &bld) +{ + const fs_visitor *v = static_cast(bld.shader); + + if (v->stage != MESA_SHADER_FRAGMENT) { + return brw_imm_ud(0xffffffff); + } else if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) { + assert(bld.dispatch_width() <= 16); + return brw_flag_subreg(sample_mask_flag_subreg(v) + bld.group() / 16); + } else { + assert(v->devinfo->gen >= 6 && bld.dispatch_width() <= 16); + return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7), + BRW_REGISTER_TYPE_UW); + } +} + static void setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, fs_reg *dst, fs_reg color, unsigned components) @@ -4390,6 +4358,8 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, const unsigned components = inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; + assert(inst->target != 0 || src0_alpha.file == BAD_FILE); + /* We can potentially have a message length of up to 15, so we have to set * base_mrf to either 0 or 1 in order to fit in m0..m15. */ @@ -4416,7 +4386,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, if (prog_data->uses_kill) { bld.exec_all().group(1, 0) .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), - brw_flag_reg(0, 1)); + sample_mask_reg(bld)); } assert(length == 0); @@ -4447,6 +4417,9 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD), }; ubld.LOAD_PAYLOAD(header, header_sources, 2, 0); + + /* Gen12 will require additional fix-ups if we ever hit this path. */ + assert(devinfo->gen < 12); } uint32_t g00_bits = 0; @@ -4454,7 +4427,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, /* Set "Source0 Alpha Present to RenderTarget" bit in message * header. 
*/ - if (inst->target > 0 && prog_data->replicate_alpha) + if (src0_alpha.file != BAD_FILE) g00_bits |= 1 << 11; /* Set computes stencil to render target */ @@ -4475,10 +4448,9 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, } if (prog_data->uses_kill) { - assert(bld.group() < 16); ubld.group(1, 0).MOV(retype(component(header, 15), BRW_REGISTER_TYPE_UW), - brw_flag_reg(0, 1)); + sample_mask_reg(bld)); } assert(length == 0); @@ -4498,8 +4470,6 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, length++; } - bool src0_alpha_present = false; - if (src0_alpha.file != BAD_FILE) { for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) { const fs_builder &ubld = bld.exec_all().group(8, i) @@ -4509,14 +4479,6 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, setup_color_payload(ubld, key, &sources[length], tmp, 1); length++; } - src0_alpha_present = true; - } else if (prog_data->replicate_alpha && inst->target != 0) { - /* Handle the case when fragment shader doesn't write to draw buffer - * zero. No need to call setup_color_payload() for src0_alpha because - * alpha value will be undefined. - */ - length += bld.dispatch_width() / 8; - src0_alpha_present = true; } if (sample_mask.file != BAD_FILE) { @@ -4598,7 +4560,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, /* Set the "Render Target Index" and "Src0 Alpha Present" fields * in the extended message descriptor, in lieu of using a header. */ - ex_desc = inst->target << 12 | src0_alpha_present << 15; + ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15; if (key->nr_color_regions == 0) ex_desc |= 1 << 20; /* Null Render Target */ @@ -4643,6 +4605,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, static void lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst) { + const gen_device_info *devinfo = bld.shader->devinfo; const fs_builder &ubld = bld.exec_all().group(8, 0); const unsigned length = 2; const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length); @@ -4657,6 +4620,19 @@ lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst) retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD) }; ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0); + + if (devinfo->gen >= 12) { + /* On Gen12 the Viewport and Render Target Array Index fields (AKA + * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render + * target message header format was updated accordingly -- However + * the updated format only works for the lower 16 channels in a + * SIMD32 thread, since the higher 16 channels want the subspan data + * from r2 instead of r1, so we need to copy over the contents of + * r1.1 in order to fix things up. + */ + ubld.group(1, 0).MOV(component(header, 9), + retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD)); + } } inst->resize_sources(1); @@ -4696,7 +4672,8 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op, if (coord_components > 0 && (has_lod || shadow_c.file != BAD_FILE || (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) { - for (unsigned i = coord_components; i < 3; i++) + assert(coord_components <= 3); + for (unsigned i = 0; i < 3 - coord_components; i++) bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f)); msg_end = offset(msg_end, bld, 3 - coord_components); @@ -5352,6 +5329,45 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) } } +/** + * Predicate the specified instruction on the sample mask. 
+ */ +static void +emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst) +{ + assert(bld.shader->stage == MESA_SHADER_FRAGMENT && + bld.group() == inst->group && + bld.dispatch_width() == inst->exec_size); + + const fs_visitor *v = static_cast(bld.shader); + const fs_reg sample_mask = sample_mask_reg(bld); + const unsigned subreg = sample_mask_flag_subreg(v); + + if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) { + assert(sample_mask.file == ARF && + sample_mask.nr == brw_flag_subreg(subreg).nr && + sample_mask.subnr == brw_flag_subreg( + subreg + inst->group / 16).subnr); + } else { + bld.group(1, 0).exec_all() + .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask); + } + + if (inst->predicate) { + assert(inst->predicate == BRW_PREDICATE_NORMAL); + assert(!inst->predicate_inverse); + assert(inst->flag_subreg == 0); + /* Combine the sample mask with the existing predicate by using a + * vertical predication mode. + */ + inst->predicate = BRW_PREDICATE_ALIGN1_ALLV; + } else { + inst->flag_subreg = subreg; + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = false; + } +} + static void lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) { @@ -5388,7 +5404,7 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) surface.ud == GEN8_BTI_STATELESS_NON_COHERENT); const bool has_side_effects = inst->has_side_effects(); - fs_reg sample_mask = has_side_effects ? bld.sample_mask_reg() : + fs_reg sample_mask = has_side_effects ? sample_mask_reg(bld) : fs_reg(brw_imm_d(0xffff)); /* From the BDW PRM Volume 7, page 147: @@ -5484,27 +5500,8 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) * provided. */ if ((header.file == BAD_FILE || !is_surface_access) && - sample_mask.file != BAD_FILE && sample_mask.file != IMM) { - const fs_builder ubld = bld.group(1, 0).exec_all(); - if (inst->predicate) { - assert(inst->predicate == BRW_PREDICATE_NORMAL); - assert(!inst->predicate_inverse); - assert(inst->flag_subreg < 2); - /* Combine the sample mask with the existing predicate by using a - * vertical predication mode. - */ - inst->predicate = BRW_PREDICATE_ALIGN1_ALLV; - ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg + 2), - sample_mask.type), - sample_mask); - } else { - inst->flag_subreg = 2; - inst->predicate = BRW_PREDICATE_NORMAL; - inst->predicate_inverse = false; - ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type), - sample_mask); - } - } + sample_mask.file != BAD_FILE && sample_mask.file != IMM) + emit_predicate_on_sample_mask(bld, inst); uint32_t sfid; switch (inst->opcode) { @@ -5675,16 +5672,8 @@ lower_a64_logical_send(const fs_builder &bld, fs_inst *inst) /* If the surface message has side effects and we're a fragment shader, we * have to predicate with the sample mask to avoid helper invocations. 
*/ - if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) { - inst->flag_subreg = 2; - inst->predicate = BRW_PREDICATE_NORMAL; - inst->predicate_inverse = false; - - fs_reg sample_mask = bld.sample_mask_reg(); - const fs_builder ubld = bld.group(1, 0).exec_all(); - ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type), - sample_mask); - } + if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) + emit_predicate_on_sample_mask(bld, inst); fs_reg payload, payload2; unsigned mlen, ex_mlen = 0; @@ -6002,7 +5991,7 @@ fs_visitor::lower_logical_sends() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); return progress; } @@ -6244,7 +6233,7 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo, /* Only power-of-two execution sizes are representable in the instruction * control fields. */ - return 1 << _mesa_logbase2(max_width); + return 1 << util_logbase2(max_width); } /** @@ -6336,6 +6325,8 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, case BRW_OPCODE_SHR: case BRW_OPCODE_SHL: case BRW_OPCODE_ASR: + case BRW_OPCODE_ROR: + case BRW_OPCODE_ROL: case BRW_OPCODE_CMPN: case BRW_OPCODE_CSEL: case BRW_OPCODE_F32TO16: @@ -6360,6 +6351,7 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, case FS_OPCODE_PACK: case SHADER_OPCODE_SEL_EXEC: case SHADER_OPCODE_CLUSTER_BROADCAST: + case SHADER_OPCODE_MOV_RELOC_IMM: return get_fpu_lowered_simd_width(devinfo, inst); case BRW_OPCODE_CMP: { @@ -6419,6 +6411,10 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, return MIN2(16, inst->exec_size); } + case SHADER_OPCODE_USUB_SAT: + case SHADER_OPCODE_ISUB_SAT: + return get_fpu_lowered_simd_width(devinfo, inst); + case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: /* Integer division is limited to SIMD8 on all generations. */ @@ -6881,19 +6877,100 @@ fs_visitor::lower_simd_width() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); return progress; } +/** + * Transform barycentric vectors into the interleaved form expected by the PLN + * instruction and returned by the Gen7+ PI shared function. + * + * For channels 0-15 in SIMD16 mode they are expected to be laid out as + * follows in the register file: + * + * rN+0: X[0-7] + * rN+1: Y[0-7] + * rN+2: X[8-15] + * rN+3: Y[8-15] + * + * There is no need to handle SIMD32 here -- This is expected to be run after + * SIMD lowering, since SIMD lowering relies on vectors having the standard + * component layout. 
+ */ +bool +fs_visitor::lower_barycentrics() +{ + const bool has_interleaved_layout = devinfo->has_pln || devinfo->gen >= 7; + bool progress = false; + + if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout) + return false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->exec_size < 16) + continue; + + const fs_builder ibld(this, block, inst); + const fs_builder ubld = ibld.exec_all().group(8, 0); + + switch (inst->opcode) { + case FS_OPCODE_LINTERP : { + assert(inst->exec_size == 16); + const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2); + fs_reg srcs[4]; + + for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++) + srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2), + 8 * (i / 2)); + + ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs)); + + inst->src[0] = tmp; + progress = true; + break; + } + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: { + assert(inst->exec_size == 16); + const fs_reg tmp = ibld.vgrf(inst->dst.type, 2); + + for (unsigned i = 0; i < 2; i++) { + for (unsigned g = 0; g < inst->exec_size / 8; g++) { + fs_inst *mov = ibld.at(block, inst->next).group(8, g) + .MOV(horiz_offset(offset(inst->dst, ibld, i), + 8 * g), + offset(tmp, ubld, 2 * g + i)); + mov->predicate = inst->predicate; + mov->predicate_inverse = inst->predicate_inverse; + mov->flag_subreg = inst->flag_subreg; + } + } + + inst->dst = tmp; + progress = true; + break; + } + default: + break; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +void +fs_visitor::dump_instructions() const +{ + dump_instructions(NULL); +} + void -fs_visitor::dump_instructions() -{ - dump_instructions(NULL); -} - -void -fs_visitor::dump_instructions(const char *name) +fs_visitor::dump_instructions(const char *name) const { FILE *file = stderr; if (name && geteuid() != 0) { @@ -6903,11 +6980,11 @@ fs_visitor::dump_instructions(const char *name) } if (cfg) { - calculate_register_pressure(); - int ip = 0, max_pressure = 0; + const register_pressure &rp = regpressure_analysis.require(); + unsigned ip = 0, max_pressure = 0; foreach_block_and_inst(block, backend_instruction, inst, cfg) { - max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]); - fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip); + max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]); + fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip); dump_instruction(inst, file); ip++; } @@ -6926,15 +7003,15 @@ fs_visitor::dump_instructions(const char *name) } void -fs_visitor::dump_instruction(backend_instruction *be_inst) +fs_visitor::dump_instruction(const backend_instruction *be_inst) const { dump_instruction(be_inst, stderr); } void -fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) +fs_visitor::dump_instruction(const backend_instruction *be_inst, FILE *file) const { - fs_inst *inst = (fs_inst *)be_inst; + const fs_inst *inst = (const fs_inst *)be_inst; if (inst->predicate) { fprintf(file, "(%cf%d.%d) ", @@ -7158,24 +7235,6 @@ fs_visitor::setup_fs_payload_gen6() assert(dispatch_width % payload_width == 0); assert(devinfo->gen >= 6); - prog_data->uses_src_depth = prog_data->uses_src_w = - (nir->info.system_values_read & (1ull << SYSTEM_VALUE_FRAG_COORD)) != 0; - - prog_data->uses_sample_mask = - (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0; - - /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: - * - * "MSDISPMODE_PERSAMPLE 
is required in order to select - * POSOFFSET_SAMPLE" - * - * So we can only really get sample positions if we are doing real - * per-sample dispatch. If we need gl_SamplePosition and we don't have - * persample dispatch, we hard-code it to 0.5. - */ - prog_data->uses_pos_offset = prog_data->persample_dispatch && - (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS); - /* R0: PS thread payload header. */ payload.num_regs++; @@ -7287,24 +7346,33 @@ fs_visitor::setup_cs_payload() payload.num_regs = 1; } -void -fs_visitor::calculate_register_pressure() +brw::register_pressure::register_pressure(const fs_visitor *v) { - invalidate_live_intervals(); - calculate_live_intervals(); - - unsigned num_instructions = 0; - foreach_block(block, cfg) - num_instructions += block->instructions.length(); + const fs_live_variables &live = v->live_analysis.require(); + const unsigned num_instructions = v->cfg->num_blocks ? + v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0; - regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions); + regs_live_at_ip = new unsigned[num_instructions](); - for (unsigned reg = 0; reg < alloc.count; reg++) { - for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++) - regs_live_at_ip[ip] += alloc.sizes[reg]; + for (unsigned reg = 0; reg < v->alloc.count; reg++) { + for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++) + regs_live_at_ip[ip] += v->alloc.sizes[reg]; } } +brw::register_pressure::~register_pressure() +{ + delete[] regs_live_at_ip; +} + +void +fs_visitor::invalidate_analysis(brw::analysis_dependency_class c) +{ + backend_shader::invalidate_analysis(c); + live_analysis.invalidate(c); + regpressure_analysis.invalidate(c); +} + void fs_visitor::optimize() { @@ -7396,12 +7464,6 @@ fs_visitor::optimize() OPT(compact_virtual_grfs); } while (progress); - /* Do this after cmod propagation has had every possible opportunity to - * propagate results into SEL instructions. - */ - if (OPT(opt_peephole_csel)) - OPT(dead_code_eliminate); - progress = false; pass_num = 0; @@ -7411,12 +7473,12 @@ fs_visitor::optimize() } OPT(lower_simd_width); - - /* After SIMD lowering just in case we had to unroll the EOT send. */ - OPT(opt_sampler_eot); - + OPT(lower_barycentrics); OPT(lower_logical_sends); + /* After logical SEND lowering. */ + OPT(fixup_nomask_control_flow); + if (progress) { OPT(opt_copy_propagation); /* Only run after logical send lowering because it's easier to implement @@ -7441,6 +7503,11 @@ fs_visitor::optimize() if (OPT(lower_load_payload)) { split_virtual_grfs(); + + /* Lower 64 bit MOVs generated by payload lowering. */ + if (!devinfo->has_64bit_float && !devinfo->has_64bit_int) + OPT(opt_algebraic); + OPT(register_coalesce); OPT(lower_simd_width); OPT(compute_to_mrf); @@ -7449,6 +7516,7 @@ fs_visitor::optimize() OPT(opt_combine_constants); OPT(lower_integer_multiplication); + OPT(lower_sub_sat); if (devinfo->gen <= 5 && OPT(lower_minmax)) { OPT(opt_cmod_propagation); @@ -7513,7 +7581,7 @@ fs_visitor::fixup_sends_duplicate_payload() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); return progress; } @@ -7536,11 +7604,157 @@ fs_visitor::fixup_3src_null_dest() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | + DEPENDENCY_VARIABLES); +} + +/** + * Find the first instruction in the program that might start a region of + * divergent control flow due to a HALT jump. 
There is no + * find_halt_control_flow_region_end(), the region of divergence extends until + * the only FS_OPCODE_PLACEHOLDER_HALT in the program. + */ +static const fs_inst * +find_halt_control_flow_region_start(const fs_visitor *v) +{ + if (brw_wm_prog_data(v->prog_data)->uses_kill) { + foreach_block_and_inst(block, fs_inst, inst, v->cfg) { + if (inst->opcode == FS_OPCODE_DISCARD_JUMP || + inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) + return inst; + } + } + + return NULL; +} + +/** + * Work around the Gen12 hardware bug filed as GEN:BUG:1407528679. EU fusion + * can cause a BB to be executed with all channels disabled, which will lead + * to the execution of any NoMask instructions in it, even though any + * execution-masked instructions will be correctly shot down. This may break + * assumptions of some NoMask SEND messages whose descriptor depends on data + * generated by live invocations of the shader. + * + * This avoids the problem by predicating certain instructions on an ANY + * horizontal predicate that makes sure that their execution is omitted when + * all channels of the program are disabled. + */ +bool +fs_visitor::fixup_nomask_control_flow() +{ + if (devinfo->gen != 12) + return false; + + const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H : + dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H : + BRW_PREDICATE_ALIGN1_ANY8H; + const fs_inst *halt_start = find_halt_control_flow_region_start(this); + unsigned depth = 0; + bool progress = false; + + const fs_live_variables &live_vars = live_analysis.require(); + + /* Scan the program backwards in order to be able to easily determine + * whether the flag register is live at any point. + */ + foreach_block_reverse_safe(block, cfg) { + BITSET_WORD flag_liveout = live_vars.block_data[block->num] + .flag_liveout[0]; + STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1); + + foreach_inst_in_block_reverse_safe(fs_inst, inst, block) { + if (!inst->predicate && inst->exec_size >= 8) + flag_liveout &= ~inst->flags_written(); + + switch (inst->opcode) { + case BRW_OPCODE_DO: + case BRW_OPCODE_IF: + /* Note that this doesn't handle FS_OPCODE_DISCARD_JUMP since only + * the first one in the program closes the region of divergent + * control flow due to any HALT instructions -- Instead this is + * handled with the halt_start check below. + */ + depth--; + break; + + case BRW_OPCODE_WHILE: + case BRW_OPCODE_ENDIF: + case FS_OPCODE_PLACEHOLDER_HALT: + depth++; + break; + + default: + /* Note that the vast majority of NoMask SEND instructions in the + * program are harmless while executed in a block with all + * channels disabled, since any instructions with side effects we + * could hit here should be execution-masked. + * + * The main concern is NoMask SEND instructions where the message + * descriptor or header depends on data generated by live + * invocations of the shader (RESINFO and + * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically + * computed surface index seem to be the only examples right now + * where this could easily lead to GPU hangs). Unfortunately we + * have no straightforward way to detect that currently, so just + * predicate any NoMask SEND instructions we find under control + * flow. + * + * If this proves to have a measurable performance impact it can + * be easily extended with a whitelist of messages we know we can + * safely omit the predication for. 
+ */ + if (depth && inst->force_writemask_all && + is_send(inst) && !inst->predicate) { + /* We need to load the execution mask into the flag register by + * using a builder with channel group matching the whole shader + * (rather than the default which is derived from the original + * instruction), in order to avoid getting a right-shifted + * value. + */ + const fs_builder ubld = fs_builder(this, block, inst) + .exec_all().group(dispatch_width, 0); + const fs_reg flag = retype(brw_flag_reg(0, 0), + BRW_REGISTER_TYPE_UD); + + /* Due to the lack of flag register allocation we need to save + * and restore the flag register if it's live. + */ + const bool save_flag = flag_liveout & + flag_mask(flag, dispatch_width / 8); + const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type); + + if (save_flag) + ubld.group(1, 0).MOV(tmp, flag); + + ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS); + + set_predicate(pred, inst); + inst->flag_subreg = 0; + + if (save_flag) + ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp); + + progress = true; + } + break; + } + + if (inst == halt_start) + depth--; + + flag_liveout |= inst->flags_read(devinfo); + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; } void -fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling) +fs_visitor::allocate_registers(bool allow_spilling) { bool allocated; @@ -7572,12 +7786,25 @@ fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling) break; } - /* We only allow spilling for the last schedule mode and only if the - * allow_spilling parameter and dispatch width work out ok. + /* Scheduling may create additional opportunities for CMOD propagation, + * so let's do it again. If CMOD propagation made any progress, + * eliminate dead code one more time. */ + bool progress = false; + const int iteration = 99; + int pass_num = 0; + + if (OPT(opt_cmod_propagation)) { + /* dead_code_eliminate "undoes" the fixing done by + * fixup_3src_null_dest, so we have to do it again if + * dead_code_eliminiate makes any progress. + */ + if (OPT(dead_code_eliminate)) + fixup_3src_null_dest(); + } + bool can_spill = allow_spilling && - (i == ARRAY_SIZE(pre_modes) - 1) && - dispatch_width == min_dispatch_width; + (i == ARRAY_SIZE(pre_modes) - 1); /* We should only spill registers on the last scheduling. */ assert(!spilled_any_registers); @@ -7588,20 +7815,8 @@ fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling) } if (!allocated) { - if (!allow_spilling) - fail("Failure to register allocate and spilling is not allowed."); - - /* We assume that any spilling is worse than just dropping back to - * SIMD8. There's probably actually some intermediate point where - * SIMD16 with a couple of spills is still better. - */ - if (dispatch_width > min_dispatch_width) { - fail("Failure to register allocate. Reduce number of " - "live scalar values to avoid this."); - } - - /* If we failed to allocate, we must have a reason */ - assert(failed); + fail("Failure to register allocate. Reduce number of " + "live scalar values to avoid this."); } else if (spilled_any_registers) { compiler->shader_perf_log(log_data, "%s shader triggered register spilling. 
" @@ -7628,7 +7843,7 @@ fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling) prog_data->total_scratch = brw_get_scratch_size(last_scratch); - if (stage == MESA_SHADER_COMPUTE) { + if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) { if (devinfo->is_haswell) { /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space" * field documentation, Haswell supports a minimum of 2kB of @@ -7690,7 +7905,7 @@ fs_visitor::run_vs() assign_vs_urb_setup(); fixup_3src_null_dest(); - allocate_registers(8, true); + allocate_registers(true /* allow_spilling */); return !failed; } @@ -7811,7 +8026,7 @@ fs_visitor::run_tcs() assign_tcs_urb_setup(); fixup_3src_null_dest(); - allocate_registers(8, true); + allocate_registers(true /* allow_spilling */); return !failed; } @@ -7845,7 +8060,7 @@ fs_visitor::run_tes() assign_tes_urb_setup(); fixup_3src_null_dest(); - allocate_registers(8, true); + allocate_registers(true /* allow_spilling */); return !failed; } @@ -7894,7 +8109,7 @@ fs_visitor::run_gs() assign_gs_urb_setup(); fixup_3src_null_dest(); - allocate_registers(8, true); + allocate_registers(true /* allow_spilling */); return !failed; } @@ -7922,6 +8137,8 @@ gen9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data) wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; wm_prog_data->num_varying_inputs = 1; + + brw_compute_urb_setup_index(wm_prog_data); } bool @@ -7959,13 +8176,20 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) * Initialize it with the dispatched pixels. */ if (wm_prog_data->uses_kill) { - const fs_reg dispatch_mask = - devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0); - bld.exec_all().group(1, 0) - .MOV(retype(brw_flag_reg(0, 1), BRW_REGISTER_TYPE_UW), - retype(dispatch_mask, BRW_REGISTER_TYPE_UW)); + const unsigned lower_width = MIN2(dispatch_width, 16); + for (unsigned i = 0; i < dispatch_width / lower_width; i++) { + const fs_reg dispatch_mask = + devinfo->gen >= 6 ? brw_vec1_grf((i ? 
2 : 1), 7) : + brw_vec1_grf(0, 0); + bld.exec_all().group(1, 0) + .MOV(sample_mask_reg(bld.group(lower_width, i)), + retype(dispatch_mask, BRW_REGISTER_TYPE_UW)); + } } + if (nir->info.writes_memory) + wm_prog_data->has_side_effects = true; + emit_nir_code(); if (failed) @@ -7994,7 +8218,8 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) assign_urb_setup(); fixup_3src_null_dest(); - allocate_registers(8, allow_spilling); + + allocate_registers(allow_spilling); if (failed) return false; @@ -8004,10 +8229,9 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) } bool -fs_visitor::run_cs(unsigned min_dispatch_width) +fs_visitor::run_cs(bool allow_spilling) { - assert(stage == MESA_SHADER_COMPUTE); - assert(dispatch_width >= min_dispatch_width); + assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); setup_cs_payload(); @@ -8038,7 +8262,7 @@ fs_visitor::run_cs(unsigned min_dispatch_width) assign_curb_setup(); fixup_3src_null_dest(); - allocate_registers(min_dispatch_width, true); + allocate_registers(allow_spilling); if (failed) return false; @@ -8126,7 +8350,7 @@ brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, { prog_data->flat_inputs = 0; - nir_foreach_variable(var, &shader->inputs) { + nir_foreach_shader_in_variable(var, shader) { unsigned slots = glsl_count_attribute_slots(var->type, false); for (unsigned s = 0; s < slots; s++) { int input_index = prog_data->urb_setup[var->data.location + s]; @@ -8175,8 +8399,8 @@ computed_depth_mode(const nir_shader *shader) * * This should be replaced by global value numbering someday. */ -static bool -move_interpolation_to_top(nir_shader *nir) +bool +brw_nir_move_interpolation_to_top(nir_shader *nir) { bool progress = false; @@ -8228,9 +8452,8 @@ move_interpolation_to_top(nir_shader *nir) } } } - nir_metadata_preserve(f->impl, (nir_metadata) - ((unsigned) nir_metadata_block_index | - (unsigned) nir_metadata_dominance)); + nir_metadata_preserve(f->impl, nir_metadata_block_index | + nir_metadata_dominance); } return progress; @@ -8241,8 +8464,8 @@ move_interpolation_to_top(nir_shader *nir) * * Useful when rendering to a non-multisampled buffer. */ -static bool -demote_sample_qualifiers(nir_shader *nir) +bool +brw_nir_demote_sample_qualifiers(nir_shader *nir) { bool progress = true; @@ -8274,14 +8497,71 @@ demote_sample_qualifiers(nir_shader *nir) } } - nir_metadata_preserve(f->impl, (nir_metadata) - ((unsigned) nir_metadata_block_index | - (unsigned) nir_metadata_dominance)); + nir_metadata_preserve(f->impl, nir_metadata_block_index | + nir_metadata_dominance); } return progress; } +void +brw_nir_populate_wm_prog_data(const nir_shader *shader, + const struct gen_device_info *devinfo, + const struct brw_wm_prog_key *key, + struct brw_wm_prog_data *prog_data) +{ + prog_data->uses_src_depth = prog_data->uses_src_w = + shader->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_FRAG_COORD); + + /* key->alpha_test_func means simulating alpha testing via discards, + * so the shader definitely kills pixels. 
+ */ + prog_data->uses_kill = shader->info.fs.uses_discard || + key->alpha_test_func; + prog_data->uses_omask = !key->ignore_sample_mask_out && + (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + + prog_data->persample_dispatch = + key->multisample_fbo && + (key->persample_interp || + (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID | + SYSTEM_BIT_SAMPLE_POS)) || + shader->info.fs.uses_sample_qualifier || + shader->info.outputs_read); + + if (devinfo->gen >= 6) { + prog_data->uses_sample_mask = + shader->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN; + + /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * POSOFFSET_SAMPLE" + * + * So we can only really get sample positions if we are doing real + * per-sample dispatch. If we need gl_SamplePosition and we don't have + * persample dispatch, we hard-code it to 0.5. + */ + prog_data->uses_pos_offset = prog_data->persample_dispatch && + (shader->info.system_values_read & SYSTEM_BIT_SAMPLE_POS); + } + + prog_data->has_render_target_reads = shader->info.outputs_read != 0ull; + + prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; + prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage; + prog_data->inner_coverage = shader->info.fs.inner_coverage; + + prog_data->barycentric_interp_modes = + brw_compute_barycentric_interp_modes(devinfo, shader); + + calculate_urb_setup(devinfo, key, prog_data, shader); + brw_compute_flat_inputs(prog_data, shader); +} + /** * Pre-gen6, the register file of the EUs was shared between threads, * and each thread used some subset allocated on a 16-register block @@ -8298,7 +8578,7 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, - nir_shader *shader, + nir_shader *nir, int shader_time_index8, int shader_time_index16, int shader_time_index32, bool allow_spilling, bool use_rep_send, struct brw_vue_map *vue_map, @@ -8306,15 +8586,14 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, char **error_str) { const struct gen_device_info *devinfo = compiler->devinfo; + const unsigned max_subgroup_size = compiler->devinfo->gen >= 6 ? 32 : 16; - unsigned max_subgroup_size = unlikely(INTEL_DEBUG & DEBUG_DO32) ? 32 : 16; - - brw_nir_apply_key(shader, compiler, &key->base, max_subgroup_size, true); - brw_nir_lower_fs_inputs(shader, devinfo, key); - brw_nir_lower_fs_outputs(shader); + brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size, true); + brw_nir_lower_fs_inputs(nir, devinfo, key); + brw_nir_lower_fs_outputs(nir); if (devinfo->gen < 6) - brw_setup_vue_interpolation(vue_map, shader, prog_data); + brw_setup_vue_interpolation(vue_map, nir, prog_data); /* From the SKL PRM, Volume 7, "Alpha Coverage": * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in @@ -8325,97 +8604,98 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, * offset to determine render target 0 store instruction in * emit_alpha_to_coverage pass. 
*/ - NIR_PASS_V(shader, nir_opt_constant_folding); - NIR_PASS_V(shader, brw_nir_lower_alpha_to_coverage); + NIR_PASS_V(nir, nir_opt_constant_folding); + NIR_PASS_V(nir, brw_nir_lower_alpha_to_coverage); } if (!key->multisample_fbo) - NIR_PASS_V(shader, demote_sample_qualifiers); - NIR_PASS_V(shader, move_interpolation_to_top); - brw_postprocess_nir(shader, compiler, true); + NIR_PASS_V(nir, brw_nir_demote_sample_qualifiers); + NIR_PASS_V(nir, brw_nir_move_interpolation_to_top); + brw_postprocess_nir(nir, compiler, true); - /* key->alpha_test_func means simulating alpha testing via discards, - * so the shader definitely kills pixels. - */ - prog_data->uses_kill = shader->info.fs.uses_discard || - key->alpha_test_func; - prog_data->uses_omask = key->multisample_fbo && - shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); - prog_data->computed_depth_mode = computed_depth_mode(shader); - prog_data->computed_stencil = - shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); - - prog_data->persample_dispatch = - key->multisample_fbo && - (key->persample_interp || - (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID | - SYSTEM_BIT_SAMPLE_POS)) || - shader->info.fs.uses_sample_qualifier || - shader->info.outputs_read); - - prog_data->has_render_target_reads = shader->info.outputs_read != 0ull; - - prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; - prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage; - prog_data->inner_coverage = shader->info.fs.inner_coverage; - - prog_data->barycentric_interp_modes = - brw_compute_barycentric_interp_modes(compiler->devinfo, shader); - - calculate_urb_setup(devinfo, key, prog_data, shader); - brw_compute_flat_inputs(prog_data, shader); + brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data); + fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL; + float throughput = 0; + bool has_spilled = false; - fs_visitor v8(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, shader, 8, - shader_time_index8); - if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) { + v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, + &prog_data->base, nir, 8, shader_time_index8); + if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { if (error_str) - *error_str = ralloc_strdup(mem_ctx, v8.fail_msg); + *error_str = ralloc_strdup(mem_ctx, v8->fail_msg); + delete v8; return NULL; } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) { - simd8_cfg = v8.cfg; - prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs; - prog_data->reg_blocks_8 = brw_register_blocks(v8.grf_used); + simd8_cfg = v8->cfg; + prog_data->base.dispatch_grf_start_reg = v8->payload.num_regs; + prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used); + const performance &perf = v8->performance_analysis.require(); + throughput = MAX2(throughput, perf.throughput); + has_spilled = v8->spilled_any_registers; + allow_spilling = false; + } + + /* Limit dispatch width to simd8 with dual source blending on gen8. 
+ * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917 + */ + if (devinfo->gen == 8 && prog_data->dual_src_blend && + !(INTEL_DEBUG & DEBUG_NO8)) { + assert(!use_rep_send); + v8->limit_dispatch_width(8, "gen8 workaround: " + "using SIMD8 when dual src blending.\n"); } - if (v8.max_dispatch_width >= 16 && + if (!has_spilled && + v8->max_dispatch_width >= 16 && likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) { /* Try a SIMD16 compile */ - fs_visitor v16(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, shader, 16, - shader_time_index16); - v16.import_uniforms(&v8); - if (!v16.run_fs(allow_spilling, use_rep_send)) { + v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, + &prog_data->base, nir, 16, shader_time_index16); + v16->import_uniforms(v8); + if (!v16->run_fs(allow_spilling, use_rep_send)) { compiler->shader_perf_log(log_data, "SIMD16 shader failed to compile: %s", - v16.fail_msg); + v16->fail_msg); } else { - simd16_cfg = v16.cfg; - prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; - prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used); + simd16_cfg = v16->cfg; + prog_data->dispatch_grf_start_reg_16 = v16->payload.num_regs; + prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used); + const performance &perf = v16->performance_analysis.require(); + throughput = MAX2(throughput, perf.throughput); + has_spilled = v16->spilled_any_registers; + allow_spilling = false; } } + const bool simd16_failed = v16 && !simd16_cfg; + /* Currently, the compiler only supports SIMD32 on SNB+ */ - if (v8.max_dispatch_width >= 32 && !use_rep_send && - compiler->devinfo->gen >= 6 && - unlikely(INTEL_DEBUG & DEBUG_DO32)) { + if (!has_spilled && + v8->max_dispatch_width >= 32 && !use_rep_send && + devinfo->gen >= 6 && !simd16_failed && + !(INTEL_DEBUG & DEBUG_NO32)) { /* Try a SIMD32 compile */ - fs_visitor v32(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, shader, 32, - shader_time_index32); - v32.import_uniforms(&v8); - if (!v32.run_fs(allow_spilling, false)) { + v32 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, + &prog_data->base, nir, 32, shader_time_index32); + v32->import_uniforms(v8); + if (!v32->run_fs(allow_spilling, false)) { compiler->shader_perf_log(log_data, "SIMD32 shader failed to compile: %s", - v32.fail_msg); + v32->fail_msg); } else { - simd32_cfg = v32.cfg; - prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs; - prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used); + const performance &perf = v32->performance_analysis.require(); + + if (!(INTEL_DEBUG & DEBUG_DO32) && throughput >= perf.throughput) { + compiler->shader_perf_log(log_data, "SIMD32 shader inefficient\n"); + } else { + simd32_cfg = v32->cfg; + prog_data->dispatch_grf_start_reg_32 = v32->payload.num_regs; + prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used); + throughput = MAX2(throughput, perf.throughput); + } } } @@ -8456,52 +8736,69 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, if (prog_data->persample_dispatch) { /* Starting with SandyBridge (where we first get MSAA), the different * pixel dispatch combinations are grouped into classifications A - * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware + * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On most hardware * generations, the only configurations supporting persample dispatch - * are are this in which only one dispatch width is enabled. + * are those in which only one dispatch width is enabled. 
+ * + * The Gen12 hardware spec has a similar dispatch grouping table, but + * the following conflicting restriction applies (from the page on + * "Structure_3DSTATE_PS_BODY"), so we need to keep the SIMD16 shader: + * + * "SIMD32 may only be enabled if SIMD16 or (dual)SIMD8 is also + * enabled." */ if (simd32_cfg || simd16_cfg) simd8_cfg = NULL; - if (simd32_cfg) + if (simd32_cfg && devinfo->gen < 12) simd16_cfg = NULL; } fs_generator g(compiler, log_data, mem_ctx, &prog_data->base, - v8.shader_stats, v8.runtime_check_aads_emit, - MESA_SHADER_FRAGMENT); + v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT); if (unlikely(INTEL_DEBUG & DEBUG_WM)) { g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s", - shader->info.label ? - shader->info.label : "unnamed", - shader->info.name)); + nir->info.label ? + nir->info.label : "unnamed", + nir->info.name)); } if (simd8_cfg) { prog_data->dispatch_8 = true; - g.generate_code(simd8_cfg, 8, stats); + g.generate_code(simd8_cfg, 8, v8->shader_stats, + v8->performance_analysis.require(), stats); stats = stats ? stats + 1 : NULL; } if (simd16_cfg) { prog_data->dispatch_16 = true; - prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16, stats); + prog_data->prog_offset_16 = g.generate_code( + simd16_cfg, 16, v16->shader_stats, + v16->performance_analysis.require(), stats); stats = stats ? stats + 1 : NULL; } if (simd32_cfg) { prog_data->dispatch_32 = true; - prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32, stats); + prog_data->prog_offset_32 = g.generate_code( + simd32_cfg, 32, v32->shader_stats, + v32->performance_analysis.require(), stats); stats = stats ? stats + 1 : NULL; } + g.add_const_data(nir->constant_data, nir->constant_data_size); + + delete v8; + delete v16; + delete v32; + return g.get_assembly(); } fs_reg * fs_visitor::emit_cs_work_group_id_setup() { - assert(stage == MESA_SHADER_COMPUTE); + assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type)); @@ -8516,6 +8813,16 @@ fs_visitor::emit_cs_work_group_id_setup() return reg; } +unsigned +brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, + unsigned threads) +{ + assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0); + assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0); + return cs_prog_data->push.per_thread.size * threads + + cs_prog_data->push.cross_thread.size; +} + static void fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords) { @@ -8554,11 +8861,6 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo, fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords); fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords); - unsigned total_dwords = - (cs_prog_data->push.per_thread.size * cs_prog_data->threads + - cs_prog_data->push.cross_thread.size) / 4; - fill_push_const_block_info(&cs_prog_data->push.total, total_dwords); - assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 || cs_prog_data->push.per_thread.size == 0); assert(cs_prog_data->push.cross_thread.dwords + @@ -8566,13 +8868,54 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo, prog_data->nr_params); } +static bool +filter_simd(const nir_instr *instr, const void *_options) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + switch (nir_instr_as_intrinsic(instr)->intrinsic) { + case nir_intrinsic_load_simd_width_intel: + case nir_intrinsic_load_subgroup_id: + return true; + + 
default: + return false; + } +} + +static nir_ssa_def * +lower_simd(nir_builder *b, nir_instr *instr, void *options) +{ + uintptr_t simd_width = (uintptr_t)options; + + switch (nir_instr_as_intrinsic(instr)->intrinsic) { + case nir_intrinsic_load_simd_width_intel: + return nir_imm_int(b, simd_width); + + case nir_intrinsic_load_subgroup_id: + /* If the whole workgroup fits in one thread, we can lower subgroup_id + * to a constant zero. + */ + if (!b->shader->info.cs.local_size_variable) { + unsigned local_workgroup_size = b->shader->info.cs.local_size[0] * + b->shader->info.cs.local_size[1] * + b->shader->info.cs.local_size[2]; + if (local_workgroup_size <= simd_width) + return nir_imm_int(b, 0); + } + return NULL; + + default: + return NULL; + } +} + static void -cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size) +brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width) { - cs_prog_data->simd_size = size; - unsigned group_size = cs_prog_data->local_size[0] * - cs_prog_data->local_size[1] * cs_prog_data->local_size[2]; - cs_prog_data->threads = (group_size + size - 1) / size; + nir_shader_lower_instructions(nir, filter_simd, lower_simd, + (void *)(uintptr_t)dispatch_width); } static nir_shader * @@ -8585,7 +8928,7 @@ compile_cs_to_nir(const struct brw_compiler *compiler, nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true); - NIR_PASS_V(shader, brw_nir_lower_cs_intrinsics, dispatch_width); + NIR_PASS_V(shader, brw_nir_lower_simd, dispatch_width); /* Clean up after the local index and ID calculations. */ NIR_PASS_V(shader, nir_opt_constant_folding); @@ -8601,30 +8944,40 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct brw_cs_prog_key *key, struct brw_cs_prog_data *prog_data, - const nir_shader *src_shader, + const nir_shader *nir, int shader_time_index, struct brw_compile_stats *stats, char **error_str) { - prog_data->base.total_shared = src_shader->info.cs.shared_size; - prog_data->local_size[0] = src_shader->info.cs.local_size[0]; - prog_data->local_size[1] = src_shader->info.cs.local_size[1]; - prog_data->local_size[2] = src_shader->info.cs.local_size[2]; - prog_data->slm_size = src_shader->num_shared; - unsigned local_workgroup_size = - src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] * - src_shader->info.cs.local_size[2]; - - unsigned min_dispatch_width = - DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads); - min_dispatch_width = MAX2(8, min_dispatch_width); - min_dispatch_width = util_next_power_of_two(min_dispatch_width); - assert(min_dispatch_width <= 32); - unsigned max_dispatch_width = 32; + prog_data->base.total_shared = nir->info.cs.shared_size; + prog_data->slm_size = nir->shared_size; - fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; - fs_visitor *v = NULL; - const char *fail_msg = NULL; + /* Generate code for all the possible SIMD variants. 
*/ + bool generate_all; + + unsigned min_dispatch_width; + unsigned max_dispatch_width; + + if (nir->info.cs.local_size_variable) { + generate_all = true; + min_dispatch_width = 8; + max_dispatch_width = 32; + } else { + generate_all = false; + prog_data->local_size[0] = nir->info.cs.local_size[0]; + prog_data->local_size[1] = nir->info.cs.local_size[1]; + prog_data->local_size[2] = nir->info.cs.local_size[2]; + unsigned local_workgroup_size = prog_data->local_size[0] * + prog_data->local_size[1] * + prog_data->local_size[2]; + + /* Limit max_threads to 64 for the GPGPU_WALKER command */ + const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads); + min_dispatch_width = util_next_power_of_two( + MAX2(8, DIV_ROUND_UP(local_workgroup_size, max_threads))); + assert(min_dispatch_width <= 32); + max_dispatch_width = 32; + } if ((int)key->base.subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) { /* These enum values are expressly chosen to be equal to the subgroup @@ -8637,70 +8990,98 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, required_dispatch_width == 32); if (required_dispatch_width < min_dispatch_width || required_dispatch_width > max_dispatch_width) { - fail_msg = "Cannot satisfy explicit subgroup size"; - } else { - min_dispatch_width = max_dispatch_width = required_dispatch_width; + if (error_str) { + *error_str = ralloc_strdup(mem_ctx, + "Cannot satisfy explicit subgroup size"); + } + return NULL; } + min_dispatch_width = max_dispatch_width = required_dispatch_width; } - /* Now the main event: Visit the shader IR and generate our CS IR for it. - */ - if (!fail_msg && min_dispatch_width <= 8 && max_dispatch_width >= 8) { + assert(min_dispatch_width <= max_dispatch_width); + + fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; + fs_visitor *v = NULL; + + if (likely(!(INTEL_DEBUG & DEBUG_NO8)) && + min_dispatch_width <= 8 && max_dispatch_width >= 8) { nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key, - src_shader, 8); + nir, 8); v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, &prog_data->base, nir8, 8, shader_time_index); - if (!v8->run_cs(min_dispatch_width)) { - fail_msg = v8->fail_msg; - } else { - /* We should always be able to do SIMD32 for compute shaders */ - assert(v8->max_dispatch_width >= 32); - - v = v8; - cs_set_simd_size(prog_data, 8); - cs_fill_push_const_info(compiler->devinfo, prog_data); + if (!v8->run_cs(true /* allow_spilling */)) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v8->fail_msg); + delete v8; + return NULL; } + + /* We should always be able to do SIMD32 for compute shaders */ + assert(v8->max_dispatch_width >= 32); + + v = v8; + prog_data->prog_mask |= 1 << 0; + if (v8->spilled_any_registers) + prog_data->prog_spilled |= 1 << 0; + cs_fill_push_const_info(compiler->devinfo, prog_data); } if (likely(!(INTEL_DEBUG & DEBUG_NO16)) && - !fail_msg && min_dispatch_width <= 16 && max_dispatch_width >= 16) { + (generate_all || !prog_data->prog_spilled) && + min_dispatch_width <= 16 && max_dispatch_width >= 16) { /* Try a SIMD16 compile */ nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key, - src_shader, 16); + nir, 16); v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, &prog_data->base, nir16, 16, shader_time_index); if (v8) v16->import_uniforms(v8); - if (!v16->run_cs(min_dispatch_width)) { + const bool allow_spilling = generate_all || v == NULL; + if (!v16->run_cs(allow_spilling)) { compiler->shader_perf_log(log_data, "SIMD16 shader failed to compile: %s", 
v16->fail_msg); if (!v) { - fail_msg = - "Couldn't generate SIMD16 program and not " - "enough threads for SIMD8"; + assert(v8 == NULL); + if (error_str) { + *error_str = ralloc_asprintf( + mem_ctx, "Not enough threads for SIMD8 and " + "couldn't generate SIMD16: %s", v16->fail_msg); + } + delete v16; + return NULL; } } else { /* We should always be able to do SIMD32 for compute shaders */ assert(v16->max_dispatch_width >= 32); v = v16; - cs_set_simd_size(prog_data, 16); + prog_data->prog_mask |= 1 << 1; + if (v16->spilled_any_registers) + prog_data->prog_spilled |= 1 << 1; cs_fill_push_const_info(compiler->devinfo, prog_data); } } - /* We should always be able to do SIMD32 for compute shaders */ - assert(!v16 || v16->max_dispatch_width >= 32); - - if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32)) && - max_dispatch_width >= 32) { + /* The SIMD32 is only enabled for cases it is needed unless forced. + * + * TODO: Use performance_analysis and drop this boolean. + */ + const bool needs_32 = v == NULL || + (INTEL_DEBUG & DEBUG_DO32) || + generate_all; + + if (likely(!(INTEL_DEBUG & DEBUG_NO32)) && + (generate_all || !prog_data->prog_spilled) && + needs_32 && + min_dispatch_width <= 32 && max_dispatch_width >= 32) { /* Try a SIMD32 compile */ nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key, - src_shader, 32); + nir, 32); v32 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, &prog_data->base, nir32, 32, shader_time_index); @@ -8709,44 +9090,91 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, else if (v16) v32->import_uniforms(v16); - if (!v32->run_cs(min_dispatch_width)) { + const bool allow_spilling = generate_all || v == NULL; + if (!v32->run_cs(allow_spilling)) { compiler->shader_perf_log(log_data, "SIMD32 shader failed to compile: %s", v32->fail_msg); if (!v) { - fail_msg = - "Couldn't generate SIMD32 program and not " - "enough threads for SIMD16"; + assert(v8 == NULL); + assert(v16 == NULL); + if (error_str) { + *error_str = ralloc_asprintf( + mem_ctx, "Not enough threads for SIMD16 and " + "couldn't generate SIMD32: %s", v32->fail_msg); + } + delete v32; + return NULL; } } else { v = v32; - cs_set_simd_size(prog_data, 32); + prog_data->prog_mask |= 1 << 2; + if (v32->spilled_any_registers) + prog_data->prog_spilled |= 1 << 2; cs_fill_push_const_info(compiler->devinfo, prog_data); } } - const unsigned *ret = NULL; - if (unlikely(v == NULL)) { - assert(fail_msg); - if (error_str) - *error_str = ralloc_strdup(mem_ctx, fail_msg); - } else { - fs_generator g(compiler, log_data, mem_ctx, &prog_data->base, - v->shader_stats, v->runtime_check_aads_emit, - MESA_SHADER_COMPUTE); - if (INTEL_DEBUG & DEBUG_CS) { - char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s", - src_shader->info.label ? - src_shader->info.label : "unnamed", - src_shader->info.name); - g.enable_debug(name); + if (unlikely(!v && (INTEL_DEBUG & (DEBUG_NO8 | DEBUG_NO16 | DEBUG_NO32)))) { + if (error_str) { + *error_str = + ralloc_strdup(mem_ctx, + "Cannot satisfy INTEL_DEBUG flags SIMD restrictions"); } + return NULL; + } - g.generate_code(v->cfg, prog_data->simd_size, stats); + assert(v); - ret = g.get_assembly(); + const unsigned *ret = NULL; + + fs_generator g(compiler, log_data, mem_ctx, &prog_data->base, + v->runtime_check_aads_emit, MESA_SHADER_COMPUTE); + if (INTEL_DEBUG & DEBUG_CS) { + char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s", + nir->info.label ? 
+ nir->info.label : "unnamed", + nir->info.name); + g.enable_debug(name); + } + + if (generate_all) { + if (prog_data->prog_mask & (1 << 0)) { + assert(v8); + prog_data->prog_offset[0] = + g.generate_code(v8->cfg, 8, v8->shader_stats, + v8->performance_analysis.require(), stats); + stats = stats ? stats + 1 : NULL; + } + + if (prog_data->prog_mask & (1 << 1)) { + assert(v16); + prog_data->prog_offset[1] = + g.generate_code(v16->cfg, 16, v16->shader_stats, + v16->performance_analysis.require(), stats); + stats = stats ? stats + 1 : NULL; + } + + if (prog_data->prog_mask & (1 << 2)) { + assert(v32); + prog_data->prog_offset[2] = + g.generate_code(v32->cfg, 32, v32->shader_stats, + v32->performance_analysis.require(), stats); + stats = stats ? stats + 1 : NULL; + } + } else { + /* Only one dispatch width will be valid, and will be at offset 0, + * which is already the default value of prog_offset_* fields. + */ + prog_data->prog_mask = 1 << (v->dispatch_width / 16); + g.generate_code(v->cfg, v->dispatch_width, v->shader_stats, + v->performance_analysis.require(), stats); } + g.add_const_data(nir->constant_data, nir->constant_data_size); + + ret = g.get_assembly(); + delete v8; delete v16; delete v32; @@ -8754,6 +9182,41 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, return ret; } +unsigned +brw_cs_simd_size_for_group_size(const struct gen_device_info *devinfo, + const struct brw_cs_prog_data *cs_prog_data, + unsigned group_size) +{ + const unsigned mask = cs_prog_data->prog_mask; + assert(mask != 0); + + static const unsigned simd8 = 1 << 0; + static const unsigned simd16 = 1 << 1; + static const unsigned simd32 = 1 << 2; + + if (unlikely(INTEL_DEBUG & DEBUG_DO32) && (mask & simd32)) + return 32; + + /* Limit max_threads to 64 for the GPGPU_WALKER command */ + const uint32_t max_threads = MIN2(64, devinfo->max_cs_threads); + + if ((mask & simd8) && group_size <= 8 * max_threads) { + /* Prefer SIMD16 if can do without spilling. Matches logic in + * brw_compile_cs. + */ + if ((mask & simd16) && (~cs_prog_data->prog_spilled & simd16)) + return 16; + return 8; + } + + if ((mask & simd16) && group_size <= 16 * max_threads) + return 16; + + assert(mask & simd32); + assert(group_size <= 32 * max_threads); + return 32; +} + /** * Test the dispatch mask packing assumptions of * brw_stage_has_packed_dispatch(). Call this from e.g. the top of @@ -8783,3 +9246,11 @@ brw_fs_test_dispatch_packing(const fs_builder &bld) set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE)); } } + +unsigned +fs_visitor::workgroup_size() const +{ + assert(stage == MESA_SHADER_COMPUTE); + const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data); + return cs->local_size[0] * cs->local_size[1] * cs->local_size[2]; +}
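
With variable workgroup sizes the compiler may now emit all three SIMD variants, so the final width has to be chosen when the workgroup size is actually known. Below is a minimal standalone sketch of that dispatch-time selection, mirroring the brw_cs_simd_size_for_group_size() hunk above; pick_cs_simd_size() and the example numbers are illustrative stand-ins, the INTEL_DEBUG=do32 override is omitted, "mask"/"spilled" use the same bit layout as prog_mask/prog_spilled (bit 0 = SIMD8, bit 1 = SIMD16, bit 2 = SIMD32), and max_threads stands in for MIN2(64, devinfo->max_cs_threads).

#include <assert.h>
#include <stdio.h>

static unsigned
pick_cs_simd_size(unsigned mask, unsigned spilled,
                  unsigned group_size, unsigned max_threads)
{
   const unsigned simd8 = 1u << 0, simd16 = 1u << 1, simd32 = 1u << 2;

   if ((mask & simd8) && group_size <= 8 * max_threads) {
      /* Prefer the SIMD16 variant when it exists and did not spill. */
      if ((mask & simd16) && !(spilled & simd16))
         return 16;
      return 8;
   }
   if ((mask & simd16) && group_size <= 16 * max_threads)
      return 16;
   assert((mask & simd32) && group_size <= 32 * max_threads);
   return 32;
}

int main(void)
{
   /* All three variants compiled cleanly, 64 hardware threads available:
    * a 16x16x1 group (256 invocations) fits in 8*64 = 512 SIMD8 threads,
    * and the unspilled SIMD16 variant is preferred. */
   printf("%u\n", pick_cs_simd_size(0x7, 0x0, 256, 64));   /* 16 */
   /* A 32x32x2 group (2048 invocations) only fits at SIMD32. */
   printf("%u\n", pick_cs_simd_size(0x7, 0x0, 2048, 64));  /* 32 */
   return 0;
}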
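
For a fixed local size, the minimum dispatch width is derived from how many invocations must be packed into at most 64 hardware threads (the GPGPU_WALKER limit noted above), and brw_nir_lower_simd() can fold gl_SubgroupID to a constant zero whenever the whole workgroup fits in a single SIMD thread. The sketch below restates both calculations under those assumptions; next_pow2(), cs_min_dispatch_width() and subgroup_id_is_constant_zero() are illustrative helpers, not driver functions.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for util_next_power_of_two(). */
static unsigned
next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

/* Mirrors the fixed-local-size min_dispatch_width computation in
 * brw_compile_cs(): the group must fit in at most 64 threads and the
 * width is never below SIMD8. */
static unsigned
cs_min_dispatch_width(unsigned group_size, unsigned max_cs_threads)
{
   const unsigned max_threads = max_cs_threads < 64 ? max_cs_threads : 64;
   const unsigned per_thread = (group_size + max_threads - 1) / max_threads;
   return next_pow2(per_thread > 8 ? per_thread : 8);
}

/* Mirrors the lower_simd() special case: gl_SubgroupID is zero when the
 * entire workgroup runs inside one SIMD thread. */
static bool
subgroup_id_is_constant_zero(unsigned group_size, unsigned simd_width)
{
   return group_size <= simd_width;
}

int main(void)
{
   printf("%u\n", cs_min_dispatch_width(32 * 32, 64));        /* 16 */
   printf("%d\n", subgroup_id_is_constant_zero(8 * 4, 32));   /* 1  */
   return 0;
}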
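
Since the thread count can now depend on a workgroup size that is only known at dispatch time, the push-constant footprint becomes a dispatch-time computation, which is what the new brw_cs_push_const_total_size() provides. A worked example of the same arithmetic with hypothetical sizes (one GRF, REG_SIZE, is 32 bytes):

#include <assert.h>
#include <stdio.h>

#define REG_SIZE 32   /* one GRF, in bytes */

int main(void)
{
   /* Hypothetical layout: three GRFs of cross-thread constants plus one
    * per-thread GRF, dispatched with 14 threads. */
   const unsigned cross_thread_size = 3 * REG_SIZE;
   const unsigned per_thread_size   = 1 * REG_SIZE;
   const unsigned threads           = 14;

   assert(per_thread_size % REG_SIZE == 0);
   assert(cross_thread_size % REG_SIZE == 0);
   /* 32 * 14 + 96 = 544 bytes of push constants. */
   printf("%u bytes\n", per_thread_size * threads + cross_thread_size);
   return 0;
}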
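
On the fragment-shader side, wider variants are now only attempted while no narrower variant has spilled, and a SIMD32 binary is kept only when the performance analysis estimates it to beat the best narrower variant (unless INTEL_DEBUG=do32 forces it). The following is a compressed sketch of that gating, not the driver's exact control flow; the function and parameter names are illustrative and "score" is simply the analysis pass's throughput estimate, higher being better.

#include <stdbool.h>
#include <stdio.h>

/* A wider compile is only worth attempting if nothing spilled so far and
 * the visitor has not limited the dispatch width below it. */
static bool
try_wider_variant(bool has_spilled, unsigned max_dispatch_width,
                  unsigned width)
{
   return !has_spilled && max_dispatch_width >= width;
}

/* Keep the SIMD32 program only if it beats the best SIMD8/SIMD16 score,
 * mirroring the "SIMD32 shader inefficient" check; force_do32 models
 * INTEL_DEBUG=do32. */
static bool
keep_simd32(float simd32_score, float best_narrower_score, bool force_do32)
{
   return force_do32 || simd32_score > best_narrower_score;
}

int main(void)
{
   printf("%d\n", try_wider_variant(false, 32, 16));  /* 1: try SIMD16    */
   printf("%d\n", keep_simd32(0.9f, 1.0f, false));    /* 0: SIMD32 dropped */
   return 0;
}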