X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_fs.cpp;h=c2dd9da5a4923e497c50b091a67effe3a315447e;hb=9eea3df29f21eb7507354c3b1d85d238b671a211;hp=a5d403474f92ffb69e6044380717de454f3c49d2;hpb=ae3543950c93ec4ac179013cb1c7baaf6f5ef4a7;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index a5d403474f9..c2dd9da5a49 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -39,6 +39,7 @@ #include "brw_program.h" #include "brw_dead_control_flow.h" #include "compiler/glsl_types.h" +#include "program/prog_parameter.h" using namespace brw; @@ -174,40 +175,28 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, * CSE can later notice that those loads are all the same and eliminate * the redundant ones. */ - fs_reg vec4_offset = vgrf(glsl_type::int_type); + fs_reg vec4_offset = vgrf(glsl_type::uint_type); bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf)); - int scale = 1; - if (devinfo->gen == 4 && bld.dispatch_width() == 8) { - /* Pre-gen5, we can either use a SIMD8 message that requires (header, - * u, v, r) as parameters, or we can just use the SIMD16 message - * consisting of (header, u). We choose the second, at the cost of a - * longer return length. - */ - scale = 2; - } - - enum opcode op; - if (devinfo->gen >= 7) - op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7; - else - op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD; - - int regs_written = 4 * (bld.dispatch_width() / 8) * scale; - fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), dst.type); - fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset); - inst->regs_written = regs_written; + /* The pull load message will load a vec4 (16 bytes). If we are loading + * a double this means we are only loading 2 elements worth of data. + * We also want to use a 32-bit data type for the dst of the load operation + * so other parts of the driver don't get confused about the size of the + * result. 
+ */ + fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4); + fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, + vec4_result, surf_index, vec4_offset); + inst->regs_written = 4 * bld.dispatch_width() / 8; - if (devinfo->gen < 7) { - inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen); - inst->header_size = 1; - if (devinfo->gen == 4) - inst->mlen = 3; - else - inst->mlen = 1 + bld.dispatch_width() / 8; + if (type_sz(dst.type) == 8) { + shuffle_32bit_load_result_to_64bit_data( + bld, retype(vec4_result, dst.type), vec4_result, 2); } - bld.MOV(dst, offset(vec4_result, bld, ((const_offset & 0xf) / 4) * scale)); + vec4_result.type = dst.type; + bld.MOV(dst, offset(vec4_result, bld, + (const_offset & 0xf) / type_sz(vec4_result.type))); } /** @@ -374,7 +363,7 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const if (i < this->header_size) { reg.reg_offset += 1; } else { - reg.reg_offset += this->exec_size / 8; + reg = horiz_offset(reg, this->exec_size); } } @@ -433,7 +422,6 @@ fs_reg::fs_reg(struct ::brw_reg reg) : { this->reg_offset = 0; this->subreg_offset = 0; - this->reladdr = NULL; this->stride = 1; if (this->file == IMM && (this->type != BRW_REGISTER_TYPE_V && @@ -448,7 +436,6 @@ fs_reg::equals(const fs_reg &r) const { return (this->backend_reg::equals(r) && subreg_offset == r.subreg_offset && - !reladdr && !r.reladdr && stride == r.stride); } @@ -487,6 +474,8 @@ type_size_scalar(const struct glsl_type *type) case GLSL_TYPE_FLOAT: case GLSL_TYPE_BOOL: return type->components(); + case GLSL_TYPE_DOUBLE: + return type->components() * 2; case GLSL_TYPE_ARRAY: return type_size_scalar(type->fields.array) * type->length; case GLSL_TYPE_STRUCT: @@ -509,7 +498,7 @@ type_size_scalar(const struct glsl_type *type) case GLSL_TYPE_VOID: case GLSL_TYPE_ERROR: case GLSL_TYPE_INTERFACE: - case GLSL_TYPE_DOUBLE: + case GLSL_TYPE_FUNCTION: unreachable("not reached"); } @@ -529,6 +518,19 @@ type_size_vec4_times_4(const struct glsl_type *type) return 4 * type_size_vec4(type); } +/* Attribute arrays are loaded as one vec4 per element (or matrix column), + * except for double-precision types, which are loaded as one dvec4. + */ +extern "C" int +type_size_vs_input(const struct glsl_type *type) +{ + if (type->is_double()) { + return type_size_dvec4(type); + } else { + return type_size_vec4(type); + } +} + /** * Create a MOV to read the timestamp register. * @@ -700,7 +702,8 @@ fs_inst::is_partial_write() const { return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || (this->exec_size * type_sz(this->dst.type)) < 32 || - !this->dst.is_contiguous()); + !this->dst.is_contiguous() || + this->dst.subreg_offset > 0); } unsigned @@ -739,6 +742,7 @@ fs_inst::components_read(unsigned i) const case SHADER_OPCODE_LOD_LOGICAL: case SHADER_OPCODE_TG4_LOGICAL: case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM && src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM); /* Texture coordinates. */ @@ -852,7 +856,10 @@ fs_inst::regs_read(int arg) const assert(src[2].file == IMM); unsigned region_length = src[2].ud; - if (src[0].file == FIXED_GRF) { + if (src[0].file == UNIFORM) { + assert(region_length % 4 == 0); + return region_length / 4; + } else if (src[0].file == FIXED_GRF) { /* If the start of the region is not register aligned, then * there's some portion of the register that's technically * unread at the beginning. 
@@ -866,7 +873,7 @@ fs_inst::regs_read(int arg) const * unread portion at the beginning. */ if (src[0].subnr) - region_length += src[0].subnr * type_sz(src[0].type); + region_length += src[0].subnr; return DIV_ROUND_UP(region_length, REG_SIZE); } else { @@ -947,12 +954,14 @@ fs_visitor::implied_mrf_writes(fs_inst *inst) case FS_OPCODE_TXB: case SHADER_OPCODE_TXD: case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: case SHADER_OPCODE_TXF_CMS: case SHADER_OPCODE_TXF_CMS_W: case SHADER_OPCODE_TXF_MCS: case SHADER_OPCODE_TG4: case SHADER_OPCODE_TG4_OFFSET: case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: case SHADER_OPCODE_TXS: case SHADER_OPCODE_LOD: case SHADER_OPCODE_SAMPLEINFO: @@ -963,7 +972,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst) case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: case SHADER_OPCODE_GEN4_SCRATCH_READ: return 1; - case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: return inst->mlen; case SHADER_OPCODE_GEN4_SCRATCH_WRITE: return inst->mlen; @@ -1022,41 +1031,21 @@ fs_visitor::import_uniforms(fs_visitor *v) this->push_constant_loc = v->push_constant_loc; this->pull_constant_loc = v->pull_constant_loc; this->uniforms = v->uniforms; - this->param_size = v->param_size; } fs_reg * -fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer, - bool origin_upper_left) +fs_visitor::emit_fragcoord_interpolation() { assert(stage == MESA_SHADER_FRAGMENT); - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type)); fs_reg wpos = *reg; - bool flip = !origin_upper_left ^ key->render_to_fbo; /* gl_FragCoord.x */ - if (pixel_center_integer) { - bld.MOV(wpos, this->pixel_x); - } else { - bld.ADD(wpos, this->pixel_x, brw_imm_f(0.5f)); - } + bld.MOV(wpos, this->pixel_x); wpos = offset(wpos, bld, 1); /* gl_FragCoord.y */ - if (!flip && pixel_center_integer) { - bld.MOV(wpos, this->pixel_y); - } else { - fs_reg pixel_y = this->pixel_y; - float offset = (pixel_center_integer ? 
0.0f : 0.5f); - - if (flip) { - pixel_y.negate = true; - offset += key->drawable_height - 1.0f; - } - - bld.ADD(wpos, pixel_y, brw_imm_f(offset)); - } + bld.MOV(wpos, this->pixel_y); wpos = offset(wpos, bld, 1); /* gl_FragCoord.z */ @@ -1191,8 +1180,8 @@ fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name, inst->no_dd_clear = true; inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode, - mod_centroid && !key->persample_shading, - mod_sample || key->persample_shading); + mod_centroid && !key->persample_interp, + mod_sample || key->persample_interp); inst->predicate = BRW_PREDICATE_NORMAL; inst->predicate_inverse = false; if (devinfo->has_pln) @@ -1200,8 +1189,8 @@ fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name, } else { emit_linterp(*attr, fs_reg(interp), interpolation_mode, - mod_centroid && !key->persample_shading, - mod_sample || key->persample_shading); + mod_centroid && !key->persample_interp, + mod_sample || key->persample_interp); } if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) { bld.MUL(*attr, *attr, this->pixel_w); @@ -1258,10 +1247,10 @@ void fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos) { assert(stage == MESA_SHADER_FRAGMENT); - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data; assert(dst.type == BRW_REGISTER_TYPE_F); - if (key->compute_pos_offset) { + if (wm_prog_data->persample_dispatch) { /* Convert int_sample_pos to floating point */ bld.MOV(dst, int_sample_pos); /* Scale to the range [0, 1] */ @@ -1336,7 +1325,48 @@ fs_visitor::emit_sampleid_setup() const fs_builder abld = bld.annotate("compute sample id"); fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type)); - if (key->compute_sample_id) { + if (!key->multisample_fbo) { + /* As per GL_ARB_sample_shading specification: + * "When rendering to a non-multisample buffer, or if multisample + * rasterization is disabled, gl_SampleID will always be zero." + */ + abld.MOV(*reg, brw_imm_d(0)); + } else if (devinfo->gen >= 8) { + /* Sample ID comes in as 4-bit numbers in g1.0: + * + * 15:12 Slot 3 SampleID (only used in SIMD16) + * 11:8 Slot 2 SampleID (only used in SIMD16) + * 7:4 Slot 1 SampleID + * 3:0 Slot 0 SampleID + * + * Each slot corresponds to four channels, so we want to replicate each + * half-byte value to 4 channels in a row: + * + * dst+0: .7 .6 .5 .4 .3 .2 .1 .0 + * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0 + * + * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16) + * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8 + * + * First, we read g1.0 with a <1,8,0>UB region, causing the first 8 + * channels to read the first byte (7:0), and the second group of 8 + * channels to read the second byte (15:8). Then, we shift right by + * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3 + * values into place. Finally, we AND with 0xf to keep the low nibble. + * + * shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V + * and(16) dst<1>D tmp<8,8,1>W 0xf:W + * + * TODO: These payload bits exist on Gen7 too, but they appear to always + * be zero, so this code fails to work. We should find out why. 
+ */ + fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W); + + abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0), + BRW_REGISTER_TYPE_B), 1, 8, 0)), + brw_imm_v(0x44440000)); + abld.AND(*reg, tmp, brw_imm_w(0xf)); + } else { fs_reg t1(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_D); t1.set_smear(0); fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W); @@ -1364,32 +1394,63 @@ fs_visitor::emit_sampleid_setup() /* SKL+ has an extra bit for the Starting Sample Pair Index to * accomodate 16x MSAA. */ - unsigned sspi_mask = devinfo->gen >= 9 ? 0x1c0 : 0xc0; - abld.exec_all().group(1, 0) .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), - brw_imm_ud(sspi_mask)); + brw_imm_ud(0xc0)); abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5)); /* This works for both SIMD8 and SIMD16 */ - abld.exec_all().group(4, 0) - .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)); + abld.exec_all().group(4, 0).MOV(t2, brw_imm_v(0x3210)); /* This special instruction takes care of setting vstride=1, * width=4, hstride=0 of t2 during an ADD instruction. */ abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2); - } else { - /* As per GL_ARB_sample_shading specification: - * "When rendering to a non-multisample buffer, or if multisample - * rasterization is disabled, gl_SampleID will always be zero." - */ - abld.MOV(*reg, brw_imm_d(0)); } return reg; } +fs_reg * +fs_visitor::emit_samplemaskin_setup() +{ + assert(stage == MESA_SHADER_FRAGMENT); + brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data; + assert(devinfo->gen >= 6); + + fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type)); + + fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0), + BRW_REGISTER_TYPE_D)); + + if (wm_prog_data->persample_dispatch) { + /* gl_SampleMaskIn[] comes from two sources: the input coverage mask, + * and a mask representing which sample is being processed by the + * current shader invocation. + * + * From the OES_sample_variables specification: + * "When per-sample shading is active due to the use of a fragment input + * qualified by "sample" or due to the use of the gl_SampleID or + * gl_SamplePosition variables, only the bit for the current sample is + * set in gl_SampleMaskIn." + */ + const fs_builder abld = bld.annotate("compute gl_SampleMaskIn"); + + if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) + nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup(); + + fs_reg one = vgrf(glsl_type::int_type); + fs_reg enabled_mask = vgrf(glsl_type::int_type); + abld.MOV(one, brw_imm_d(1)); + abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]); + abld.AND(*reg, enabled_mask, coverage_mask); + } else { + /* In per-pixel mode, the coverage mask is sufficient. 
*/ + *reg = coverage_mask; + } + return reg; +} + fs_reg fs_visitor::resolve_source_modifiers(const fs_reg &src) { @@ -1473,20 +1534,6 @@ fs_visitor::emit_gs_thread_end() void fs_visitor::assign_curb_setup() { - if (dispatch_width == 8) { - prog_data->dispatch_grf_start_reg = payload.num_regs; - } else { - if (stage == MESA_SHADER_FRAGMENT) { - brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; - prog_data->dispatch_grf_start_reg_16 = payload.num_regs; - } else if (stage == MESA_SHADER_COMPUTE) { - brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data; - prog_data->dispatch_grf_start_reg_16 = payload.num_regs; - } else { - unreachable("Unsupported shader type!"); - } - } - prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8; /* Map the offsets in the UNIFORM file to fixed HW regs. */ @@ -1652,11 +1699,28 @@ fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst) inst->src[i].nr + inst->src[i].reg_offset; - unsigned width = inst->src[i].stride == 0 ? 1 : inst->exec_size; + /* As explained at brw_reg_from_fs_reg, From the Haswell PRM: + * + * VertStride must be used to cross GRF register boundaries. This + * rule implies that elements within a 'Width' cannot cross GRF + * boundaries. + * + * So, for registers that are large enough, we have to split the exec + * size in two and trust the compression state to sort it out. + */ + unsigned total_size = inst->exec_size * + inst->src[i].stride * + type_sz(inst->src[i].type); + + assert(total_size <= 2 * REG_SIZE); + const unsigned exec_size = + (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2; + + unsigned width = inst->src[i].stride == 0 ? 1 : exec_size; struct brw_reg reg = stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), inst->src[i].subreg_offset), - inst->exec_size * inst->src[i].stride, + exec_size * inst->src[i].stride, width, inst->src[i].stride); reg.abs = inst->src[i].abs; reg.negate = inst->src[i].negate; @@ -1674,7 +1738,7 @@ fs_visitor::assign_vs_urb_setup() assert(stage == MESA_SHADER_VERTEX); /* Each attribute is 4 regs. */ - this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes; + this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots; assert(vs_prog_data->base.urb_read_length <= 15); @@ -1684,6 +1748,17 @@ fs_visitor::assign_vs_urb_setup() } } +void +fs_visitor::assign_tcs_single_patch_urb_setup() +{ + assert(stage == MESA_SHADER_TESS_CTRL); + + /* Rewrite all ATTR file references to HW_REGs. */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + convert_attr_sources_to_hw_regs(inst); + } +} + void fs_visitor::assign_tes_urb_setup() { @@ -1918,6 +1993,45 @@ fs_visitor::compact_virtual_grfs() return progress; } +static void +set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous, + int *push_constant_loc, int *pull_constant_loc, + unsigned *num_push_constants, + unsigned *num_pull_constants, + const unsigned max_push_components, + const unsigned max_chunk_size, + struct brw_stage_prog_data *stage_prog_data) +{ + /* This is the first live uniform in the chunk */ + if (*chunk_start < 0) + *chunk_start = uniform; + + /* If this element does not need to be contiguous with the next, we + * split at this point and everything between chunk_start and u forms a + * single chunk. + */ + if (!contiguous) { + unsigned chunk_size = uniform - *chunk_start + 1; + + /* Decide whether we should push or pull this parameter. 
In the + * Vulkan driver, push constants are explicitly exposed via the API + * so we push everything. In GL, we only push small arrays. + */ + if (stage_prog_data->pull_param == NULL || + (*num_push_constants + chunk_size <= max_push_components && + chunk_size <= max_chunk_size)) { + assert(*num_push_constants + chunk_size <= max_push_components); + for (unsigned j = *chunk_start; j <= uniform; j++) + push_constant_loc[j] = (*num_push_constants)++; + } else { + for (unsigned j = *chunk_start; j <= uniform; j++) + pull_constant_loc[j] = (*num_pull_constants)++; + } + + *chunk_start = -1; + } +} + /** * Assign UNIFORM file registers to either push constants or pull constants. * @@ -1925,31 +2039,32 @@ fs_visitor::compact_virtual_grfs() * maximum number of fragment shader uniform components (64). If * there are too many of these, they'd fill up all of register space. * So, this will push some of them out to the pull constant buffer and - * update the program to load them. We also use pull constants for all - * indirect constant loads because we don't support indirect accesses in - * registers yet. + * update the program to load them. */ void fs_visitor::assign_constant_locations() { - /* Only the first compile (SIMD8 mode) gets to decide on locations. */ - if (dispatch_width != 8) + /* Only the first compile gets to decide on locations. */ + if (dispatch_width != min_dispatch_width) return; - unsigned int num_pull_constants = 0; - - pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms); - bool is_live[uniforms]; memset(is_live, 0, sizeof(is_live)); + bool is_live_64bit[uniforms]; + memset(is_live_64bit, 0, sizeof(is_live_64bit)); + + /* For each uniform slot, a value of true indicates that the given slot and + * the next slot must remain contiguous. This is used to keep us from + * splitting arrays apart. + */ + bool contiguous[uniforms]; + memset(contiguous, 0, sizeof(contiguous)); /* First, we walk through the instructions and do two things: * * 1) Figure out which uniforms are live. * - * 2) Find all indirect access of uniform arrays and flag them as needing - * to go into the pull constant buffer. + * 2) Mark any indirectly used ranges of registers as contiguous. * * Note that we don't move constant-indexed accesses to arrays. No * testing has been done of the performance impact of this choice. @@ -1959,22 +2074,32 @@ fs_visitor::assign_constant_locations() if (inst->src[i].file != UNIFORM) continue; - if (inst->src[i].reladdr) { - int uniform = inst->src[i].nr; + int constant_nr = inst->src[i].nr + inst->src[i].reg_offset; - /* If this array isn't already present in the pull constant buffer, - * add it. 
- */ - if (pull_constant_loc[uniform] == -1) { - assert(param_size[uniform]); - for (int j = 0; j < param_size[uniform]; j++) - pull_constant_loc[uniform + j] = num_pull_constants++; + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { + assert(inst->src[2].ud % 4 == 0); + unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; + assert(last < uniforms); + + for (unsigned j = constant_nr; j < last; j++) { + is_live[j] = true; + contiguous[j] = true; + if (type_sz(inst->src[i].type) == 8) { + is_live_64bit[j] = true; + } } + is_live[last] = true; } else { - /* Mark the the one accessed uniform as live */ - int constant_nr = inst->src[i].nr + inst->src[i].reg_offset; - if (constant_nr >= 0 && constant_nr < (int) uniforms) - is_live[constant_nr] = true; + if (constant_nr >= 0 && constant_nr < (int) uniforms) { + int regs_read = inst->components_read(i) * + type_sz(inst->src[i].type) / 4; + for (int j = 0; j < regs_read; j++) { + is_live[constant_nr + j] = true; + if (type_sz(inst->src[i].type) == 8) { + is_live_64bit[constant_nr + j] = true; + } + } + } } } } @@ -1987,32 +2112,58 @@ fs_visitor::assign_constant_locations() * If changing this value, note the limitation about total_regs in * brw_curbe.c. */ - unsigned int max_push_components = 16 * 8; + const unsigned int max_push_components = 16 * 8; + + /* We push small arrays, but no bigger than 16 floats. This is big enough + * for a vec4 but hopefully not large enough to push out other stuff. We + * should probably use a better heuristic at some point. + */ + const unsigned int max_chunk_size = 16; + unsigned int num_push_constants = 0; + unsigned int num_pull_constants = 0; push_constant_loc = ralloc_array(mem_ctx, int, uniforms); + pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - for (unsigned int i = 0; i < uniforms; i++) { - if (!is_live[i] || pull_constant_loc[i] != -1) { - /* This UNIFORM register is either dead, or has already been demoted - * to a pull const. Mark it as no longer living in the param[] array. - */ - push_constant_loc[i] = -1; + /* Default to -1 meaning no location */ + memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc)); + memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); + + int chunk_start = -1; + + /* First push 64-bit uniforms to ensure they are properly aligned */ + for (unsigned u = 0; u < uniforms; u++) { + if (!is_live[u] || !is_live_64bit[u]) continue; - } - if (num_push_constants < max_push_components) { - /* Retain as a push constant. Record the location in the params[] - * array. - */ - push_constant_loc[i] = num_push_constants++; - } else { - /* Demote to a pull constant. */ - push_constant_loc[i] = -1; - pull_constant_loc[i] = num_pull_constants++; - } + set_push_pull_constant_loc(u, &chunk_start, contiguous[u], + push_constant_loc, pull_constant_loc, + &num_push_constants, &num_pull_constants, + max_push_components, max_chunk_size, + stage_prog_data); + + } + + /* Then push the rest of uniforms */ + for (unsigned u = 0; u < uniforms; u++) { + if (!is_live[u] || is_live_64bit[u]) + continue; + + set_push_pull_constant_loc(u, &chunk_start, contiguous[u], + push_constant_loc, pull_constant_loc, + &num_push_constants, &num_pull_constants, + max_push_components, max_chunk_size, + stage_prog_data); } + /* As the uniforms are going to be reordered, take the data from a temporary + * copy of the original param[]. 
+ */ + gl_constant_value **param = ralloc_array(NULL, gl_constant_value*, + stage_prog_data->nr_params); + memcpy(param, stage_prog_data->param, + sizeof(gl_constant_value*) * stage_prog_data->nr_params); stage_prog_data->nr_params = num_push_constants; stage_prog_data->nr_pull_params = num_pull_constants; @@ -2025,7 +2176,7 @@ fs_visitor::assign_constant_locations() * having to make a copy. */ for (unsigned int i = 0; i < uniforms; i++) { - const gl_constant_value *value = stage_prog_data->param[i]; + const gl_constant_value *value = param[i]; if (pull_constant_loc[i] != -1) { stage_prog_data->pull_param[pull_constant_loc[i]] = value; @@ -2033,6 +2184,7 @@ fs_visitor::assign_constant_locations() stage_prog_data->param[push_constant_loc[i]] = value; } } + ralloc_free(param); } /** @@ -2040,51 +2192,75 @@ fs_visitor::assign_constant_locations() * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. */ void -fs_visitor::demote_pull_constants() +fs_visitor::lower_constant_loads() { - foreach_block_and_inst (block, fs_inst, inst, cfg) { + const unsigned index = stage_prog_data->binding_table.pull_constants_start; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + /* Set up the annotation tracking for new generated instructions. */ + const fs_builder ibld(this, block, inst); + for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file != UNIFORM) continue; - int pull_index; + /* We'll handle this case later */ + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) + continue; + unsigned location = inst->src[i].nr + inst->src[i].reg_offset; - if (location >= uniforms) /* Out of bounds access */ - pull_index = -1; - else - pull_index = pull_constant_loc[location]; + if (location >= uniforms) + continue; /* Out of bounds access */ + + int pull_index = pull_constant_loc[location]; if (pull_index == -1) continue; - /* Set up the annotation tracking for new generated instructions. */ - const fs_builder ibld(this, block, inst); const unsigned index = stage_prog_data->binding_table.pull_constants_start; - fs_reg dst = vgrf(glsl_type::float_type); + fs_reg dst; + + if (type_sz(inst->src[i].type) <= 4) + dst = vgrf(glsl_type::float_type); + else + dst = vgrf(glsl_type::double_type); assert(inst->src[i].stride == 0); - /* Generate a pull load into dst. */ - if (inst->src[i].reladdr) { - VARYING_PULL_CONSTANT_LOAD(ibld, dst, - brw_imm_ud(index), - *inst->src[i].reladdr, - pull_index * 4); - inst->src[i].reladdr = NULL; - inst->src[i].stride = 1; - } else { - const fs_builder ubld = ibld.exec_all().group(8, 0); - struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15); - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - dst, brw_imm_ud(index), offset); - inst->src[i].set_smear(pull_index & 3); - } - brw_mark_surface_used(prog_data, index); + const fs_builder ubld = ibld.exec_all().group(8, 0); + struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15); + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, + dst, brw_imm_ud(index), offset); /* Rewrite the instruction to use the temporary VGRF. 
*/ inst->src[i].file = VGRF; inst->src[i].nr = dst.nr; inst->src[i].reg_offset = 0; + inst->src[i].set_smear((pull_index & 3) * 4 / + type_sz(inst->src[i].type)); + + brw_mark_surface_used(prog_data, index); + } + + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && + inst->src[0].file == UNIFORM) { + + unsigned location = inst->src[0].nr + inst->src[0].reg_offset; + if (location >= uniforms) + continue; /* Out of bounds access */ + + int pull_index = pull_constant_loc[location]; + + if (pull_index == -1) + continue; + + VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, + brw_imm_ud(index), + inst->src[1], + pull_index * 4); + inst->remove(block); + + brw_mark_surface_used(prog_data, index); } } invalidate_live_intervals(); @@ -2273,17 +2449,6 @@ fs_visitor::opt_algebraic() progress = true; } break; - case SHADER_OPCODE_RCP: { - fs_inst *prev = (fs_inst *)inst->prev; - if (prev->opcode == SHADER_OPCODE_SQRT) { - if (inst->src[0].equals(prev->dst)) { - inst->opcode = SHADER_OPCODE_RSQ; - inst->src[0] = prev->src[0]; - progress = true; - } - } - break; - } case SHADER_OPCODE_BROADCAST: if (is_uniform(inst->src[0])) { inst->opcode = BRW_OPCODE_MOV; @@ -2432,6 +2597,7 @@ fs_visitor::opt_sampler_eot() tex_inst->offset |= fb_write->target << 24; tex_inst->eot = true; tex_inst->dst = ibld.null_reg_ud(); + tex_inst->regs_written = 0; fb_write->remove(cfg->blocks[cfg->num_blocks - 1]); /* If a header is present, marking the eot is sufficient. Otherwise, we need @@ -2440,8 +2606,10 @@ fs_visitor::opt_sampler_eot() * we have enough space, but it will make sure the dead code eliminator kills * the instruction that this will replace. */ - if (tex_inst->header_size != 0) + if (tex_inst->header_size != 0) { + invalidate_live_intervals(); return true; + } fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F, load_payload->sources + 1); @@ -2472,6 +2640,7 @@ fs_visitor::opt_sampler_eot() tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload); tex_inst->src[0] = send_header; + invalidate_live_intervals(); return true; } @@ -2792,12 +2961,23 @@ void fs_visitor::emit_repclear_shader() { brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; - int base_mrf = 1; + int base_mrf = 0; int color_mrf = base_mrf + 2; + fs_inst *mov; - fs_inst *mov = bld.exec_all().group(4, 0) - .MOV(brw_message_reg(color_mrf), - fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)); + if (uniforms > 0) { + mov = bld.exec_all().group(4, 0) + .MOV(brw_message_reg(color_mrf), + fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)); + } else { + struct brw_reg reg = + brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4, + BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); + + mov = bld.exec_all().group(4, 0) + .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg)); + } fs_inst *write; if (key->nr_color_regions == 1) { @@ -2826,8 +3006,10 @@ fs_visitor::emit_repclear_shader() assign_curb_setup(); /* Now that we have the uniform assigned, go ahead and force it to a vec4. 
*/ - assert(mov->src[0].file == FIXED_GRF); - mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0); + if (uniforms > 0) { + assert(mov->src[0].file == FIXED_GRF); + mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0); + } } /** @@ -3475,6 +3657,36 @@ fs_visitor::lower_integer_multiplication() return progress; } +bool +fs_visitor::lower_minmax() +{ + assert(devinfo->gen < 6); + + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + if (inst->opcode == BRW_OPCODE_SEL && + inst->predicate == BRW_PREDICATE_NONE) { + /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of + * the original SEL.L/GE instruction + */ + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + inst->conditional_mod); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + + progress = true; + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + static void setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, fs_reg *dst, fs_reg color, unsigned components) @@ -3885,6 +4097,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET || offset_value.file != BAD_FILE || + op == SHADER_OPCODE_SAMPLEINFO || is_high_sampler(devinfo, sampler)) { /* For general texture offsets (no txf workaround), we need a header to * put them in. Note that we're only reserving space for it in the @@ -3899,6 +4112,16 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, header_size = 1; sources[0] = fs_reg(); length++; + + /* If we're requesting fewer than four channels worth of response, + * and we have an explicit header, we need to set up the sampler + * writemask. It's reversed from normal: 1 means "don't write". + */ + if (inst->regs_written != 4 * reg_width) { + assert((inst->regs_written % reg_width) == 0); + unsigned mask = ~((1 << (inst->regs_written / reg_width)) - 1) & 0xf; + inst->offset |= mask << 12; + } } if (shadow_c.file != BAD_FILE) { @@ -3922,6 +4145,10 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, switch (op) { case FS_OPCODE_TXB: case SHADER_OPCODE_TXL: + if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) { + op = SHADER_OPCODE_TXL_LZ; + break; + } bld.MOV(sources[length], lod); length++; break; @@ -3973,8 +4200,12 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, length++; } - bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod); - length++; + if (devinfo->gen >= 9 && lod.is_zero()) { + op = SHADER_OPCODE_TXF_LZ; + } else { + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod); + length++; + } for (unsigned i = devinfo->gen >= 9 ? 
2 : 1; i < coord_components; i++) { bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); @@ -3984,6 +4215,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, coordinate_done = true; break; + case SHADER_OPCODE_TXF_CMS: case SHADER_OPCODE_TXF_CMS_W: case SHADER_OPCODE_TXF_UMS: @@ -4182,6 +4414,58 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op, delete[] components; } +static void +lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst) +{ + const brw_device_info *devinfo = bld.shader->devinfo; + + if (devinfo->gen >= 7) { + inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7; + + } else { + const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen), + BRW_REGISTER_TYPE_UD); + + bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]); + + inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4; + inst->resize_sources(1); + inst->base_mrf = payload.nr; + inst->header_size = 1; + inst->mlen = 1 + inst->exec_size / 8; + } +} + +static void +lower_math_logical_send(const fs_builder &bld, fs_inst *inst) +{ + assert(bld.shader->devinfo->gen < 6); + + inst->base_mrf = 2; + inst->mlen = inst->sources * inst->exec_size / 8; + + if (inst->sources > 1) { + /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 + * "Message Payload": + * + * "Operand0[7]. For the INT DIV functions, this operand is the + * denominator." + * ... + * "Operand1[7]. For the INT DIV functions, this operand is the + * numerator." + */ + const bool is_int_div = inst->opcode != SHADER_OPCODE_POW; + const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0]; + const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1]; + + inst->resize_sources(1); + inst->src[0] = src0; + + assert(inst->exec_size == 8); + bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1); + } +} + bool fs_visitor::lower_logical_sends() { @@ -4251,6 +4535,10 @@ fs_visitor::lower_logical_sends() lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET); break; + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO); + break; + case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: lower_surface_logical_send(ibld, inst, SHADER_OPCODE_UNTYPED_SURFACE_READ, @@ -4287,6 +4575,35 @@ fs_visitor::lower_logical_sends() ibld.sample_mask_reg()); break; + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: + lower_varying_pull_constant_logical_send(ibld, inst); + break; + + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + /* The math opcodes are overloaded for the send-like and + * expression-like instructions which seems kind of icky. Gen6+ has + * a native (but rather quirky) MATH instruction so we don't need to + * do anything here. On Gen4-5 we'll have to lower the Gen6-like + * logical instructions (which we can easily recognize because they + * have mlen = 0) into send-like virtual instructions. 
+ */ + if (devinfo->gen < 6 && inst->mlen == 0) { + lower_math_logical_send(ibld, inst); + break; + + } else { + continue; + } + default: continue; } @@ -4300,6 +4617,108 @@ fs_visitor::lower_logical_sends() return progress; } +/** + * Get the closest allowed SIMD width for instruction \p inst accounting for + * some common regioning and execution control restrictions that apply to FPU + * instructions. These restrictions don't necessarily have any relevance to + * instructions not executed by the FPU pipeline like extended math, control + * flow or send message instructions. + * + * For virtual opcodes it's really up to the instruction -- In some cases + * (e.g. where a virtual instruction unrolls into a simple sequence of FPU + * instructions) it may simplify virtual instruction lowering if we can + * enforce FPU-like regioning restrictions already on the virtual instruction, + * in other cases (e.g. virtual send-like instructions) this may be + * excessively restrictive. + */ +static unsigned +get_fpu_lowered_simd_width(const struct brw_device_info *devinfo, + const fs_inst *inst) +{ + /* Maximum execution size representable in the instruction controls. */ + unsigned max_width = MIN2(32, inst->exec_size); + + /* According to the PRMs: + * "A. In Direct Addressing mode, a source cannot span more than 2 + * adjacent GRF registers. + * B. A destination cannot span more than 2 adjacent GRF registers." + * + * Look for the source or destination with the largest register region + * which is the one that is going to limit the overall execution size of + * the instruction due to this rule. + */ + unsigned reg_count = inst->regs_written; + + for (unsigned i = 0; i < inst->sources; i++) + reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i)); + + /* Calculate the maximum execution size of the instruction based on the + * factor by which it goes over the hardware limit of 2 GRFs. + */ + if (reg_count > 2) + max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2)); + + /* According to the IVB PRMs: + * "When destination spans two registers, the source MUST span two + * registers. The exception to the above rule: + * + * - When source is scalar, the source registers are not incremented. + * - When source is packed integer Word and destination is packed + * integer DWord, the source register is not incremented but the + * source sub register is incremented." + * + * The hardware specs from Gen4 to Gen7.5 mention similar regioning + * restrictions. The code below intentionally doesn't check whether the + * destination type is integer because empirically the hardware doesn't + * seem to care what the actual type is as long as it's dword-aligned. + */ + if (devinfo->gen < 8) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->regs_written == 2 && + inst->regs_read(i) != 0 && inst->regs_read(i) != 2 && + !is_uniform(inst->src[i]) && + !(type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 && + type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1)) + max_width = MIN2(max_width, inst->exec_size / + inst->regs_written); + } + } + + /* From the IVB PRMs: + * "When an instruction is SIMD32, the low 16 bits of the execution mask + * are applied for both halves of the SIMD32 instruction. If different + * execution mask channels are required, split the instruction into two + * SIMD16 instructions." + * + * There is similar text in the HSW PRMs. Gen4-6 don't even implement + * 32-wide control flow support in hardware and will behave similarly. 
+ */ + if (devinfo->gen < 8 && !inst->force_writemask_all) + max_width = MIN2(max_width, 16); + + /* From the IVB PRMs (applies to HSW too): + * "Instructions with condition modifiers must not use SIMD32." + * + * From the BDW PRMs (applies to later hardware too): + * "Ternary instruction with condition modifiers must not use SIMD32." + */ + if (inst->conditional_mod && (devinfo->gen < 8 || inst->is_3src(devinfo))) + max_width = MIN2(max_width, 16); + + /* From the IVB PRMs (applies to other devices that don't have the + * brw_device_info::supports_simd16_3src flag set): + * "In Align16 access mode, SIMD16 is not allowed for DW operations and + * SIMD8 is not allowed for DF operations." + */ + if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src) + max_width = MIN2(max_width, inst->exec_size / reg_count); + + /* Only power-of-two execution sizes are representable in the instruction + * control fields. + */ + return 1 << _mesa_logbase2(max_width); +} + /** * Get the closest native SIMD width supported by the hardware for instruction * \p inst. The instruction will be left untouched by @@ -4320,15 +4739,12 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, case BRW_OPCODE_SHR: case BRW_OPCODE_SHL: case BRW_OPCODE_ASR: - case BRW_OPCODE_CMP: case BRW_OPCODE_CMPN: case BRW_OPCODE_CSEL: case BRW_OPCODE_F32TO16: case BRW_OPCODE_F16TO32: case BRW_OPCODE_BFREV: case BRW_OPCODE_BFE: - case BRW_OPCODE_BFI1: - case BRW_OPCODE_BFI2: case BRW_OPCODE_ADD: case BRW_OPCODE_MUL: case BRW_OPCODE_AVG: @@ -4344,40 +4760,106 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, case BRW_OPCODE_SAD2: case BRW_OPCODE_MAD: case BRW_OPCODE_LRP: + case FS_OPCODE_PACK: + return get_fpu_lowered_simd_width(devinfo, inst); + + case BRW_OPCODE_CMP: { + /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that + * when the destination is a GRF the dependency-clear bit on the flag + * register is cleared early. + * + * Suggested workarounds are to disable coissuing CMP instructions + * or to split CMP(16) instructions into two CMP(8) instructions. + * + * We choose to split into CMP(8) instructions since disabling + * coissuing would affect CMP instructions not otherwise affected by + * the errata. + */ + const unsigned max_width = (devinfo->gen == 7 && !devinfo->is_haswell && + !inst->dst.is_null() ? 8 : ~0); + return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst)); + } + case BRW_OPCODE_BFI1: + case BRW_OPCODE_BFI2: + /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we + * should + * "Force BFI instructions to be executed always in SIMD8." + */ + return MIN2(devinfo->is_haswell ? 8 : ~0u, + get_fpu_lowered_simd_width(devinfo, inst)); + + case BRW_OPCODE_IF: + assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16); + return inst->exec_size; + case SHADER_OPCODE_RCP: case SHADER_OPCODE_RSQ: case SHADER_OPCODE_SQRT: case SHADER_OPCODE_EXP2: case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + /* Unary extended math instructions are limited to SIMD8 on Gen4 and + * Gen6. + */ + return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) : + devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) : + MIN2(8, inst->exec_size)); + case SHADER_OPCODE_POW: + /* SIMD16 is only allowed on Gen7+. */ + return (devinfo->gen >= 7 ? 
MIN2(16, inst->exec_size) : + MIN2(8, inst->exec_size)); + case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: - case SHADER_OPCODE_SIN: - case SHADER_OPCODE_COS: { - /* According to the PRMs: - * "A. In Direct Addressing mode, a source cannot span more than 2 - * adjacent GRF registers. - * B. A destination cannot span more than 2 adjacent GRF registers." - * - * Look for the source or destination with the largest register region - * which is the one that is going to limit the overal execution size of - * the instruction due to this rule. + /* Integer division is limited to SIMD8 on all generations. */ + return MIN2(8, inst->exec_size); + + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: + /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch + * message used to implement varying pull constant loads, so expand it + * to SIMD16. An alternative with longer message payload length but + * shorter return payload would be to use the SIMD8 sampler message that + * takes (header, u, v, r) as parameters instead of (header, u). */ - unsigned reg_count = inst->regs_written; - - for (unsigned i = 0; i < inst->sources; i++) - reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i)); + return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size)); - /* Calculate the maximum execution size of the instruction based on the - * factor by which it goes over the hardware limit of 2 GRFs. + case FS_OPCODE_DDY_FINE: + /* The implementation of this virtual opcode may require emitting + * compressed Align16 instructions, which are severely limited on some + * generations. + * + * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register + * Region Restrictions): + * + * "In Align16 access mode, SIMD16 is not allowed for DW operations + * and SIMD8 is not allowed for DF operations." + * + * In this context, "DW operations" means "operations acting on 32-bit + * values", so it includes operations on floats. + * + * Gen4 has a similar restriction. From the i965 PRM, section 11.5.3 + * (Instruction Compression -> Rules and Restrictions): + * + * "A compressed instruction must be in Align1 access mode. Align16 + * mode instructions cannot be compressed." + * + * Similar text exists in the g45 PRM. + * + * Empirically, compressed align16 instructions using odd register + * numbers don't appear to work on Sandybridge either. */ - return inst->exec_size / DIV_ROUND_UP(reg_count, 2); - } + return (devinfo->gen == 4 || devinfo->gen == 6 || + (devinfo->gen == 7 && !devinfo->is_haswell) ? + MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size)); + case SHADER_OPCODE_MULH: /* MULH is lowered to the MUL/MACH sequence using the accumulator, which * is 8-wide on Gen7+. */ - return (devinfo->gen >= 7 ? 8 : inst->exec_size); + return (devinfo->gen >= 7 ? 8 : + get_fpu_lowered_simd_width(devinfo, inst)); case FS_OPCODE_FB_WRITE_LOGICAL: /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them @@ -4390,6 +4872,9 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ? 8 : inst->exec_size); + case SHADER_OPCODE_SAMPLEINFO_LOGICAL: + return MIN2(16, inst->exec_size); + case SHADER_OPCODE_TXD_LOGICAL: /* TXD is unsupported in SIMD16 mode. 
*/ return 8; @@ -4444,34 +4929,17 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: return 8; + case SHADER_OPCODE_MOV_INDIRECT: + /* Prior to Broadwell, we only have 8 address subregisters */ + return MIN3(devinfo->gen >= 8 ? 16 : 8, + 2 * REG_SIZE / (inst->dst.stride * type_sz(inst->dst.type)), + inst->exec_size); + default: return inst->exec_size; } } -/** - * The \p rows array of registers represents a \p num_rows by \p num_columns - * matrix in row-major order, write it in column-major order into the register - * passed as destination. \p stride gives the separation between matrix - * elements in the input in fs_builder::dispatch_width() units. - */ -static void -emit_transpose(const fs_builder &bld, - const fs_reg &dst, const fs_reg *rows, - unsigned num_rows, unsigned num_columns, unsigned stride) -{ - fs_reg *const components = new fs_reg[num_rows * num_columns]; - - for (unsigned i = 0; i < num_columns; ++i) { - for (unsigned j = 0; j < num_rows; ++j) - components[num_rows * i + j] = offset(rows[j], bld, stride * i); - } - - bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0); - - delete[] components; -} - bool fs_visitor::lower_simd_width() { @@ -4517,21 +4985,24 @@ fs_visitor::lower_simd_width() * instruction. */ const fs_builder lbld = ibld.group(lower_width, i); + const fs_builder cbld = lbld.group(copy_width, 0); for (unsigned j = 0; j < inst->sources; j++) { if (inst->src[j].file != BAD_FILE && - !is_uniform(inst->src[j])) { + !is_periodic(inst->src[j], lower_width)) { /* Get the i-th copy_width-wide chunk of the source. */ - const fs_reg src = horiz_offset(inst->src[j], copy_width * i); + const fs_reg src = offset(inst->src[j], cbld, i); const unsigned src_size = inst->components_read(j); - /* Use a trivial transposition to copy one every n - * copy_width-wide components of the register into a - * temporary passed as source to the lowered instruction. + /* Copy one every n copy_width-wide components of the + * register into a temporary passed as source to the lowered + * instruction. */ split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size); - emit_transpose(lbld.group(copy_width, 0), - split_inst.src[j], &src, 1, src_size, n); + + for (unsigned k = 0; k < src_size; ++k) + cbld.MOV(offset(split_inst.src[j], lbld, k), + offset(src, cbld, n * k)); } } @@ -4542,28 +5013,36 @@ fs_visitor::lower_simd_width() split_inst.dst = dsts[i] = lbld.vgrf(inst->dst.type, dst_size); split_inst.regs_written = - DIV_ROUND_UP(inst->regs_written * lower_width, - inst->exec_size); + DIV_ROUND_UP(type_sz(inst->dst.type) * dst_size * lower_width, + REG_SIZE); + + if (inst->predicate) { + /* Handle predication by copying the original contents of + * the destination into the temporary before emitting the + * lowered instruction. + */ + for (unsigned k = 0; k < dst_size; ++k) + cbld.MOV(offset(split_inst.dst, lbld, k), + offset(inst->dst, cbld, n * k + i)); + } } lbld.emit(split_inst); } if (inst->regs_written) { - /* Distance between useful channels in the temporaries, skipping - * garbage if the lowered instruction is wider than the original. - */ - const unsigned m = lower_width / copy_width; + const fs_builder lbld = ibld.group(lower_width, 0); /* Interleave the components of the result from the lowered - * instructions. 
We need to set exec_all() when copying more than - * one half per component, because LOAD_PAYLOAD (in terms of which - * emit_transpose is implemented) can only use the same channel - * enable signals for all of its non-header sources. + * instructions. */ - emit_transpose(ibld.exec_all(inst->exec_size > copy_width) - .group(copy_width, 0), - inst->dst, dsts, n, dst_size, m); + for (unsigned i = 0; i < dst_size; ++i) { + for (unsigned j = 0; j < n; ++j) { + const fs_builder cbld = ibld.group(copy_width, j); + cbld.MOV(offset(inst->dst, cbld, n * i + j), + offset(dsts[j], lbld, i)); + } + } } inst->remove(block); @@ -4633,7 +5112,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) inst->flag_subreg); } - fprintf(file, "%s", brw_instruction_name(inst->opcode)); + fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode)); if (inst->saturate) fprintf(file, ".sat"); if (inst->conditional_mod) { @@ -4726,9 +5205,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) break; case UNIFORM: fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset); - if (inst->src[i].reladdr) { - fprintf(file, "+reladdr"); - } else if (inst->src[i].subreg_offset) { + if (inst->src[i].subreg_offset) { fprintf(file, "+%d.%d", inst->src[i].reg_offset, inst->src[i].subreg_offset); } @@ -4741,6 +5218,9 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) case BRW_REGISTER_TYPE_F: fprintf(file, "%-gf", inst->src[i].f); break; + case BRW_REGISTER_TYPE_DF: + fprintf(file, "%fdf", inst->src[i].df); + break; case BRW_REGISTER_TYPE_W: case BRW_REGISTER_TYPE_D: fprintf(file, "%dd", inst->src[i].d); @@ -4839,7 +5319,6 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start, { if (end == start || end->is_partial_write() || - reg.reladdr || !reg.equals(end->dst)) { return NULL; } else { @@ -4852,10 +5331,7 @@ fs_visitor::setup_fs_payload_gen6() { assert(stage == MESA_SHADER_FRAGMENT); brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; - bool uses_depth = - (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0; unsigned barycentric_interp_modes = (stage == MESA_SHADER_FRAGMENT) ? ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0; @@ -4884,7 +5360,9 @@ fs_visitor::setup_fs_payload_gen6() } /* R27: interpolated depth if uses source depth */ - if (uses_depth) { + prog_data->uses_src_depth = + (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0; + if (prog_data->uses_src_depth) { payload.source_depth_reg = payload.num_regs; payload.num_regs++; if (dispatch_width == 16) { @@ -4892,8 +5370,11 @@ fs_visitor::setup_fs_payload_gen6() payload.num_regs++; } } + /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */ - if (uses_depth) { + prog_data->uses_src_w = + (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0; + if (prog_data->uses_src_w) { payload.source_w_reg = payload.num_regs; payload.num_regs++; if (dispatch_width == 16) { @@ -4902,15 +5383,27 @@ fs_visitor::setup_fs_payload_gen6() } } - prog_data->uses_pos_offset = key->compute_pos_offset; /* R31: MSAA position offsets. */ - if (prog_data->uses_pos_offset) { + if (prog_data->persample_dispatch && + (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) { + /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * POSOFFSET_SAMPLE" + * + * So we can only really get sample positions if we are doing real + * per-sample dispatch. 
If we need gl_SamplePosition and we don't have + * persample dispatch, we hard-code it to 0.5. + */ + prog_data->uses_pos_offset = true; payload.sample_pos_reg = payload.num_regs; payload.num_regs++; } /* R32: MSAA input coverage mask */ - if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) { + prog_data->uses_sample_mask = + (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0; + if (prog_data->uses_sample_mask) { assert(devinfo->gen >= 7); payload.sample_mask_in_reg = payload.num_regs; payload.num_regs++; @@ -4978,8 +5471,8 @@ fs_visitor::setup_gs_payload() payload.num_regs++; } - /* Use a maximum of 32 registers for push-model inputs. */ - const unsigned max_push_components = 32; + /* Use a maximum of 24 registers for push-model inputs. */ + const unsigned max_push_components = 24; /* If pushing our inputs would take too many registers, reduce the URB read * length (which is in HWords, or 8 registers), and resort to pulling. @@ -5032,6 +5525,44 @@ fs_visitor::calculate_register_pressure() } } +/** + * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones. + * + * The needs_unlit_centroid_workaround ends up producing one of these per + * channel of centroid input, so it's good to clean them up. + * + * An assumption here is that nothing ever modifies the dispatched pixels + * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware + * dictates that anyway. + */ +bool +fs_visitor::opt_drop_redundant_mov_to_flags() +{ + bool flag_mov_found[2] = {false}; + bool progress = false; + + /* Instructions removed by this pass can only be added if this were true */ + if (!devinfo->needs_unlit_centroid_workaround) + return false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + if (inst->is_control_flow()) { + memset(flag_mov_found, 0, sizeof(flag_mov_found)); + } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) { + if (!flag_mov_found[inst->flag_subreg]) { + flag_mov_found[inst->flag_subreg] = true; + } else { + inst->remove(block); + progress = true; + } + } else if (inst->writes_flag()) { + flag_mov_found[inst->flag_subreg] = false; + } + } + + return progress; +} + void fs_visitor::optimize() { @@ -5052,7 +5583,7 @@ fs_visitor::optimize() bld = fs_builder(this, 64); assign_constant_locations(); - demote_pull_constants(); + lower_constant_loads(); validate(); @@ -5079,7 +5610,7 @@ fs_visitor::optimize() if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) { char filename[64]; - snprintf(filename, 64, "%s%d-%s-00-start", + snprintf(filename, 64, "%s%d-%s-00-00-start", stage_abbrev, dispatch_width, nir->info.name); backend_shader::dump_instructions(filename); @@ -5089,6 +5620,8 @@ fs_visitor::optimize() int iteration = 0; int pass_num = 0; + OPT(opt_drop_redundant_mov_to_flags); + OPT(lower_simd_width); OPT(lower_logical_sends); @@ -5129,9 +5662,26 @@ fs_visitor::optimize() OPT(dead_code_eliminate); } + if (OPT(lower_pack)) { + OPT(register_coalesce); + OPT(dead_code_eliminate); + } + + if (OPT(lower_d2x)) { + OPT(opt_copy_propagate); + OPT(dead_code_eliminate); + } + OPT(opt_combine_constants); OPT(lower_integer_multiplication); + if (devinfo->gen <= 5 && OPT(lower_minmax)) { + OPT(opt_cmod_propagation); + OPT(opt_cse); + OPT(opt_copy_propagate); + OPT(dead_code_eliminate); + } + lower_uniform_pull_constant_loads(); validate(); @@ -5144,16 +5694,22 @@ fs_visitor::optimize() void fs_visitor::fixup_3src_null_dest() { + bool progress = false; + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { - if (inst->is_3src() && 
inst->dst.is_null()) { + if (inst->is_3src(devinfo) && inst->dst.is_null()) { inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); + progress = true; } } + + if (progress) + invalidate_live_intervals(); } void -fs_visitor::allocate_registers() +fs_visitor::allocate_registers(bool allow_spilling) { bool allocated_without_spills; @@ -5163,6 +5719,8 @@ fs_visitor::allocate_registers() SCHEDULE_PRE_LIFO, }; + bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS); + /* Try each scheduling heuristic to see if it can successfully register * allocate without spilling. They should be ordered by decreasing * performance but increasing likelihood of allocating. @@ -5174,7 +5732,7 @@ fs_visitor::allocate_registers() assign_regs_trivial(); allocated_without_spills = true; } else { - allocated_without_spills = assign_regs(false); + allocated_without_spills = assign_regs(false, spill_all); } if (allocated_without_spills) break; @@ -5185,7 +5743,7 @@ fs_visitor::allocate_registers() * SIMD8. There's probably actually some intermediate point where * SIMD16 with a couple of spills is still better. */ - if (dispatch_width == 16) { + if (dispatch_width == 16 && min_dispatch_width <= 8) { fail("Failure to register allocate. Reduce number of " "live scalar values to avoid this."); } else { @@ -5199,12 +5757,14 @@ fs_visitor::allocate_registers() /* Since we're out of heuristics, just go spill registers until we * get an allocation. */ - while (!assign_regs(true)) { + while (!assign_regs(true, spill_all)) { if (failed) break; } } + assert(last_scratch == 0 || allow_spilling); + /* This must come after all optimization and register allocation, since * it inserts dead code that happens to have side effects, and it does * so based on the actual physical registers in use. @@ -5250,7 +5810,89 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes) assign_vs_urb_setup(); fixup_3src_null_dest(); - allocate_registers(); + allocate_registers(true); + + return !failed; +} + +bool +fs_visitor::run_tcs_single_patch() +{ + assert(stage == MESA_SHADER_TESS_CTRL); + + struct brw_tcs_prog_data *tcs_prog_data = + (struct brw_tcs_prog_data *) prog_data; + + /* r1-r4 contain the ICP handles. */ + payload.num_regs = 5; + + if (shader_time_index >= 0) + emit_shader_time_begin(); + + /* Initialize gl_InvocationID */ + fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW); + fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210))); + bld.MOV(channels_ud, channels_uw); + + if (tcs_prog_data->instances == 1) { + invocation_id = channels_ud; + } else { + invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD); + + /* Get instance number from g0.2 bits 23:17, and multiply it by 8. 
*/ + fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)), + brw_imm_ud(INTEL_MASK(23, 17))); + bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3)); + + bld.ADD(invocation_id, instance_times_8, channels_ud); + } + + /* Fix the disptach mask */ + if (nir->info.tcs.vertices_out % 8) { + bld.CMP(bld.null_reg_ud(), invocation_id, + brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L); + bld.IF(BRW_PREDICATE_NORMAL); + } + + emit_nir_code(); + + if (nir->info.tcs.vertices_out % 8) { + bld.emit(BRW_OPCODE_ENDIF); + } + + /* Emit EOT write; set TR DS Cache bit */ + fs_reg srcs[3] = { + fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)), + fs_reg(brw_imm_ud(WRITEMASK_X << 16)), + fs_reg(brw_imm_ud(0)), + }; + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); + bld.LOAD_PAYLOAD(payload, srcs, 3, 2); + + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED, + bld.null_reg_ud(), payload); + inst->mlen = 3; + inst->base_mrf = -1; + inst->eot = true; + + if (shader_time_index >= 0) + emit_shader_time_end(); + + if (failed) + return false; + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_tcs_single_patch_urb_setup(); + + fixup_3src_null_dest(); + allocate_registers(true); return !failed; } @@ -5284,7 +5926,7 @@ fs_visitor::run_tes() assign_tes_urb_setup(); fixup_3src_null_dest(); - allocate_registers(); + allocate_registers(true); return !failed; } @@ -5333,13 +5975,13 @@ fs_visitor::run_gs() assign_gs_urb_setup(); fixup_3src_null_dest(); - allocate_registers(); + allocate_registers(true); return !failed; } bool -fs_visitor::run_fs(bool do_rep_send) +fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) { brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data; brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key; @@ -5403,17 +6045,12 @@ fs_visitor::run_fs(bool do_rep_send) assign_urb_setup(); fixup_3src_null_dest(); - allocate_registers(); + allocate_registers(allow_spilling); if (failed) return false; } - if (dispatch_width == 8) - wm_prog_data->reg_blocks = brw_register_blocks(grf_used); - else - wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used); - return !failed; } @@ -5427,6 +6064,13 @@ fs_visitor::run_cs() if (shader_time_index >= 0) emit_shader_time_begin(); + if (devinfo->is_haswell && prog_data->total_shared > 0) { + /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */ + const fs_builder abld = bld.exec_all().group(1, 0); + abld.MOV(retype(suboffset(brw_sr0_reg(), 1), BRW_REGISTER_TYPE_UW), + suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1)); + } + emit_nir_code(); if (failed) @@ -5444,7 +6088,7 @@ fs_visitor::run_cs() assign_curb_setup(); fixup_3src_null_dest(); - allocate_registers(); + allocate_registers(true); if (failed) return false; @@ -5517,6 +6161,31 @@ brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo, return barycentric_interp_modes; } +static void +brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data, + bool shade_model_flat, const nir_shader *shader) +{ + prog_data->flat_inputs = 0; + + nir_foreach_variable(var, &shader->inputs) { + enum glsl_interp_qualifier interp_qualifier = + (enum glsl_interp_qualifier)var->data.interpolation; + bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) || + (var->data.location == VARYING_SLOT_COL1); + + int input_index = prog_data->urb_setup[var->data.location]; + + if (input_index < 0) + continue; + + /* 
flat shading */ + if (interp_qualifier == INTERP_QUALIFIER_FLAT || + (shade_model_flat && is_gl_Color && + interp_qualifier == INTERP_QUALIFIER_NONE)) + prog_data->flat_inputs |= (1 << input_index); + } +} + static uint8_t computed_depth_mode(const nir_shader *shader) { @@ -5544,6 +6213,7 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, const nir_shader *src_shader, struct gl_program *prog, int shader_time_index8, int shader_time_index16, + bool allow_spilling, bool use_rep_send, unsigned *final_assembly_size, char **error_str) @@ -5551,66 +6221,109 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex, true); + brw_nir_lower_fs_inputs(shader); + brw_nir_lower_fs_outputs(shader); shader = brw_postprocess_nir(shader, compiler->devinfo, true); /* key->alpha_test_func means simulating alpha testing via discards, * so the shader definitely kills pixels. */ prog_data->uses_kill = shader->info.fs.uses_discard || key->alpha_test_func; - prog_data->uses_omask = + prog_data->uses_omask = key->multisample_fbo && shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); prog_data->computed_depth_mode = computed_depth_mode(shader); prog_data->computed_stencil = shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + prog_data->persample_dispatch = + key->multisample_fbo && + (key->persample_interp || + (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID | + SYSTEM_BIT_SAMPLE_POS)) || + shader->info.fs.uses_sample_qualifier); + prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; prog_data->barycentric_interp_modes = brw_compute_barycentric_interp_modes(compiler->devinfo, key->flat_shade, - key->persample_shading, + key->persample_interp, shader); - fs_visitor v(compiler, log_data, mem_ctx, key, - &prog_data->base, prog, shader, 8, - shader_time_index8); - if (!v.run_fs(false /* do_rep_send */)) { + cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL; + uint8_t simd8_grf_start = 0, simd16_grf_start = 0; + unsigned simd8_grf_used = 0, simd16_grf_used = 0; + + fs_visitor v8(compiler, log_data, mem_ctx, key, + &prog_data->base, prog, shader, 8, + shader_time_index8); + if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) { if (error_str) - *error_str = ralloc_strdup(mem_ctx, v.fail_msg); + *error_str = ralloc_strdup(mem_ctx, v8.fail_msg); return NULL; + } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) { + simd8_cfg = v8.cfg; + simd8_grf_start = v8.payload.num_regs; + simd8_grf_used = v8.grf_used; } - cfg_t *simd16_cfg = NULL; - fs_visitor v2(compiler, log_data, mem_ctx, key, - &prog_data->base, prog, shader, 16, - shader_time_index16); - if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) { - if (!v.simd16_unsupported) { - /* Try a SIMD16 compile */ - v2.import_uniforms(&v); - if (!v2.run_fs(use_rep_send)) { - compiler->shader_perf_log(log_data, - "SIMD16 shader failed to compile: %s", - v2.fail_msg); - } else { - simd16_cfg = v2.cfg; - } + if (!v8.simd16_unsupported && + likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) { + /* Try a SIMD16 compile */ + fs_visitor v16(compiler, log_data, mem_ctx, key, + &prog_data->base, prog, shader, 16, + shader_time_index16); + v16.import_uniforms(&v8); + if (!v16.run_fs(allow_spilling, use_rep_send)) { + compiler->shader_perf_log(log_data, + "SIMD16 shader failed to compile: %s", + v16.fail_msg); + } else { + simd16_cfg = v16.cfg; + 
simd16_grf_start = v16.payload.num_regs; + simd16_grf_used = v16.grf_used; } } - cfg_t *simd8_cfg; - int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send; - if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) { + /* When the caller requests a repclear shader, they want SIMD16-only */ + if (use_rep_send) simd8_cfg = NULL; - prog_data->no_8 = true; - } else { - simd8_cfg = v.cfg; - prog_data->no_8 = false; + + /* Prior to Iron Lake, the PS had a single shader offset with a jump table + * at the top to select the shader. We've never implemented that. + * Instead, we just give them exactly one shader and we pick the widest one + * available. + */ + if (compiler->devinfo->gen < 5 && simd16_cfg) + simd8_cfg = NULL; + + if (prog_data->persample_dispatch) { + /* Starting with SandyBridge (where we first get MSAA), the different + * pixel dispatch combinations are grouped into classifications A + * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware + * generations, the only configurations supporting persample dispatch + * are are this in which only one dispatch width is enabled. + * + * If computed depth is enabled, SNB only allows SIMD8 while IVB+ + * allow SIMD8 or SIMD16 so we choose SIMD16 if available. + */ + if (compiler->devinfo->gen == 6 && + prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) { + simd16_cfg = NULL; + } else if (simd16_cfg) { + simd8_cfg = NULL; + } } + /* We have to compute the flat inputs after the visitor is finished running + * because it relies on prog_data->urb_setup which is computed in + * fs_visitor::calculate_urb_setup(). + */ + brw_compute_flat_inputs(prog_data, key->flat_shade, shader); + fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base, - v.promoted_constants, v.runtime_check_aads_emit, + v8.promoted_constants, v8.runtime_check_aads_emit, MESA_SHADER_FRAGMENT); if (unlikely(INTEL_DEBUG & DEBUG_WM)) { @@ -5620,10 +6333,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, shader->info.name)); } - if (simd8_cfg) + if (simd8_cfg) { + prog_data->dispatch_8 = true; g.generate_code(simd8_cfg, 8); - if (simd16_cfg) - prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16); + prog_data->base.dispatch_grf_start_reg = simd8_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used); + + if (simd16_cfg) { + prog_data->dispatch_16 = true; + prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16); + prog_data->dispatch_grf_start_reg_2 = simd16_grf_start; + prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used); + } + } else if (simd16_cfg) { + prog_data->dispatch_16 = true; + g.generate_code(simd16_cfg, 16); + prog_data->base.dispatch_grf_start_reg = simd16_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used); + } return g.get_assembly(final_assembly_size); } @@ -5678,6 +6405,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex, true); + brw_nir_lower_cs_shared(shader); + prog_data->base.total_shared += shader->num_shared; shader = brw_postprocess_nir(shader, compiler->devinfo, true); prog_data->local_size[0] = shader->info.cs.local_size[0]; @@ -5688,6 +6417,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, shader->info.cs.local_size[2]; unsigned max_cs_threads = compiler->devinfo->max_cs_threads; + unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads); 
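[Editor's illustration, not part of the patch: the hunk above introduces simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads), and the hunks that follow use it to skip the SIMD8 compile when a workgroup could not fit within the device's thread limit at that width (and, symmetrically, only attempt SIMD16 when local_workgroup_size <= 16 * max_cs_threads). The standalone C++ sketch below simply replays that arithmetic on made-up numbers; DIV_ROUND_UP is re-defined locally as a stand-in for the driver's utility macro, and the max_cs_threads value is hypothetical, not a real device limit.]

/* Sketch only: a SIMD-N compute shader covers N invocations per hardware
 * thread, so a workgroup of `local_size` invocations needs
 * DIV_ROUND_UP(local_size, N) threads, which must not exceed max_cs_threads.
 * Equivalently, SIMD-N is viable iff
 * DIV_ROUND_UP(local_size, max_cs_threads) <= N.
 */
#include <cstdio>

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))  /* local stand-in for the util macro */

static bool cs_width_viable(unsigned local_size, unsigned max_cs_threads,
                            unsigned simd_width)
{
   return DIV_ROUND_UP(local_size, simd_width) <= max_cs_threads;
}

int main()
{
   const unsigned max_cs_threads = 28;          /* hypothetical device limit */
   const unsigned sizes[] = { 64, 256, 448 };   /* example workgroup volumes */

   for (unsigned local_size : sizes) {
      unsigned simd_required = DIV_ROUND_UP(local_size, max_cs_threads);
      printf("local_size=%3u  simd_required=%2u  SIMD8:%s  SIMD16:%s\n",
             local_size, simd_required,
             cs_width_viable(local_size, max_cs_threads, 8)  ? "ok" : "skip",
             cs_width_viable(local_size, max_cs_threads, 16) ? "ok" : "skip");
   }
   return 0;
}

[With these example numbers, a 256- or 448-invocation group forces simd_required above 8, which is exactly the condition the patch uses below to bypass the SIMD8 visitor and rely on the SIMD16 compile.]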
cfg_t *cfg = NULL; const char *fail_msg = NULL; @@ -5697,11 +6427,14 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base, NULL, /* Never used in core profile */ shader, 8, shader_time_index); - if (!v8.run_cs()) { - fail_msg = v8.fail_msg; - } else if (local_workgroup_size <= 8 * max_cs_threads) { - cfg = v8.cfg; - prog_data->simd_size = 8; + if (simd_required <= 8) { + if (!v8.run_cs()) { + fail_msg = v8.fail_msg; + } else { + cfg = v8.cfg; + prog_data->simd_size = 8; + prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs; + } } fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base, @@ -5711,7 +6444,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, !fail_msg && !v8.simd16_unsupported && local_workgroup_size <= 16 * max_cs_threads) { /* Try a SIMD16 compile */ - v16.import_uniforms(&v8); + if (simd_required <= 8) + v16.import_uniforms(&v8); if (!v16.run_cs()) { compiler->shader_perf_log(log_data, "SIMD16 shader failed to compile: %s", @@ -5724,6 +6458,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, } else { cfg = v16.cfg; prog_data->simd_size = 16; + prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; } }
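[Editor's illustration, not part of the patch: the run_tcs_single_patch hunk earlier in this diff builds gl_InvocationID as (instance * 8) + channel. It masks the instance number out of g0.2 bits 23:17 with INTEL_MASK(23, 17), then shifts right by 17 - 3 = 14 rather than 17, which strips the low bits and leaves the value already multiplied by 8. The standalone sketch below replays that bit arithmetic on a made-up payload DWord; MASK_RANGE is a local stand-in for the driver's INTEL_MASK macro, and the instance value is invented.]

#include <cstdio>

/* Local stand-in for INTEL_MASK(high, low): a mask covering bits low..high. */
#define MASK_RANGE(high, low) (((1u << ((high) - (low) + 1)) - 1u) << (low))

int main()
{
   const unsigned instance = 5;            /* hypothetical TCS instance number   */
   const unsigned g0_2 = instance << 17;   /* pretend 32-bit R0.2 payload DWord  */

   unsigned t = g0_2 & MASK_RANGE(23, 17);       /* bld.AND(..., INTEL_MASK(23, 17)) */
   unsigned instance_times_8 = t >> (17 - 3);    /* bld.SHR(..., brw_imm_ud(17 - 3)) */

   /* channels_ud holds 0..7 per SIMD channel (the 0x76543210 immediate earlier
    * in the patch); adding it yields the per-channel gl_InvocationID. */
   for (unsigned channel = 0; channel < 8; channel++)
      printf("instance %u, channel %u -> gl_InvocationID %u\n",
             instance, channel, instance_times_8 + channel);
   return 0;
}

[For instance 5 this prints invocation IDs 40 through 47, matching the "multiply it by 8" comment in the hunk above.]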