X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_fs.cpp;h=0f9de30f05b578785e2d1c7abeb6ddd376e2b62b;hb=89e421369c54a8edc4930352e062f03b5c66b886;hp=777cee5c8093b9238d8b0a75f84a107a3381039c;hpb=6c8ba59cff14a1a86273f4008ff2a8e68335ab25;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 777cee5c809..0f9de30f05b 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -28,29 +28,17 @@ * from the LIR. */ -#include - -#include "util/hash_table.h" #include "main/macros.h" -#include "main/shaderobj.h" -#include "main/fbobject.h" -#include "program/prog_parameter.h" -#include "program/prog_print.h" -#include "util/register_allocate.h" -#include "program/hash_table.h" #include "brw_context.h" #include "brw_eu.h" -#include "brw_wm.h" #include "brw_fs.h" #include "brw_cs.h" #include "brw_nir.h" #include "brw_vec4_gs_visitor.h" #include "brw_cfg.h" +#include "brw_program.h" #include "brw_dead_control_flow.h" -#include "main/uniforms.h" -#include "brw_fs_live_variables.h" -#include "glsl/nir/glsl_types.h" -#include "program/sampler.h" +#include "compiler/glsl_types.h" using namespace brw; @@ -186,8 +174,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, * CSE can later notice that those loads are all the same and eliminate * the redundant ones. */ - fs_reg vec4_offset = vgrf(glsl_type::int_type); - bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~3)); + fs_reg vec4_offset = vgrf(glsl_type::uint_type); + bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf)); int scale = 1; if (devinfo->gen == 4 && bld.dispatch_width() == 8) { @@ -219,7 +207,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, inst->mlen = 1 + bld.dispatch_width() / 8; } - bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale)); + bld.MOV(dst, offset(vec4_result, bld, ((const_offset & 0xf) / 4) * scale)); } /** @@ -300,6 +288,71 @@ fs_inst::is_send_from_grf() const } } +/** + * Returns true if this instruction's sources and destinations cannot + * safely be the same register. + * + * In most cases, a register can be written over safely by the same + * instruction that is its last use. For a single instruction, the + * sources are dereferenced before writing of the destination starts + * (naturally). + * + * However, there are a few cases where this can be problematic: + * + * - Virtual opcodes that translate to multiple instructions in the + * code generator: if src == dst and one instruction writes the + * destination before a later instruction reads the source, then + * src will have been clobbered. + * + * - SIMD16 compressed instructions with certain regioning (see below). + * + * The register allocator uses this information to set up conflicts between + * GRF sources and the destination. + */ +bool +fs_inst::has_source_and_destination_hazard() const +{ + switch (opcode) { + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + /* Multiple partial writes to the destination */ + return true; + default: + /* The SIMD16 compressed instruction + * + * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F + * + * is actually decoded in hardware as: + * + * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F + * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F + * + * Which is safe. However, if we have uniform accesses + * happening, we get into trouble: + * + * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F + * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F + * + * Now our destination for the first instruction overwrote the + * second instruction's src0, and we get garbage for those 8 + * pixels. There's a similar issue for the pre-gen6 + * pixel_x/pixel_y, which are registers of 16-bit values and thus + * would get stomped by the first decode as well. + */ + if (exec_size == 16) { + for (int i = 0; i < sources; i++) { + if (src[i].file == VGRF && (src[i].stride == 0 || + src[i].type == BRW_REGISTER_TYPE_UW || + src[i].type == BRW_REGISTER_TYPE_W || + src[i].type == BRW_REGISTER_TYPE_UB || + src[i].type == BRW_REGISTER_TYPE_B)) { + return true; + } + } + } + return false; + } +} + bool fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const { @@ -375,12 +428,11 @@ fs_reg::fs_reg() this->file = BAD_FILE; } -fs_reg::fs_reg(struct brw_reg reg) : +fs_reg::fs_reg(struct ::brw_reg reg) : backend_reg(reg) { this->reg_offset = 0; this->subreg_offset = 0; - this->reladdr = NULL; this->stride = 1; if (this->file == IMM && (this->type != BRW_REGISTER_TYPE_V && @@ -393,10 +445,8 @@ fs_reg::fs_reg(struct brw_reg reg) : bool fs_reg::equals(const fs_reg &r) const { - return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 && - reg_offset == r.reg_offset && + return (this->backend_reg::equals(r) && subreg_offset == r.subreg_offset && - !reladdr && !r.reladdr && stride == r.stride); } @@ -458,6 +508,7 @@ type_size_scalar(const struct glsl_type *type) case GLSL_TYPE_ERROR: case GLSL_TYPE_INTERFACE: case GLSL_TYPE_DOUBLE: + case GLSL_TYPE_FUNCTION: unreachable("not reached"); } @@ -687,18 +738,20 @@ fs_inst::components_read(unsigned i) const case SHADER_OPCODE_LOD_LOGICAL: case SHADER_OPCODE_TG4_LOGICAL: case SHADER_OPCODE_TG4_OFFSET_LOGICAL: - assert(src[8].file == IMM && src[9].file == IMM); + assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM && + src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM); /* Texture coordinates. */ - if (i == 0) - return src[8].ud; + if (i == TEX_LOGICAL_SRC_COORDINATE) + return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; /* Texture derivatives. */ - else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL) - return src[9].ud; + else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) && + opcode == SHADER_OPCODE_TXD_LOGICAL) + return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud; /* Texture offset. */ - else if (i == 7) + else if (i == TEX_LOGICAL_SRC_OFFSET_VALUE) return 2; /* MCS */ - else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL) + else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL) return 2; else return 1; @@ -798,7 +851,10 @@ fs_inst::regs_read(int arg) const assert(src[2].file == IMM); unsigned region_length = src[2].ud; - if (src[0].file == FIXED_GRF) { + if (src[0].file == UNIFORM) { + assert(region_length % 4 == 0); + return region_length / 4; + } else if (src[0].file == FIXED_GRF) { /* If the start of the region is not register aligned, then * there's some portion of the register that's technically * unread at the beginning. @@ -812,7 +868,7 @@ fs_inst::regs_read(int arg) const * unread portion at the beginning. */ if (src[0].subnr) - region_length += src[0].subnr * type_sz(src[0].type); + region_length += src[0].subnr; return DIV_ROUND_UP(region_length, REG_SIZE); } else { @@ -968,7 +1024,6 @@ fs_visitor::import_uniforms(fs_visitor *v) this->push_constant_loc = v->push_constant_loc; this->pull_constant_loc = v->pull_constant_loc; this->uniforms = v->uniforms; - this->param_size = v->param_size; } fs_reg * @@ -1056,33 +1111,19 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp, } void -fs_visitor::emit_general_interpolation(fs_reg attr, const char *name, +fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name, const glsl_type *type, glsl_interp_qualifier interpolation_mode, - int location, bool mod_centroid, + int *location, bool mod_centroid, bool mod_sample) { - attr.type = brw_type_for_base_type(type->get_scalar_type()); - assert(stage == MESA_SHADER_FRAGMENT); brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; - unsigned int array_elements; - - if (type->is_array()) { - array_elements = type->arrays_of_arrays_size(); - if (array_elements == 0) { - fail("dereferenced array '%s' has length 0\n", name); - } - type = type->without_array(); - } else { - array_elements = 1; - } - if (interpolation_mode == INTERP_QUALIFIER_NONE) { bool is_gl_Color = - location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1; + *location == VARYING_SLOT_COL0 || *location == VARYING_SLOT_COL1; if (key->flat_shade && is_gl_Color) { interpolation_mode = INTERP_QUALIFIER_FLAT; } else { @@ -1090,71 +1131,86 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name, } } - for (unsigned int i = 0; i < array_elements; i++) { - for (unsigned int j = 0; j < type->matrix_columns; j++) { - if (prog_data->urb_setup[location] == -1) { - /* If there's no incoming setup data for this slot, don't - * emit interpolation for it. - */ - attr = offset(attr, bld, type->vector_elements); - location++; - continue; - } + if (type->is_array() || type->is_matrix()) { + const glsl_type *elem_type = glsl_get_array_element(type); + const unsigned length = glsl_get_length(type); - if (interpolation_mode == INTERP_QUALIFIER_FLAT) { - /* Constant interpolation (flat shading) case. The SF has - * handed us defined values in only the constant offset - * field of the setup reg. - */ - for (unsigned int k = 0; k < type->vector_elements; k++) { - struct brw_reg interp = interp_reg(location, k); - interp = suboffset(interp, 3); - interp.type = attr.type; - bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); - attr = offset(attr, bld, 1); - } - } else { - /* Smooth/noperspective interpolation case. */ - for (unsigned int k = 0; k < type->vector_elements; k++) { - struct brw_reg interp = interp_reg(location, k); - if (devinfo->needs_unlit_centroid_workaround && mod_centroid) { - /* Get the pixel/sample mask into f0 so that we know - * which pixels are lit. Then, for each channel that is - * unlit, replace the centroid data with non-centroid - * data. - */ - bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); - - fs_inst *inst; - inst = emit_linterp(attr, fs_reg(interp), interpolation_mode, - false, false); - inst->predicate = BRW_PREDICATE_NORMAL; - inst->predicate_inverse = true; - if (devinfo->has_pln) - inst->no_dd_clear = true; - - inst = emit_linterp(attr, fs_reg(interp), interpolation_mode, - mod_centroid && !key->persample_shading, - mod_sample || key->persample_shading); - inst->predicate = BRW_PREDICATE_NORMAL; - inst->predicate_inverse = false; - if (devinfo->has_pln) - inst->no_dd_check = true; + for (unsigned i = 0; i < length; i++) { + emit_general_interpolation(attr, name, elem_type, interpolation_mode, + location, mod_centroid, mod_sample); + } + } else if (type->is_record()) { + for (unsigned i = 0; i < type->length; i++) { + const glsl_type *field_type = type->fields.structure[i].type; + emit_general_interpolation(attr, name, field_type, interpolation_mode, + location, mod_centroid, mod_sample); + } + } else { + assert(type->is_scalar() || type->is_vector()); - } else { - emit_linterp(attr, fs_reg(interp), interpolation_mode, - mod_centroid && !key->persample_shading, - mod_sample || key->persample_shading); - } - if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) { - bld.MUL(attr, attr, this->pixel_w); - } - attr = offset(attr, bld, 1); - } + if (prog_data->urb_setup[*location] == -1) { + /* If there's no incoming setup data for this slot, don't + * emit interpolation for it. + */ + *attr = offset(*attr, bld, type->vector_elements); + (*location)++; + return; + } - } - location++; + attr->type = brw_type_for_base_type(type->get_scalar_type()); + + if (interpolation_mode == INTERP_QUALIFIER_FLAT) { + /* Constant interpolation (flat shading) case. The SF has + * handed us defined values in only the constant offset + * field of the setup reg. + */ + for (unsigned int i = 0; i < type->vector_elements; i++) { + struct brw_reg interp = interp_reg(*location, i); + interp = suboffset(interp, 3); + interp.type = attr->type; + bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp)); + *attr = offset(*attr, bld, 1); + } + } else { + /* Smooth/noperspective interpolation case. */ + for (unsigned int i = 0; i < type->vector_elements; i++) { + struct brw_reg interp = interp_reg(*location, i); + if (devinfo->needs_unlit_centroid_workaround && mod_centroid) { + /* Get the pixel/sample mask into f0 so that we know + * which pixels are lit. Then, for each channel that is + * unlit, replace the centroid data with non-centroid + * data. + */ + bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); + + fs_inst *inst; + inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode, + false, false); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = true; + if (devinfo->has_pln) + inst->no_dd_clear = true; + + inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode, + mod_centroid && !key->persample_shading, + mod_sample || key->persample_shading); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = false; + if (devinfo->has_pln) + inst->no_dd_check = true; + + } else { + emit_linterp(*attr, fs_reg(interp), interpolation_mode, + mod_centroid && !key->persample_shading, + mod_sample || key->persample_shading); + } + if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) { + bld.MUL(*attr, *attr, this->pixel_w); + } + *attr = offset(*attr, bld, 1); + } } + (*location)++; } } @@ -1617,9 +1673,6 @@ fs_visitor::assign_vs_urb_setup() brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; assert(stage == MESA_SHADER_VERTEX); - int count = _mesa_bitcount_64(vs_prog_data->inputs_read); - if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) - count++; /* Each attribute is 4 regs. */ this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes; @@ -1632,6 +1685,21 @@ fs_visitor::assign_vs_urb_setup() } } +void +fs_visitor::assign_tes_urb_setup() +{ + assert(stage == MESA_SHADER_TESS_EVAL); + + brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data; + + first_non_payload_grf += 8 * vue_prog_data->urb_read_length; + + /* Rewrite all ATTR file references to HW_REGs. */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + convert_attr_sources_to_hw_regs(inst); + } +} + void fs_visitor::assign_gs_urb_setup() { @@ -1858,9 +1926,7 @@ fs_visitor::compact_virtual_grfs() * maximum number of fragment shader uniform components (64). If * there are too many of these, they'd fill up all of register space. * So, this will push some of them out to the pull constant buffer and - * update the program to load them. We also use pull constants for all - * indirect constant loads because we don't support indirect accesses in - * registers yet. + * update the program to load them. */ void fs_visitor::assign_constant_locations() @@ -1869,20 +1935,21 @@ fs_visitor::assign_constant_locations() if (dispatch_width != 8) return; - unsigned int num_pull_constants = 0; - - pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms); - bool is_live[uniforms]; memset(is_live, 0, sizeof(is_live)); + /* For each uniform slot, a value of true indicates that the given slot and + * the next slot must remain contiguous. This is used to keep us from + * splitting arrays apart. + */ + bool contiguous[uniforms]; + memset(contiguous, 0, sizeof(contiguous)); + /* First, we walk through the instructions and do two things: * * 1) Figure out which uniforms are live. * - * 2) Find all indirect access of uniform arrays and flag them as needing - * to go into the pull constant buffer. + * 2) Mark any indirectly used ranges of registers as contiguous. * * Note that we don't move constant-indexed accesses to arrays. No * testing has been done of the performance impact of this choice. @@ -1892,20 +1959,19 @@ fs_visitor::assign_constant_locations() if (inst->src[i].file != UNIFORM) continue; - if (inst->src[i].reladdr) { - int uniform = inst->src[i].nr; + int constant_nr = inst->src[i].nr + inst->src[i].reg_offset; - /* If this array isn't already present in the pull constant buffer, - * add it. - */ - if (pull_constant_loc[uniform] == -1) { - assert(param_size[uniform]); - for (int j = 0; j < param_size[uniform]; j++) - pull_constant_loc[uniform + j] = num_pull_constants++; + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { + assert(inst->src[2].ud % 4 == 0); + unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; + assert(last < uniforms); + + for (unsigned j = constant_nr; j < last; j++) { + is_live[j] = true; + contiguous[j] = true; } + is_live[last] = true; } else { - /* Mark the the one accessed uniform as live */ - int constant_nr = inst->src[i].nr + inst->src[i].reg_offset; if (constant_nr >= 0 && constant_nr < (int) uniforms) is_live[constant_nr] = true; } @@ -1920,29 +1986,48 @@ fs_visitor::assign_constant_locations() * If changing this value, note the limitation about total_regs in * brw_curbe.c. */ - unsigned int max_push_components = 16 * 8; + const unsigned int max_push_components = 16 * 8; + + /* For vulkan we don't limit the max_chunk_size. We set it to 32 float = + * 128 bytes, which is the maximum vulkan push constant size. + */ + const unsigned int max_chunk_size = 32; + unsigned int num_push_constants = 0; + unsigned int num_pull_constants = 0; push_constant_loc = ralloc_array(mem_ctx, int, uniforms); + pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - for (unsigned int i = 0; i < uniforms; i++) { - if (!is_live[i] || pull_constant_loc[i] != -1) { - /* This UNIFORM register is either dead, or has already been demoted - * to a pull const. Mark it as no longer living in the param[] array. - */ - push_constant_loc[i] = -1; + int chunk_start = -1; + for (unsigned u = 0; u < uniforms; u++) { + push_constant_loc[u] = -1; + pull_constant_loc[u] = -1; + + if (!is_live[u]) continue; - } - if (num_push_constants < max_push_components) { - /* Retain as a push constant. Record the location in the params[] - * array. - */ - push_constant_loc[i] = num_push_constants++; - } else { - /* Demote to a pull constant. */ - push_constant_loc[i] = -1; - pull_constant_loc[i] = num_pull_constants++; + /* This is the first live uniform in the chunk */ + if (chunk_start < 0) + chunk_start = u; + + /* If this element does not need to be contiguous with the next, we + * split at this point and everthing between chunk_start and u forms a + * single chunk. + */ + if (!contiguous[u]) { + unsigned chunk_size = u - chunk_start + 1; + + if (num_push_constants + chunk_size <= max_push_components && + chunk_size <= max_chunk_size) { + for (unsigned j = chunk_start; j <= u; j++) + push_constant_loc[j] = num_push_constants++; + } else { + for (unsigned j = chunk_start; j <= u; j++) + pull_constant_loc[j] = num_pull_constants++; + } + + chunk_start = -1; } } @@ -1973,51 +2058,67 @@ fs_visitor::assign_constant_locations() * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. */ void -fs_visitor::demote_pull_constants() +fs_visitor::lower_constant_loads() { - foreach_block_and_inst (block, fs_inst, inst, cfg) { + const unsigned index = stage_prog_data->binding_table.pull_constants_start; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + /* Set up the annotation tracking for new generated instructions. */ + const fs_builder ibld(this, block, inst); + for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file != UNIFORM) continue; - int pull_index; + /* We'll handle this case later */ + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) + continue; + unsigned location = inst->src[i].nr + inst->src[i].reg_offset; - if (location >= uniforms) /* Out of bounds access */ - pull_index = -1; - else - pull_index = pull_constant_loc[location]; + if (location >= uniforms) + continue; /* Out of bounds access */ + + int pull_index = pull_constant_loc[location]; if (pull_index == -1) continue; - /* Set up the annotation tracking for new generated instructions. */ - const fs_builder ibld(this, block, inst); - const unsigned index = stage_prog_data->binding_table.pull_constants_start; - fs_reg dst = vgrf(glsl_type::float_type); - assert(inst->src[i].stride == 0); - /* Generate a pull load into dst. */ - if (inst->src[i].reladdr) { - VARYING_PULL_CONSTANT_LOAD(ibld, dst, - brw_imm_ud(index), - *inst->src[i].reladdr, - pull_index); - inst->src[i].reladdr = NULL; - inst->src[i].stride = 1; - } else { - const fs_builder ubld = ibld.exec_all().group(8, 0); - struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15); - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - dst, brw_imm_ud(index), offset); - inst->src[i].set_smear(pull_index & 3); - } - brw_mark_surface_used(prog_data, index); + fs_reg dst = vgrf(glsl_type::float_type); + const fs_builder ubld = ibld.exec_all().group(8, 0); + struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15); + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, + dst, brw_imm_ud(index), offset); /* Rewrite the instruction to use the temporary VGRF. */ inst->src[i].file = VGRF; inst->src[i].nr = dst.nr; inst->src[i].reg_offset = 0; + inst->src[i].set_smear(pull_index & 3); + + brw_mark_surface_used(prog_data, index); + } + + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && + inst->src[0].file == UNIFORM) { + + unsigned location = inst->src[0].nr + inst->src[0].reg_offset; + if (location >= uniforms) + continue; /* Out of bounds access */ + + int pull_index = pull_constant_loc[location]; + + if (pull_index == -1) + continue; + + VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, + brw_imm_ud(index), + inst->src[1], + pull_index * 4); + inst->remove(block); + + brw_mark_surface_used(prog_data, index); } } invalidate_live_intervals(); @@ -2038,7 +2139,8 @@ fs_visitor::opt_algebraic() if (inst->dst.type != inst->src[0].type) assert(!"unimplemented: saturate mixed types"); - if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) { + if (brw_saturate_immediate(inst->dst.type, + &inst->src[0].as_brw_reg())) { inst->saturate = false; progress = true; } @@ -2726,10 +2828,23 @@ fs_visitor::emit_repclear_shader() brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; int base_mrf = 1; int color_mrf = base_mrf + 2; + fs_inst *mov; - fs_inst *mov = bld.exec_all().group(4, 0) - .MOV(brw_message_reg(color_mrf), - fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)); + if (uniforms == 1) { + mov = bld.exec_all().group(4, 0) + .MOV(brw_message_reg(color_mrf), + fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)); + } else { + struct brw_reg reg = + brw_reg(BRW_GENERAL_REGISTER_FILE, + 2, 3, 0, 0, BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_8, + BRW_WIDTH_2, + BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); + + mov = bld.exec_all().group(4, 0) + .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg)); + } fs_inst *write; if (key->nr_color_regions == 1) { @@ -2758,8 +2873,10 @@ fs_visitor::emit_repclear_shader() assign_curb_setup(); /* Now that we have the uniform assigned, go ahead and force it to a vec4. */ - assert(mov->src[0].file == FIXED_GRF); - mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0); + if (uniforms == 1) { + assert(mov->src[0].file == FIXED_GRF); + mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0); + } } /** @@ -3038,13 +3155,11 @@ fs_visitor::lower_uniform_pull_constant_loads() continue; if (devinfo->gen >= 7) { - /* The offset arg before was a vec4-aligned byte offset. We need to - * turn it into a dword offset. - */ + /* The offset arg is a vec4-aligned immediate byte offset. */ fs_reg const_offset_reg = inst->src[1]; assert(const_offset_reg.file == IMM && const_offset_reg.type == BRW_REGISTER_TYPE_UD); - const_offset_reg.ud /= 4; + assert(const_offset_reg.ud % 16 == 0); fs_reg payload, offset; if (devinfo->gen >= 9) { @@ -3369,8 +3484,7 @@ fs_visitor::lower_integer_multiplication() */ assert(mul->src[1].type == BRW_REGISTER_TYPE_D || mul->src[1].type == BRW_REGISTER_TYPE_UD); - mul->src[1].type = (type_is_signed(mul->src[1].type) ? - BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW); + mul->src[1].type = BRW_REGISTER_TYPE_UW; mul->src[1].stride *= 2; } else if (devinfo->gen == 7 && !devinfo->is_haswell && @@ -3410,6 +3524,36 @@ fs_visitor::lower_integer_multiplication() return progress; } +bool +fs_visitor::lower_minmax() +{ + assert(devinfo->gen < 6); + + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + if (inst->opcode == BRW_OPCODE_SEL && + inst->predicate == BRW_PREDICATE_NONE) { + /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of + * the original SEL.L/GE instruction + */ + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + inst->conditional_mod); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + + progress = true; + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + static void setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, fs_reg *dst, fs_reg color, unsigned components) @@ -3588,6 +3732,7 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op, const fs_reg &coordinate, const fs_reg &shadow_c, const fs_reg &lod, const fs_reg &lod2, + const fs_reg &surface, const fs_reg &sampler, unsigned coord_components, unsigned grad_components) @@ -3680,8 +3825,9 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op, inst->opcode = op; inst->src[0] = reg_undef; - inst->src[1] = sampler; - inst->resize_sources(2); + inst->src[1] = surface; + inst->src[2] = sampler; + inst->resize_sources(3); inst->base_mrf = msg_begin.nr; inst->mlen = msg_end.nr - msg_begin.nr; inst->header_size = 1; @@ -3693,6 +3839,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op, const fs_reg &shadow_c, fs_reg lod, fs_reg lod2, const fs_reg &sample_index, + const fs_reg &surface, const fs_reg &sampler, const fs_reg &offset_value, unsigned coord_components, @@ -3775,8 +3922,9 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op, inst->opcode = op; inst->src[0] = reg_undef; - inst->src[1] = sampler; - inst->resize_sources(2); + inst->src[1] = surface; + inst->src[2] = sampler; + inst->resize_sources(3); inst->base_mrf = message.nr; inst->mlen = msg_end.nr - message.nr; inst->header_size = header_size; @@ -3800,7 +3948,9 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, const fs_reg &shadow_c, fs_reg lod, fs_reg lod2, const fs_reg &sample_index, - const fs_reg &mcs, const fs_reg &sampler, + const fs_reg &mcs, + const fs_reg &surface, + const fs_reg &sampler, fs_reg offset_value, unsigned coord_components, unsigned grad_components) @@ -4003,8 +4153,9 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, /* Generate the SEND. */ inst->opcode = op; inst->src[0] = src_payload; - inst->src[1] = sampler; - inst->resize_sources(2); + inst->src[1] = surface; + inst->src[2] = sampler; + inst->resize_sources(3); inst->base_mrf = -1; inst->mlen = mlen; inst->header_size = header_size; @@ -4017,31 +4168,34 @@ static void lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) { const brw_device_info *devinfo = bld.shader->devinfo; - const fs_reg &coordinate = inst->src[0]; - const fs_reg &shadow_c = inst->src[1]; - const fs_reg &lod = inst->src[2]; - const fs_reg &lod2 = inst->src[3]; - const fs_reg &sample_index = inst->src[4]; - const fs_reg &mcs = inst->src[5]; - const fs_reg &sampler = inst->src[6]; - const fs_reg &offset_value = inst->src[7]; - assert(inst->src[8].file == IMM && inst->src[9].file == IMM); - const unsigned coord_components = inst->src[8].ud; - const unsigned grad_components = inst->src[9].ud; + const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE]; + const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C]; + const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD]; + const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2]; + const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX]; + const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS]; + const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE]; + const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER]; + const fs_reg &offset_value = inst->src[TEX_LOGICAL_SRC_OFFSET_VALUE]; + assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM); + const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; + assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM); + const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud; if (devinfo->gen >= 7) { lower_sampler_logical_send_gen7(bld, inst, op, coordinate, shadow_c, lod, lod2, sample_index, - mcs, sampler, offset_value, + mcs, surface, sampler, offset_value, coord_components, grad_components); } else if (devinfo->gen >= 5) { lower_sampler_logical_send_gen5(bld, inst, op, coordinate, shadow_c, lod, lod2, sample_index, - sampler, offset_value, + surface, sampler, offset_value, coord_components, grad_components); } else { lower_sampler_logical_send_gen4(bld, inst, op, coordinate, - shadow_c, lod, lod2, sampler, + shadow_c, lod, lod2, + surface, sampler, coord_components, grad_components); } } @@ -4321,7 +4475,7 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, case SHADER_OPCODE_TG4_OFFSET_LOGICAL: { /* gather4_po_c is unsupported in SIMD16 mode. */ - const fs_reg &shadow_c = inst->src[1]; + const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C]; return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size); } case SHADER_OPCODE_TXL_LOGICAL: @@ -4330,7 +4484,7 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16 * mode because the message exceeds the maximum length of 11. */ - const fs_reg &shadow_c = inst->src[1]; + const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C]; if (devinfo->gen == 4 && shadow_c.file == BAD_FILE) return 16; else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE) @@ -4353,7 +4507,8 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, * circumstances it can end up with a message that is too long in SIMD16 * mode. */ - const unsigned coord_components = inst->src[8].ud; + const unsigned coord_components = + inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; /* First three arguments are the sample index and the two arguments for * the MCS data. */ @@ -4368,6 +4523,10 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: return 8; + case SHADER_OPCODE_MOV_INDIRECT: + /* Prior to Broadwell, we only have 8 address subregisters */ + return devinfo->gen < 8 ? 8 : inst->exec_size; + default: return inst->exec_size; } @@ -4650,9 +4809,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) break; case UNIFORM: fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset); - if (inst->src[i].reladdr) { - fprintf(file, "+reladdr"); - } else if (inst->src[i].subreg_offset) { + if (inst->src[i].subreg_offset) { fprintf(file, "+%d.%d", inst->src[i].reg_offset, inst->src[i].subreg_offset); } @@ -4663,7 +4820,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) case IMM: switch (inst->src[i].type) { case BRW_REGISTER_TYPE_F: - fprintf(file, "%ff", inst->src[i].f); + fprintf(file, "%-gf", inst->src[i].f); break; case BRW_REGISTER_TYPE_W: case BRW_REGISTER_TYPE_D: @@ -4763,7 +4920,6 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start, { if (end == start || end->is_partial_write() || - reg.reladdr || !reg.equals(end->dst)) { return NULL; } else { @@ -4772,10 +4928,12 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start, } void -fs_visitor::setup_payload_gen6() +fs_visitor::setup_fs_payload_gen6() { - bool uses_depth = - (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0; + assert(stage == MESA_SHADER_FRAGMENT); + brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + unsigned barycentric_interp_modes = (stage == MESA_SHADER_FRAGMENT) ? ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0; @@ -4804,7 +4962,9 @@ fs_visitor::setup_payload_gen6() } /* R27: interpolated depth if uses source depth */ - if (uses_depth) { + prog_data->uses_src_depth = + (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0; + if (prog_data->uses_src_depth) { payload.source_depth_reg = payload.num_regs; payload.num_regs++; if (dispatch_width == 16) { @@ -4812,8 +4972,11 @@ fs_visitor::setup_payload_gen6() payload.num_regs++; } } + /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */ - if (uses_depth) { + prog_data->uses_src_w = + (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0; + if (prog_data->uses_src_w) { payload.source_w_reg = payload.num_regs; payload.num_regs++; if (dispatch_width == 16) { @@ -4822,19 +4985,17 @@ fs_visitor::setup_payload_gen6() } } - if (stage == MESA_SHADER_FRAGMENT) { - brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; - prog_data->uses_pos_offset = key->compute_pos_offset; - /* R31: MSAA position offsets. */ - if (prog_data->uses_pos_offset) { - payload.sample_pos_reg = payload.num_regs; - payload.num_regs++; - } + prog_data->uses_pos_offset = key->compute_pos_offset; + /* R31: MSAA position offsets. */ + if (prog_data->uses_pos_offset) { + payload.sample_pos_reg = payload.num_regs; + payload.num_regs++; } /* R32: MSAA input coverage mask */ - if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) { + prog_data->uses_sample_mask = + (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0; + if (prog_data->uses_sample_mask) { assert(devinfo->gen >= 7); payload.sample_mask_in_reg = payload.num_regs; payload.num_regs++; @@ -4976,7 +5137,7 @@ fs_visitor::optimize() bld = fs_builder(this, 64); assign_constant_locations(); - demote_pull_constants(); + lower_constant_loads(); validate(); @@ -5003,7 +5164,7 @@ fs_visitor::optimize() if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) { char filename[64]; - snprintf(filename, 64, "%s%d-%s-00-start", + snprintf(filename, 64, "%s%d-%s-00-00-start", stage_abbrev, dispatch_width, nir->info.name); backend_shader::dump_instructions(filename); @@ -5056,6 +5217,13 @@ fs_visitor::optimize() OPT(opt_combine_constants); OPT(lower_integer_multiplication); + if (devinfo->gen <= 5 && OPT(lower_minmax)) { + OPT(opt_cmod_propagation); + OPT(opt_cse); + OPT(opt_copy_propagate); + OPT(dead_code_eliminate); + } + lower_uniform_pull_constant_loads(); validate(); @@ -5179,6 +5347,40 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes) return !failed; } +bool +fs_visitor::run_tes() +{ + assert(stage == MESA_SHADER_TESS_EVAL); + + /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */ + payload.num_regs = 5; + + if (shader_time_index >= 0) + emit_shader_time_begin(); + + emit_nir_code(); + + if (failed) + return false; + + emit_urb_writes(); + + if (shader_time_index >= 0) + emit_shader_time_end(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_tes_urb_setup(); + + fixup_3src_null_dest(); + allocate_registers(); + + return !failed; +} + bool fs_visitor::run_gs() { @@ -5237,9 +5439,9 @@ fs_visitor::run_fs(bool do_rep_send) assert(stage == MESA_SHADER_FRAGMENT); if (devinfo->gen >= 6) - setup_payload_gen6(); + setup_fs_payload_gen6(); else - setup_payload_gen4(); + setup_fs_payload_gen4(); if (0) { emit_dummy_fs(); @@ -5441,6 +5643,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex, true); + brw_nir_lower_fs_inputs(shader); + brw_nir_lower_fs_outputs(shader); shader = brw_postprocess_nir(shader, compiler->devinfo, true); /* key->alpha_test_func means simulating alpha testing via discards, @@ -5500,7 +5704,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, } fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base, - v.promoted_constants, v.runtime_check_aads_emit, "FS"); + v.promoted_constants, v.runtime_check_aads_emit, + MESA_SHADER_FRAGMENT); if (unlikely(INTEL_DEBUG & DEBUG_WM)) { g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s", @@ -5517,42 +5722,6 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, return g.get_assembly(final_assembly_size); } -void -brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data, - void *buffer, uint32_t threads, uint32_t stride) -{ - if (prog_data->local_invocation_id_regs == 0) - return; - - /* 'stride' should be an integer number of registers, that is, a multiple - * of 32 bytes. - */ - assert(stride % 32 == 0); - - unsigned x = 0, y = 0, z = 0; - for (unsigned t = 0; t < threads; t++) { - uint32_t *param = (uint32_t *) buffer + stride * t / 4; - - for (unsigned i = 0; i < prog_data->simd_size; i++) { - param[0 * prog_data->simd_size + i] = x; - param[1 * prog_data->simd_size + i] = y; - param[2 * prog_data->simd_size + i] = z; - - x++; - if (x == prog_data->local_size[0]) { - x = 0; - y++; - if (y == prog_data->local_size[1]) { - y = 0; - z++; - if (z == prog_data->local_size[2]) - z = 0; - } - } - } - } -} - fs_reg * fs_visitor::emit_cs_local_invocation_id_setup() { @@ -5661,7 +5830,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, } fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base, - v8.promoted_constants, v8.runtime_check_aads_emit, "CS"); + v8.promoted_constants, v8.runtime_check_aads_emit, + MESA_SHADER_COMPUTE); if (INTEL_DEBUG & DEBUG_CS) { char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s", shader->info.label ? shader->info.label : @@ -5674,3 +5844,39 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, return g.get_assembly(final_assembly_size); } + +void +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data, + void *buffer, uint32_t threads, uint32_t stride) +{ + if (prog_data->local_invocation_id_regs == 0) + return; + + /* 'stride' should be an integer number of registers, that is, a multiple + * of 32 bytes. + */ + assert(stride % 32 == 0); + + unsigned x = 0, y = 0, z = 0; + for (unsigned t = 0; t < threads; t++) { + uint32_t *param = (uint32_t *) buffer + stride * t / 4; + + for (unsigned i = 0; i < prog_data->simd_size; i++) { + param[0 * prog_data->simd_size + i] = x; + param[1 * prog_data->simd_size + i] = y; + param[2 * prog_data->simd_size + i] = z; + + x++; + if (x == prog_data->local_size[0]) { + x = 0; + y++; + if (y == prog_data->local_size[1]) { + y = 0; + z++; + if (z == prog_data->local_size[2]) + z = 0; + } + } + } + } +}