X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_fs.cpp;h=baaa25c13473171ae963b150c021cd92f908653b;hb=e290372542d0475e612e4d10a27b22eae3158ecd;hp=f3232b292719dc6b80edc2da0d9f13af24f77433;hpb=aebd3f46e305829ebfcc817cafa8592edc2f80ab;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index f3232b29271..baaa25c1347 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -32,6 +32,7 @@ extern "C" { #include +#include "main/hash_table.h" #include "main/macros.h" #include "main/shaderobj.h" #include "main/uniforms.h" @@ -60,6 +61,9 @@ fs_inst::init() this->src[0] = reg_undef; this->src[1] = reg_undef; this->src[2] = reg_undef; + + /* This will be the case for almost all instructions. */ + this->regs_written = 1; } fs_inst::fs_inst() @@ -146,6 +150,13 @@ fs_inst::fs_inst(enum opcode opcode, fs_reg dst, return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \ } +#define ALU3(op) \ + fs_inst * \ + fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \ + { \ + return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\ + } + ALU1(NOT) ALU1(MOV) ALU1(FRC) @@ -161,6 +172,14 @@ ALU2(XOR) ALU2(SHL) ALU2(SHR) ALU2(ASR) +ALU3(LRP) +ALU1(BFREV) +ALU3(BFE) +ALU2(BFI1) +ALU3(BFI2) +ALU1(FBH) +ALU1(FBL) +ALU1(CBIT) /** Gen4 predicated IF. */ fs_inst * @@ -206,7 +225,7 @@ fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition) */ if (intel->gen == 4) { dst.type = src0.type; - if (dst.file == FIXED_HW_REG) + if (dst.file == HW_REG) dst.fixed_hw_reg.type = dst.type; } @@ -221,40 +240,59 @@ fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition) exec_list fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index, - fs_reg offset) + fs_reg varying_offset, + uint32_t const_offset) { exec_list instructions; fs_inst *inst; - if (intel->gen >= 7) { - inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7, - dst, surf_index, offset); - instructions.push_tail(inst); - } else { - int base_mrf = 13; - bool header_present = true; - - fs_reg mrf = fs_reg(MRF, base_mrf + header_present); - mrf.type = BRW_REGISTER_TYPE_D; - - /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a - * dword-aligned byte offset. + /* We have our constant surface use a pitch of 4 bytes, so our index can + * be any component of a vector, and then we load 4 contiguous + * components starting from that. + * + * We break down the const_offset to a portion added to the variable + * offset and a portion done using reg_offset, which means that if you + * have GLSL using something like "uniform vec4 a[20]; gl_FragColor = + * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and + * CSE can later notice that those loads are all the same and eliminate + * the redundant ones. + */ + fs_reg vec4_offset = fs_reg(this, glsl_type::int_type); + instructions.push_tail(ADD(vec4_offset, + varying_offset, const_offset & ~3)); + + int scale = 1; + if (intel->gen == 4 && dispatch_width == 8) { + /* Pre-gen5, we can either use a SIMD8 message that requires (header, + * u, v, r) as parameters, or we can just use the SIMD16 message + * consisting of (header, u). We choose the second, at the cost of a + * longer return length. */ - if (intel->gen == 6) { - instructions.push_tail(MOV(mrf, offset)); - } else { - instructions.push_tail(MUL(mrf, offset, fs_reg(4))); - } - inst = MOV(mrf, offset); - inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD, - dst, surf_index); - inst->header_present = header_present; - inst->base_mrf = base_mrf; - inst->mlen = header_present + dispatch_width / 8; + scale = 2; + } - instructions.push_tail(inst); + enum opcode op; + if (intel->gen >= 7) + op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7; + else + op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD; + fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type); + inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset); + inst->regs_written = 4 * scale; + instructions.push_tail(inst); + + if (intel->gen < 7) { + inst->base_mrf = 13; + inst->header_present = true; + if (intel->gen == 4) + inst->mlen = 3; + else + inst->mlen = 1 + dispatch_width / 8; } + vec4_result.reg_offset += (const_offset & 3) * scale; + instructions.push_tail(MOV(dst, vec4_result)); + return instructions; } @@ -299,75 +337,20 @@ fs_inst::equals(fs_inst *inst) offset == inst->offset); } -int -fs_inst::regs_written() -{ - if (is_tex()) - return 4; - - /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2, - * but we don't currently use them...nor do we have an opcode for them. - */ - - return 1; -} - bool fs_inst::overwrites_reg(const fs_reg ®) { return (reg.file == dst.file && reg.reg == dst.reg && reg.reg_offset >= dst.reg_offset && - reg.reg_offset < dst.reg_offset + regs_written()); -} - -bool -fs_inst::is_tex() -{ - return (opcode == SHADER_OPCODE_TEX || - opcode == FS_OPCODE_TXB || - opcode == SHADER_OPCODE_TXD || - opcode == SHADER_OPCODE_TXF || - opcode == SHADER_OPCODE_TXL || - opcode == SHADER_OPCODE_TXS); -} - -bool -fs_inst::is_math() -{ - return (opcode == SHADER_OPCODE_RCP || - opcode == SHADER_OPCODE_RSQ || - opcode == SHADER_OPCODE_SQRT || - opcode == SHADER_OPCODE_EXP2 || - opcode == SHADER_OPCODE_LOG2 || - opcode == SHADER_OPCODE_SIN || - opcode == SHADER_OPCODE_COS || - opcode == SHADER_OPCODE_INT_QUOTIENT || - opcode == SHADER_OPCODE_INT_REMAINDER || - opcode == SHADER_OPCODE_POW); -} - -bool -fs_inst::is_control_flow() -{ - switch (opcode) { - case BRW_OPCODE_DO: - case BRW_OPCODE_WHILE: - case BRW_OPCODE_IF: - case BRW_OPCODE_ELSE: - case BRW_OPCODE_ENDIF: - case BRW_OPCODE_BREAK: - case BRW_OPCODE_CONTINUE: - return true; - default: - return false; - } + reg.reg_offset < dst.reg_offset + regs_written); } bool fs_inst::is_send_from_grf() { return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 || + opcode == SHADER_OPCODE_SHADER_TIME_ADD || (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD && src[1].file == GRF)); } @@ -429,7 +412,7 @@ fs_reg::fs_reg(uint32_t u) fs_reg::fs_reg(struct brw_reg fixed_hw_reg) { init(); - this->file = FIXED_HW_REG; + this->file = HW_REG; this->fixed_hw_reg = fixed_hw_reg; this->type = fixed_hw_reg.type; } @@ -468,6 +451,12 @@ fs_reg::is_one() const return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1; } +bool +fs_reg::is_valid_3src() const +{ + return file == GRF || file == UNIFORM; +} + int fs_visitor::type_size(const struct glsl_type *type) { @@ -596,31 +585,18 @@ void fs_visitor::emit_shader_time_write(enum shader_time_shader_type type, fs_reg value) { - /* Choose an index in the buffer and set up tracking information for our - * printouts. - */ - int shader_time_index = brw->shader_time.num_entries++; - assert(shader_time_index <= brw->shader_time.max_entries); - brw->shader_time.types[shader_time_index] = type; - if (prog) { - _mesa_reference_shader_program(ctx, - &brw->shader_time.programs[shader_time_index], - prog); - } - - int base_mrf = 6; + int shader_time_index = + brw_get_shader_time_index(brw, shader_prog, &fp->Base, type); + fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE); - fs_reg offset_mrf = fs_reg(MRF, base_mrf); - offset_mrf.type = BRW_REGISTER_TYPE_UD; - emit(MOV(offset_mrf, fs_reg(shader_time_index * 4))); - - fs_reg time_mrf = fs_reg(MRF, base_mrf + 1); - time_mrf.type = BRW_REGISTER_TYPE_UD; - emit(MOV(time_mrf, value)); + fs_reg payload; + if (dispatch_width == 8) + payload = fs_reg(this, glsl_type::uvec2_type); + else + payload = fs_reg(this, glsl_type::uint_type); - fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD)); - inst->base_mrf = base_mrf; - inst->mlen = 2; + emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD, + fs_reg(), payload, offset, value)); } void @@ -703,6 +679,22 @@ fs_visitor::pop_force_sechalf() assert(force_sechalf_stack >= 0); } +/** + * Returns true if the instruction has a flag that means it won't + * update an entire destination register. + * + * For example, dead code elimination and live variable analysis want to know + * when a write to a variable screens off any preceding values that were in + * it. + */ +bool +fs_inst::is_partial_write() +{ + return (this->predicate || + this->force_uncompressed || + this->force_sechalf); +} + /** * Returns how many MRFs an FS opcode will write over. * @@ -732,18 +724,18 @@ fs_visitor::implied_mrf_writes(fs_inst *inst) case FS_OPCODE_TXB: case SHADER_OPCODE_TXD: case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_MS: case SHADER_OPCODE_TXL: case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: return 1; - case SHADER_OPCODE_SHADER_TIME_ADD: - return 0; case FS_OPCODE_FB_WRITE: return 2; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: case FS_OPCODE_UNSPILL: return 1; case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD: - return inst->header_present; + return inst->mlen; case FS_OPCODE_SPILL: return 2; default: @@ -845,8 +837,8 @@ fs_visitor::setup_uniform_values(ir_variable *ir) * with our name, or the prefix of a component that starts with our name. */ unsigned params_before = c->prog_data.nr_params; - for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) { - struct gl_uniform_storage *storage = &prog->UniformStorage[u]; + for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) { + struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u]; if (strncmp(ir->name, storage->name, namelen) != 0 || (storage->name[namelen] != 0 && @@ -868,6 +860,7 @@ fs_visitor::setup_uniform_values(ir_variable *ir) /* Make sure we actually initialized the right amount of stuff here. */ assert(params_before + ir->type->component_slots() == c->prog_data.nr_params); + (void)params_before; } @@ -943,7 +936,7 @@ fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) emit(FS_OPCODE_LINTERP, wpos, this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], - interp_reg(FRAG_ATTRIB_WPOS, 2)); + interp_reg(VARYING_SLOT_POS, 2)); } wpos.reg_offset++; @@ -959,16 +952,24 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp, bool is_centroid) { brw_wm_barycentric_interp_mode barycoord_mode; - if (is_centroid) { - if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) - barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC; - else - barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC; + if (intel->gen >= 6) { + if (is_centroid) { + if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) + barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC; + else + barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC; + } else { + if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) + barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; + else + barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; + } } else { - if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) - barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; - else - barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; + /* On Ironlake and below, there is only one interpolation mode. + * Centroid interpolation doesn't mean anything on this hardware -- + * there is no multisampling. + */ + barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; } return emit(FS_OPCODE_LINTERP, attr, this->delta_x[barycoord_mode], @@ -1032,30 +1033,24 @@ fs_visitor::emit_general_interpolation(ir_variable *ir) * attribute, as well as making brw_vs_constval.c * handle varyings other than gl_TexCoord. */ - if (location >= FRAG_ATTRIB_TEX0 && - location <= FRAG_ATTRIB_TEX7 && - k == 3 && !(c->key.proj_attrib_mask & (1 << location))) { - emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f)); - } else { - struct brw_reg interp = interp_reg(location, k); - emit_linterp(attr, fs_reg(interp), interpolation_mode, - ir->centroid); - if (brw->needs_unlit_centroid_workaround && ir->centroid) { - /* Get the pixel/sample mask into f0 so that we know - * which pixels are lit. Then, for each channel that is - * unlit, replace the centroid data with non-centroid - * data. - */ - emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr); - fs_inst *inst = emit_linterp(attr, fs_reg(interp), - interpolation_mode, false); - inst->predicate = BRW_PREDICATE_NORMAL; - inst->predicate_inverse = true; - } - if (intel->gen < 6) { - emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); - } - } + struct brw_reg interp = interp_reg(location, k); + emit_linterp(attr, fs_reg(interp), interpolation_mode, + ir->centroid); + if (brw->needs_unlit_centroid_workaround && ir->centroid) { + /* Get the pixel/sample mask into f0 so that we know + * which pixels are lit. Then, for each channel that is + * unlit, replace the centroid data with non-centroid + * data. + */ + emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); + fs_inst *inst = emit_linterp(attr, fs_reg(interp), + interpolation_mode, false); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = true; + } + if (intel->gen < 6) { + emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); + } attr.reg_offset++; } @@ -1224,7 +1219,7 @@ fs_visitor::assign_curb_setup() constant_nr / 8, constant_nr % 8); - inst->src[i].file = FIXED_HW_REG; + inst->src[i].file = HW_REG; inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); } } @@ -1234,36 +1229,34 @@ fs_visitor::assign_curb_setup() void fs_visitor::calculate_urb_setup() { - for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { urb_setup[i] = -1; } int urb_next = 0; /* Figure out where each of the incoming setup attributes lands. */ if (intel->gen >= 6) { - for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { if (fp->Base.InputsRead & BITFIELD64_BIT(i)) { urb_setup[i] = urb_next++; } } } else { /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ - for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { /* Point size is packed into the header, not as a general attribute */ - if (i == VERT_RESULT_PSIZ) + if (i == VARYING_SLOT_PSIZ) continue; - if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { - int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i); - + if (c->key.input_slots_valid & BITFIELD64_BIT(i)) { /* The back color slot is skipped when the front color is * also written to. In addition, some slots can be * written in the vertex shader and not read in the * fragment shader. So the register number must always be * incremented, mapped or not. */ - if (fp_index >= 0) - urb_setup[fp_index] = urb_next; + if (_mesa_varying_slot_in_fs((gl_varying_slot) i)) + urb_setup[i] = urb_next; urb_next++; } } @@ -1274,8 +1267,8 @@ fs_visitor::calculate_urb_setup() * * See compile_sf_prog() for more info. */ - if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC)) - urb_setup[FRAG_ATTRIB_PNTC] = urb_next++; + if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC)) + urb_setup[VARYING_SLOT_PNTC] = urb_next++; } /* Each attribute is 4 setup channels, each of which is half a reg. */ @@ -1294,12 +1287,12 @@ fs_visitor::assign_urb_setup() fs_inst *inst = (fs_inst *)node; if (inst->opcode == FS_OPCODE_LINTERP) { - assert(inst->src[2].file == FIXED_HW_REG); + assert(inst->src[2].file == HW_REG); inst->src[2].fixed_hw_reg.nr += urb_start; } if (inst->opcode == FS_OPCODE_CINTERP) { - assert(inst->src[0].file == FIXED_HW_REG); + assert(inst->src[0].file == HW_REG); inst->src[0].fixed_hw_reg.nr += urb_start; } } @@ -1357,9 +1350,16 @@ fs_visitor::split_virtual_grfs() /* If there's a SEND message that requires contiguous destination * registers, no splitting is allowed. */ - if (inst->regs_written() > 1) { + if (inst->regs_written > 1) { split_grf[inst->dst.reg] = false; } + + /* If we're sending from a GRF, don't split it, on the assumption that + * the send is reading the whole thing. + */ + if (inst->is_send_from_grf()) { + split_grf[inst->src[0].reg] = false; + } } /* Allocate new space for split regs. Note that the virtual @@ -1456,8 +1456,8 @@ fs_visitor::compact_virtual_grfs() remap_table[i] = new_index; virtual_grf_sizes[new_index] = virtual_grf_sizes[i]; if (live_intervals_valid) { - virtual_grf_use[new_index] = virtual_grf_use[i]; - virtual_grf_def[new_index] = virtual_grf_def[i]; + virtual_grf_start[new_index] = virtual_grf_start[i]; + virtual_grf_end[new_index] = virtual_grf_end[i]; } ++new_index; } @@ -1617,15 +1617,13 @@ fs_visitor::move_uniform_array_access_to_pull_constants() base_ir = inst->ir; current_annotation = inst->annotation; - fs_reg offset = fs_reg(this, glsl_type::int_type); - inst->insert_before(ADD(offset, *inst->src[i].reladdr, - fs_reg(pull_constant_loc[uniform] + - inst->src[i].reg_offset))); - fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER); fs_reg temp = fs_reg(this, glsl_type::float_type); exec_list list = VARYING_PULL_CONSTANT_LOAD(temp, - surf_index, offset); + surf_index, + *inst->src[i].reladdr, + pull_constant_loc[uniform] + + inst->src[i].reg_offset); inst->insert_before(&list); inst->src[i].file = temp.file; @@ -1773,10 +1771,8 @@ fs_visitor::opt_algebraic() } /** - * Must be called after calculate_live_intervales() to remove unused - * writes to registers -- register allocation will fail otherwise - * because something deffed but not used won't be considered to - * interfere with other regs. + * Removes any instructions writing a VGRF where that VGRF is not used by any + * later instruction. */ bool fs_visitor::dead_code_eliminate() @@ -1789,9 +1785,12 @@ fs_visitor::dead_code_eliminate() foreach_list_safe(node, &this->instructions) { fs_inst *inst = (fs_inst *)node; - if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { - inst->remove(); - progress = true; + if (inst->dst.file == GRF) { + assert(this->virtual_grf_end[inst->dst.reg] >= pc); + if (this->virtual_grf_end[inst->dst.reg] == pc) { + inst->remove(); + progress = true; + } } pc++; @@ -1803,6 +1802,164 @@ fs_visitor::dead_code_eliminate() return progress; } +struct dead_code_hash_key +{ + int vgrf; + int reg_offset; +}; + +static bool +dead_code_hash_compare(const void *a, const void *b) +{ + return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0; +} + +static void +clear_dead_code_hash(struct hash_table *ht) +{ + struct hash_entry *entry; + + hash_table_foreach(ht, entry) { + _mesa_hash_table_remove(ht, entry); + } +} + +static void +insert_dead_code_hash(struct hash_table *ht, + int vgrf, int reg_offset, fs_inst *inst) +{ + /* We don't bother freeing keys, because they'll be GCed with the ht. */ + struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key); + + key->vgrf = vgrf; + key->reg_offset = reg_offset; + + _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst); +} + +static struct hash_entry * +get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset) +{ + struct dead_code_hash_key key; + + key.vgrf = vgrf; + key.reg_offset = reg_offset; + + return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key); +} + +static void +remove_dead_code_hash(struct hash_table *ht, + int vgrf, int reg_offset) +{ + struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset); + if (!entry) + return; + + _mesa_hash_table_remove(ht, entry); +} + +/** + * Walks basic blocks, removing any regs that are written but not read before + * being redefined. + * + * The dead_code_eliminate() function implements a global dead code + * elimination, but it only handles the removing the last write to a register + * if it's never read. This one can handle intermediate writes, but only + * within a basic block. + */ +bool +fs_visitor::dead_code_eliminate_local() +{ + struct hash_table *ht; + bool progress = false; + + ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare); + + foreach_list_safe(node, &this->instructions) { + fs_inst *inst = (fs_inst *)node; + + /* At a basic block, empty the HT since we don't understand dataflow + * here. + */ + if (inst->is_control_flow()) { + clear_dead_code_hash(ht); + continue; + } + + /* Clear the HT of any instructions that got read. */ + for (int i = 0; i < 3; i++) { + fs_reg src = inst->src[i]; + if (src.file != GRF) + continue; + + int read = 1; + if (inst->is_send_from_grf()) + read = virtual_grf_sizes[src.reg] - src.reg_offset; + + for (int reg_offset = src.reg_offset; + reg_offset < src.reg_offset + read; + reg_offset++) { + remove_dead_code_hash(ht, src.reg, reg_offset); + } + } + + /* Add any update of a GRF to the HT, removing a previous write if it + * wasn't read. + */ + if (inst->dst.file == GRF) { + if (inst->regs_written > 1) { + /* We don't know how to trim channels from an instruction's + * writes, so we can't incrementally remove unread channels from + * it. Just remove whatever it overwrites from the table + */ + for (int i = 0; i < inst->regs_written; i++) { + remove_dead_code_hash(ht, + inst->dst.reg, + inst->dst.reg_offset + i); + } + } else { + struct hash_entry *entry = + get_dead_code_hash_entry(ht, inst->dst.reg, + inst->dst.reg_offset); + + if (inst->is_partial_write()) { + /* For a partial write, we can't remove any previous dead code + * candidate, since we're just modifying their result, but we can + * be dead code eliminiated ourselves. + */ + if (entry) { + entry->data = inst; + } else { + insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset, + inst); + } + } else { + if (entry) { + /* We're completely updating a channel, and there was a + * previous write to the channel that wasn't read. Kill it! + */ + fs_inst *inst = (fs_inst *)entry->data; + inst->remove(); + progress = true; + _mesa_hash_table_remove(ht, entry); + } + + insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset, + inst); + } + } + } + } + + _mesa_hash_table_destroy(ht, NULL); + + if (progress) + live_intervals_valid = false; + + return progress; +} + /** * Implements a second type of register coalescing: This one checks if * the two regs involved in a raw move don't interfere, in which case @@ -1819,7 +1976,7 @@ fs_visitor::register_coalesce_2() fs_inst *inst = (fs_inst *)node; if (inst->opcode != BRW_OPCODE_MOV || - inst->predicate || + inst->is_partial_write() || inst->saturate || inst->src[0].file != GRF || inst->src[0].negate || @@ -1920,7 +2077,7 @@ fs_visitor::register_coalesce() continue; if (inst->opcode != BRW_OPCODE_MOV || - inst->predicate || + inst->is_partial_write() || inst->saturate || inst->dst.file != GRF || (inst->src[0].file != GRF && inst->src[0].file != UNIFORM)|| @@ -1929,6 +2086,7 @@ fs_visitor::register_coalesce() bool has_source_modifiers = (inst->src[0].abs || inst->src[0].negate || + inst->src[0].smear != -1 || inst->src[0].file == UNIFORM); /* Found a move of a GRF to a GRF. Let's see if we can coalesce @@ -2021,7 +2179,7 @@ fs_visitor::compute_to_mrf() next_ip++; if (inst->opcode != BRW_OPCODE_MOV || - inst->predicate || + inst->is_partial_write() || inst->dst.file != MRF || inst->src[0].file != GRF || inst->dst.type != inst->src[0].type || inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) @@ -2044,7 +2202,7 @@ fs_visitor::compute_to_mrf() /* Can't compute-to-MRF this GRF if someone else was going to * read it later. */ - if (this->virtual_grf_use[inst->src[0].reg] > ip) + if (this->virtual_grf_end[inst->src[0].reg] > ip) continue; /* Found a move of a GRF to a MRF. Let's see if we can go @@ -2060,32 +2218,25 @@ fs_visitor::compute_to_mrf() * into a compute-to-MRF. */ - /* SENDs can only write to GRFs, so no compute-to-MRF. */ - if (scan_inst->mlen) { - break; - } - - /* If it's predicated, it (probably) didn't populate all - * the channels. We might be able to rewrite everything + /* If this one instruction didn't populate all the + * channels, bail. We might be able to rewrite everything * that writes that reg, but it would require smarter * tracking to delay the rewriting until complete success. */ - if (scan_inst->predicate) + if (scan_inst->is_partial_write()) break; - /* If it's half of register setup and not the same half as - * our MOV we're trying to remove, bail for now. - */ - if (scan_inst->force_uncompressed != inst->force_uncompressed || - scan_inst->force_sechalf != inst->force_sechalf) { - break; - } + /* Things returning more than one register would need us to + * understand coalescing out more than one MOV at a time. + */ + if (scan_inst->regs_written > 1) + break; /* SEND instructions can't have MRF as a destination. */ if (scan_inst->mlen) break; - if (intel->gen >= 6) { + if (intel->gen == 6) { /* gen6 math instructions must have the destination be * GRF, so no compute-to-MRF for them. */ @@ -2235,7 +2386,7 @@ fs_visitor::remove_duplicate_mrf_writes() if (inst->opcode == BRW_OPCODE_MOV && inst->dst.file == MRF && inst->src[0].file == GRF && - !inst->predicate) { + !inst->is_partial_write()) { last_mrf_move[inst->dst.reg] = inst; } } @@ -2259,7 +2410,7 @@ clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps, int grf; if (inst->src[i].file == GRF) { grf = inst->src[i].reg; - } else if (inst->src[i].file == FIXED_HW_REG && + } else if (inst->src[i].file == HW_REG && inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { grf = inst->src[i].fixed_hw_reg.nr; } else { @@ -2294,7 +2445,8 @@ clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps, void fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst) { - int write_len = inst->regs_written() * dispatch_width / 8; + int reg_size = dispatch_width / 8; + int write_len = inst->regs_written * reg_size; int first_write_grf = inst->dst.reg; bool needs_dep[BRW_MAX_MRF]; assert(write_len < (int)sizeof(needs_dep) - 1); @@ -2323,6 +2475,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst) inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i)); } } + return; } bool scan_inst_16wide = (dispatch_width > 8 && @@ -2333,14 +2486,19 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst) * instruction but a MOV that might have left us an outstanding * dependency has more latency than a MOV. */ - if (scan_inst->dst.file == GRF && - scan_inst->dst.reg >= first_write_grf && - scan_inst->dst.reg < first_write_grf + write_len && - needs_dep[scan_inst->dst.reg - first_write_grf]) { - inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg)); - needs_dep[scan_inst->dst.reg - first_write_grf] = false; - if (scan_inst_16wide) - needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false; + if (scan_inst->dst.file == GRF) { + for (int i = 0; i < scan_inst->regs_written; i++) { + int reg = scan_inst->dst.reg + i * reg_size; + + if (reg >= first_write_grf && + reg < first_write_grf + write_len && + needs_dep[reg - first_write_grf]) { + inst->insert_before(DEP_RESOLVE_MOV(reg)); + needs_dep[reg - first_write_grf] = false; + if (scan_inst_16wide) + needs_dep[reg - first_write_grf + 1] = false; + } + } } /* Clear the flag for registers that actually got read (as expected). */ @@ -2368,7 +2526,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst) void fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst) { - int write_len = inst->regs_written() * dispatch_width / 8; + int write_len = inst->regs_written * dispatch_width / 8; int first_write_grf = inst->dst.reg; bool needs_dep[BRW_MAX_MRF]; assert(write_len < (int)sizeof(needs_dep) - 1); @@ -2387,6 +2545,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst) if (needs_dep[i]) scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i)); } + return; } /* Clear the flag for registers that actually got read (as expected). */ @@ -2455,6 +2614,11 @@ fs_visitor::insert_gen4_send_dependency_workarounds() * scheduling full flexibility, while the conversion to native instructions * allows the post-register-allocation scheduler the best information * possible. + * + * Note that execution masking for setting up pull constant loads is special: + * the channels that need to be written are unrelated to the current execution + * mask, since a later instruction will use one of the result channels as a + * source operand for all 8 or 16 of its channels. */ void fs_visitor::lower_uniform_pull_constant_loads() @@ -2466,33 +2630,36 @@ fs_visitor::lower_uniform_pull_constant_loads() continue; if (intel->gen >= 7) { + /* The offset arg before was a vec4-aligned byte offset. We need to + * turn it into a dword offset. + */ fs_reg const_offset_reg = inst->src[1]; assert(const_offset_reg.file == IMM && const_offset_reg.type == BRW_REGISTER_TYPE_UD); - const_offset_reg.imm.u /= 16; + const_offset_reg.imm.u /= 4; fs_reg payload = fs_reg(this, glsl_type::uint_type); - struct brw_reg g0 = retype(brw_vec8_grf(0, 0), - BRW_REGISTER_TYPE_UD); - - fs_inst *setup1 = MOV(payload, fs_reg(g0)); - setup1->force_writemask_all = true; - /* We don't need the second half of this vgrf to be filled with g1 - * in the 16-wide case, but if we use force_uncompressed then live - * variable analysis won't consider this a def! + + /* This is actually going to be a MOV, but since only the first dword + * is accessed, we have a special opcode to do just that one. Note + * that this needs to be an operation that will be considered a def + * by live variable analysis, or register allocation will explode. */ + fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET, + payload, const_offset_reg); + setup->force_writemask_all = true; - fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET, - payload, payload, - const_offset_reg); + setup->ir = inst->ir; + setup->annotation = inst->annotation; + inst->insert_before(setup); - setup1->ir = inst->ir; - setup1->annotation = inst->annotation; - inst->insert_before(setup1); - setup2->ir = inst->ir; - setup2->annotation = inst->annotation; - inst->insert_before(setup2); + /* Similarly, this will only populate the first 4 channels of the + * result register (since we only use smear values from 0-3), but we + * don't tell the optimizer. + */ inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7; inst->src[1] = payload; + + this->live_intervals_valid = false; } else { /* Before register allocation, we didn't tell the scheduler about the * MRF we use. We know it's safe to use this MRF because nothing @@ -2506,33 +2673,17 @@ fs_visitor::lower_uniform_pull_constant_loads() } void -fs_visitor::dump_instruction(fs_inst *inst) +fs_visitor::dump_instruction(backend_instruction *be_inst) { + fs_inst *inst = (fs_inst *)be_inst; + if (inst->predicate) { printf("(%cf0.%d) ", inst->predicate_inverse ? '-' : '+', inst->flag_subreg); } - if (inst->opcode < ARRAY_SIZE(opcode_descs) && - opcode_descs[inst->opcode].name) { - printf("%s", opcode_descs[inst->opcode].name); - } else { - switch (inst->opcode) { - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: - printf("uniform_pull_const"); - break; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: - printf("uniform_pull_const_gen7"); - break; - case FS_OPCODE_SET_GLOBAL_OFFSET: - printf("set_global_offset"); - break; - default: - printf("op%d", inst->opcode); - break; - } - } + printf("%s", brw_instruction_name(inst->opcode)); if (inst->saturate) printf(".sat"); if (inst->conditional_mod) { @@ -2628,17 +2779,6 @@ fs_visitor::dump_instruction(fs_inst *inst) printf("\n"); } -void -fs_visitor::dump_instructions() -{ - int ip = 0; - foreach_list(node, &this->instructions) { - fs_inst *inst = (fs_inst *)node; - printf("%d: ", ip++); - dump_instruction(inst); - } -} - /** * Possibly returns an instruction that set up @param reg. * @@ -2658,9 +2798,7 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start, fs_reg reg) { if (end == start || - end->predicate || - end->force_uncompressed || - end->force_sechalf || + end->is_partial_write() || reg.reladdr || !reg.equals(end->dst)) { return NULL; @@ -2672,9 +2810,8 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start, void fs_visitor::setup_payload_gen6() { - struct intel_context *intel = &brw->intel; bool uses_depth = - (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0; + (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0; unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes; assert(intel->gen >= 6); @@ -2775,8 +2912,7 @@ fs_visitor::run() if (failed) return false; - if (INTEL_DEBUG & DEBUG_SHADER_TIME) - emit_shader_time_end(); + emit(FS_OPCODE_PLACEHOLDER_HALT); emit_fb_writes(); @@ -2797,6 +2933,7 @@ fs_visitor::run() progress = opt_cse() || progress; progress = opt_copy_propagate() || progress; progress = dead_code_eliminate() || progress; + progress = dead_code_eliminate_local() || progress; progress = register_coalesce() || progress; progress = register_coalesce_2() || progress; progress = compute_to_mrf() || progress; @@ -2871,7 +3008,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, bool start_busy = false; float start_time = 0; - if (unlikely(INTEL_DEBUG & DEBUG_PERF)) { + if (unlikely(intel->perf_debug)) { start_busy = (intel->batch.last_bo && drm_intel_bo_busy(intel->batch.last_bo)); start_time = get_time(); @@ -2882,7 +3019,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; if (unlikely(INTEL_DEBUG & DEBUG_WM)) { - if (shader) { + if (prog) { printf("GLSL IR for native fragment shader %d:\n", prog->Name); _mesa_print_ir(shader->ir, NULL); printf("\n\n"); @@ -2897,11 +3034,13 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, */ fs_visitor v(brw, c, prog, fp, 8); if (!v.run()) { - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, v.fail_msg); + if (prog) { + prog->LinkStatus = false; + ralloc_strcat(&prog->InfoLog, v.fail_msg); + } _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", - v.fail_msg); + v.fail_msg); return NULL; } @@ -2926,7 +3065,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, simd16_instructions, final_assembly_size); - if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) { + if (unlikely(intel->perf_debug) && shader) { if (shader->compiled_once) brw_wm_debug_recompile(brw, prog, &c->key); shader->compiled_once = true; @@ -2969,28 +3108,20 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; } - if (prog->Name != 0) - key.proj_attrib_mask = 0xffffffff; - if (intel->gen < 6) - key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS); + key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS); - for (int i = 0; i < FRAG_ATTRIB_MAX; i++) { + for (int i = 0; i < VARYING_SLOT_MAX; i++) { if (!(fp->Base.InputsRead & BITFIELD64_BIT(i))) continue; - if (prog->Name == 0) - key.proj_attrib_mask |= 1 << i; - if (intel->gen < 6) { - int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i); - - if (vp_index >= 0) - key.vp_outputs_written |= BITFIELD64_BIT(vp_index); + if (_mesa_varying_slot_in_fs((gl_varying_slot) i)) + key.input_slots_valid |= BITFIELD64_BIT(i); } } - key.clamp_fragment_color = true; + key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT; for (int i = 0; i < MAX_SAMPLERS; i++) { if (fp->Base.ShadowSamplers & (1 << i)) { @@ -3003,11 +3134,11 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) } } - if (fp->Base.InputsRead & FRAG_BIT_WPOS) { + if (fp->Base.InputsRead & VARYING_BIT_POS) { key.drawable_height = ctx->DrawBuffer->Height; } - if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) { + if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) { key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); }