X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_fs.cpp;h=baaa25c13473171ae963b150c021cd92f908653b;hb=e290372542d0475e612e4d10a27b22eae3158ecd;hp=fcde3dad1b3a6c69ba915760f0edc69080fb004e;hpb=d5efc14635cf25bc130bfa77737913913d9202ce;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index fcde3dad1b3..baaa25c1347 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -32,6 +32,7 @@ extern "C" { #include +#include "main/hash_table.h" #include "main/macros.h" #include "main/shaderobj.h" #include "main/uniforms.h" @@ -60,6 +61,9 @@ fs_inst::init() this->src[0] = reg_undef; this->src[1] = reg_undef; this->src[2] = reg_undef; + + /* This will be the case for almost all instructions. */ + this->regs_written = 1; } fs_inst::fs_inst() @@ -146,6 +150,13 @@ fs_inst::fs_inst(enum opcode opcode, fs_reg dst, return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \ } +#define ALU3(op) \ + fs_inst * \ + fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \ + { \ + return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\ + } + ALU1(NOT) ALU1(MOV) ALU1(FRC) @@ -161,6 +172,14 @@ ALU2(XOR) ALU2(SHL) ALU2(SHR) ALU2(ASR) +ALU3(LRP) +ALU1(BFREV) +ALU3(BFE) +ALU2(BFI1) +ALU3(BFI2) +ALU1(FBH) +ALU1(FBL) +ALU1(CBIT) /** Gen4 predicated IF. 
*/ fs_inst * @@ -206,7 +225,7 @@ fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition) */ if (intel->gen == 4) { dst.type = src0.type; - if (dst.file == FIXED_HW_REG) + if (dst.file == HW_REG) dst.fixed_hw_reg.type = dst.type; } @@ -221,43 +240,82 @@ fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition) exec_list fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index, - fs_reg offset) + fs_reg varying_offset, + uint32_t const_offset) { exec_list instructions; fs_inst *inst; - if (intel->gen >= 7) { - inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7, - dst, surf_index, offset); - instructions.push_tail(inst); - } else { - int base_mrf = 13; - bool header_present = true; - - fs_reg mrf = fs_reg(MRF, base_mrf + header_present); - mrf.type = BRW_REGISTER_TYPE_D; - - /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a - * dword-aligned byte offset. + /* We have our constant surface use a pitch of 4 bytes, so our index can + * be any component of a vector, and then we load 4 contiguous + * components starting from that. + * + * We break down the const_offset to a portion added to the variable + * offset and a portion done using reg_offset, which means that if you + * have GLSL using something like "uniform vec4 a[20]; gl_FragColor = + * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and + * CSE can later notice that those loads are all the same and eliminate + * the redundant ones. + */ + fs_reg vec4_offset = fs_reg(this, glsl_type::int_type); + instructions.push_tail(ADD(vec4_offset, + varying_offset, const_offset & ~3)); + + int scale = 1; + if (intel->gen == 4 && dispatch_width == 8) { + /* Pre-gen5, we can either use a SIMD8 message that requires (header, + * u, v, r) as parameters, or we can just use the SIMD16 message + * consisting of (header, u). We choose the second, at the cost of a + * longer return length. 
*/ - if (intel->gen == 6) { - instructions.push_tail(MOV(mrf, offset)); - } else { - instructions.push_tail(MUL(mrf, offset, fs_reg(4))); - } - inst = MOV(mrf, offset); - inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD, - dst, surf_index); - inst->header_present = header_present; - inst->base_mrf = base_mrf; - inst->mlen = header_present + dispatch_width / 8; + scale = 2; + } - instructions.push_tail(inst); + enum opcode op; + if (intel->gen >= 7) + op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7; + else + op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD; + fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type); + inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset); + inst->regs_written = 4 * scale; + instructions.push_tail(inst); + + if (intel->gen < 7) { + inst->base_mrf = 13; + inst->header_present = true; + if (intel->gen == 4) + inst->mlen = 3; + else + inst->mlen = 1 + dispatch_width / 8; } + vec4_result.reg_offset += (const_offset & 3) * scale; + instructions.push_tail(MOV(dst, vec4_result)); + return instructions; } +/** + * A helper for MOV generation for fixing up broken hardware SEND dependency + * handling. + */ +fs_inst * +fs_visitor::DEP_RESOLVE_MOV(int grf) +{ + fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F)); + + inst->ir = NULL; + inst->annotation = "send dependency resolve"; + + /* The caller always wants uncompressed to emit the minimal extra + * dependencies, and to avoid having to deal with aligning its regs to 2. + */ + inst->force_uncompressed = true; + + return inst; +} + bool fs_inst::equals(fs_inst *inst) { @@ -279,58 +337,20 @@ fs_inst::equals(fs_inst *inst) offset == inst->offset); } -int -fs_inst::regs_written() -{ - if (is_tex()) - return 4; - - /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2, - * but we don't currently use them...nor do we have an opcode for them. 
- */ - - return 1; -} - bool fs_inst::overwrites_reg(const fs_reg &reg) { return (reg.file == dst.file && reg.reg == dst.reg && reg.reg_offset >= dst.reg_offset && - reg.reg_offset < dst.reg_offset + regs_written()); -} - -bool -fs_inst::is_tex() -{ - return (opcode == SHADER_OPCODE_TEX || - opcode == FS_OPCODE_TXB || - opcode == SHADER_OPCODE_TXD || - opcode == SHADER_OPCODE_TXF || - opcode == SHADER_OPCODE_TXL || - opcode == SHADER_OPCODE_TXS); -} - -bool -fs_inst::is_math() -{ - return (opcode == SHADER_OPCODE_RCP || - opcode == SHADER_OPCODE_RSQ || - opcode == SHADER_OPCODE_SQRT || - opcode == SHADER_OPCODE_EXP2 || - opcode == SHADER_OPCODE_LOG2 || - opcode == SHADER_OPCODE_SIN || - opcode == SHADER_OPCODE_COS || - opcode == SHADER_OPCODE_INT_QUOTIENT || - opcode == SHADER_OPCODE_INT_REMAINDER || - opcode == SHADER_OPCODE_POW); + reg.reg_offset < dst.reg_offset + regs_written); } bool fs_inst::is_send_from_grf() { return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 || + opcode == SHADER_OPCODE_SHADER_TIME_ADD || (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD && src[1].file == GRF)); } @@ -392,7 +412,7 @@ fs_reg::fs_reg(uint32_t u) fs_reg::fs_reg(struct brw_reg fixed_hw_reg) { init(); - this->file = FIXED_HW_REG; + this->file = HW_REG; this->fixed_hw_reg = fixed_hw_reg; this->type = fixed_hw_reg.type; } @@ -431,6 +451,12 @@ fs_reg::is_one() const return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1; } +bool +fs_reg::is_valid_3src() const +{ + return file == GRF || file == UNIFORM; +} + int fs_visitor::type_size(const struct glsl_type *type) { @@ -455,10 +481,14 @@ fs_visitor::type_size(const struct glsl_type *type) * link time. 
*/ return 0; - default: + case GLSL_TYPE_VOID: + case GLSL_TYPE_ERROR: + case GLSL_TYPE_INTERFACE: assert(!"not reached"); - return 0; + break; } + + return 0; } fs_reg @@ -555,31 +585,18 @@ void fs_visitor::emit_shader_time_write(enum shader_time_shader_type type, fs_reg value) { - /* Choose an index in the buffer and set up tracking information for our - * printouts. - */ - int shader_time_index = brw->shader_time.num_entries++; - assert(shader_time_index <= brw->shader_time.max_entries); - brw->shader_time.types[shader_time_index] = type; - if (prog) { - _mesa_reference_shader_program(ctx, - &brw->shader_time.programs[shader_time_index], - prog); - } - - int base_mrf = 6; + int shader_time_index = + brw_get_shader_time_index(brw, shader_prog, &fp->Base, type); + fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE); - fs_reg offset_mrf = fs_reg(MRF, base_mrf); - offset_mrf.type = BRW_REGISTER_TYPE_UD; - emit(MOV(offset_mrf, fs_reg(shader_time_index * 4))); - - fs_reg time_mrf = fs_reg(MRF, base_mrf + 1); - time_mrf.type = BRW_REGISTER_TYPE_UD; - emit(MOV(time_mrf, value)); + fs_reg payload; + if (dispatch_width == 8) + payload = fs_reg(this, glsl_type::uvec2_type); + else + payload = fs_reg(this, glsl_type::uint_type); - fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD)); - inst->base_mrf = base_mrf; - inst->mlen = 2; + emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD, + fs_reg(), payload, offset, value)); } void @@ -662,6 +679,22 @@ fs_visitor::pop_force_sechalf() assert(force_sechalf_stack >= 0); } +/** + * Returns true if the instruction has a flag that means it won't + * update an entire destination register. + * + * For example, dead code elimination and live variable analysis want to know + * when a write to a variable screens off any preceding values that were in + * it. 
+ */ +bool +fs_inst::is_partial_write() +{ + return (this->predicate || + this->force_uncompressed || + this->force_sechalf); +} + /** * Returns how many MRFs an FS opcode will write over. * @@ -691,18 +724,18 @@ fs_visitor::implied_mrf_writes(fs_inst *inst) case FS_OPCODE_TXB: case SHADER_OPCODE_TXD: case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_MS: case SHADER_OPCODE_TXL: case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: return 1; - case SHADER_OPCODE_SHADER_TIME_ADD: - return 0; case FS_OPCODE_FB_WRITE: return 2; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: case FS_OPCODE_UNSPILL: return 1; case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD: - return inst->header_present; + return inst->mlen; case FS_OPCODE_SPILL: return 2; default: @@ -792,57 +825,42 @@ fs_visitor::import_uniforms(fs_visitor *v) * get stored, rather than in some global gl_shader_program uniform * store. */ -int -fs_visitor::setup_uniform_values(int loc, const glsl_type *type) +void +fs_visitor::setup_uniform_values(ir_variable *ir) { - unsigned int offset = 0; - - if (type->is_matrix()) { - const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, - type->vector_elements, - 1); + int namelen = strlen(ir->name); - for (unsigned int i = 0; i < type->matrix_columns; i++) { - offset += setup_uniform_values(loc + offset, column); + /* The data for our (non-builtin) uniforms is stored in a series of + * gl_uniform_driver_storage structs for each subcomponent that + * glGetUniformLocation() could name. We know it's been set up in the same + * order we'd walk the type, so walk the list of storage and find anything + * with our name, or the prefix of a component that starts with our name. 
+ */ + unsigned params_before = c->prog_data.nr_params; + for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) { + struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u]; + + if (strncmp(ir->name, storage->name, namelen) != 0 || + (storage->name[namelen] != 0 && + storage->name[namelen] != '.' && + storage->name[namelen] != '[')) { + continue; } - return offset; - } - - switch (type->base_type) { - case GLSL_TYPE_FLOAT: - case GLSL_TYPE_UINT: - case GLSL_TYPE_INT: - case GLSL_TYPE_BOOL: - for (unsigned int i = 0; i < type->vector_elements; i++) { - unsigned int param = c->prog_data.nr_params++; + unsigned slots = storage->type->component_slots(); + if (storage->array_elements) + slots *= storage->array_elements; - this->param_index[param] = loc; - this->param_offset[param] = i; + for (unsigned i = 0; i < slots; i++) { + c->prog_data.param[c->prog_data.nr_params++] = + &storage->storage[i].f; } - return 1; - - case GLSL_TYPE_STRUCT: - for (unsigned int i = 0; i < type->length; i++) { - offset += setup_uniform_values(loc + offset, - type->fields.structure[i].type); - } - return offset; - - case GLSL_TYPE_ARRAY: - for (unsigned int i = 0; i < type->length; i++) { - offset += setup_uniform_values(loc + offset, type->fields.array); - } - return offset; - - case GLSL_TYPE_SAMPLER: - /* The sampler takes up a slot, but we don't use any values from it. */ - return 1; - - default: - assert(!"not reached"); - return 0; } + + /* Make sure we actually initialized the right amount of stuff here. 
*/ + assert(params_before + ir->type->component_slots() == + c->prog_data.nr_params); + (void)params_before; } @@ -874,9 +892,8 @@ fs_visitor::setup_builtin_uniform_values(ir_variable *ir) break; last_swiz = swiz; - this->param_index[c->prog_data.nr_params] = index; - this->param_offset[c->prog_data.nr_params] = swiz; - c->prog_data.nr_params++; + c->prog_data.param[c->prog_data.nr_params++] = + &fp->Base.Parameters->ParameterValues[index][swiz].f; } } } @@ -919,7 +936,7 @@ fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) emit(FS_OPCODE_LINTERP, wpos, this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], - interp_reg(FRAG_ATTRIB_WPOS, 2)); + interp_reg(VARYING_SLOT_POS, 2)); } wpos.reg_offset++; @@ -935,16 +952,24 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp, bool is_centroid) { brw_wm_barycentric_interp_mode barycoord_mode; - if (is_centroid) { - if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) - barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC; - else - barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC; + if (intel->gen >= 6) { + if (is_centroid) { + if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) + barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC; + else + barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC; + } else { + if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) + barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; + else + barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; + } } else { - if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) - barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; - else - barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; + /* On Ironlake and below, there is only one interpolation mode. + * Centroid interpolation doesn't mean anything on this hardware -- + * there is no multisampling. 
+ */ + barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; } return emit(FS_OPCODE_LINTERP, attr, this->delta_x[barycoord_mode], @@ -1008,30 +1033,24 @@ fs_visitor::emit_general_interpolation(ir_variable *ir) * attribute, as well as making brw_vs_constval.c * handle varyings other than gl_TexCoord. */ - if (location >= FRAG_ATTRIB_TEX0 && - location <= FRAG_ATTRIB_TEX7 && - k == 3 && !(c->key.proj_attrib_mask & (1 << location))) { - emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f)); - } else { - struct brw_reg interp = interp_reg(location, k); - emit_linterp(attr, fs_reg(interp), interpolation_mode, - ir->centroid); - if (brw->needs_unlit_centroid_workaround && ir->centroid) { - /* Get the pixel/sample mask into f0 so that we know - * which pixels are lit. Then, for each channel that is - * unlit, replace the centroid data with non-centroid - * data. - */ - emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr); - fs_inst *inst = emit_linterp(attr, fs_reg(interp), - interpolation_mode, false); - inst->predicate = BRW_PREDICATE_NORMAL; - inst->predicate_inverse = true; - } - if (intel->gen < 6) { - emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); - } - } + struct brw_reg interp = interp_reg(location, k); + emit_linterp(attr, fs_reg(interp), interpolation_mode, + ir->centroid); + if (brw->needs_unlit_centroid_workaround && ir->centroid) { + /* Get the pixel/sample mask into f0 so that we know + * which pixels are lit. Then, for each channel that is + * unlit, replace the centroid data with non-centroid + * data. 
+ */ + emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); + fs_inst *inst = emit_linterp(attr, fs_reg(interp), + interpolation_mode, false); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = true; + } + if (intel->gen < 6) { + emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); + } attr.reg_offset++; } @@ -1179,25 +1198,6 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1) return inst; } -/** - * To be called after the last _mesa_add_state_reference() call, to - * set up prog_data.param[] for assign_curb_setup() and - * setup_pull_constants(). - */ -void -fs_visitor::setup_paramvalues_refs() -{ - if (dispatch_width != 8) - return; - - /* Set up the pointers to ParamValues now that that array is finalized. */ - for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { - c->prog_data.param[i] = - (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] + - this->param_offset[i]; - } -} - void fs_visitor::assign_curb_setup() { @@ -1219,7 +1219,7 @@ fs_visitor::assign_curb_setup() constant_nr / 8, constant_nr % 8); - inst->src[i].file = FIXED_HW_REG; + inst->src[i].file = HW_REG; inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); } } @@ -1229,36 +1229,34 @@ fs_visitor::assign_curb_setup() void fs_visitor::calculate_urb_setup() { - for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { urb_setup[i] = -1; } int urb_next = 0; /* Figure out where each of the incoming setup attributes lands. */ if (intel->gen >= 6) { - for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { if (fp->Base.InputsRead & BITFIELD64_BIT(i)) { urb_setup[i] = urb_next++; } } } else { /* FINISHME: The sf doesn't map VS->FS inputs for us very well. 
*/ - for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { /* Point size is packed into the header, not as a general attribute */ - if (i == VERT_RESULT_PSIZ) + if (i == VARYING_SLOT_PSIZ) continue; - if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { - int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i); - + if (c->key.input_slots_valid & BITFIELD64_BIT(i)) { /* The back color slot is skipped when the front color is * also written to. In addition, some slots can be * written in the vertex shader and not read in the * fragment shader. So the register number must always be * incremented, mapped or not. */ - if (fp_index >= 0) - urb_setup[fp_index] = urb_next; + if (_mesa_varying_slot_in_fs((gl_varying_slot) i)) + urb_setup[i] = urb_next; urb_next++; } } @@ -1269,8 +1267,8 @@ fs_visitor::calculate_urb_setup() * * See compile_sf_prog() for more info. */ - if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC)) - urb_setup[FRAG_ATTRIB_PNTC] = urb_next++; + if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC)) + urb_setup[VARYING_SLOT_PNTC] = urb_next++; } /* Each attribute is 4 setup channels, each of which is half a reg. */ @@ -1289,12 +1287,12 @@ fs_visitor::assign_urb_setup() fs_inst *inst = (fs_inst *)node; if (inst->opcode == FS_OPCODE_LINTERP) { - assert(inst->src[2].file == FIXED_HW_REG); + assert(inst->src[2].file == HW_REG); inst->src[2].fixed_hw_reg.nr += urb_start; } if (inst->opcode == FS_OPCODE_CINTERP) { - assert(inst->src[0].file == FIXED_HW_REG); + assert(inst->src[0].file == HW_REG); inst->src[0].fixed_hw_reg.nr += urb_start; } } @@ -1352,9 +1350,16 @@ fs_visitor::split_virtual_grfs() /* If there's a SEND message that requires contiguous destination * registers, no splitting is allowed. 
*/ - if (inst->regs_written() > 1) { + if (inst->regs_written > 1) { split_grf[inst->dst.reg] = false; } + + /* If we're sending from a GRF, don't split it, on the assumption that + * the send is reading the whole thing. + */ + if (inst->is_send_from_grf()) { + split_grf[inst->src[0].reg] = false; + } } /* Allocate new space for split regs. Note that the virtual @@ -1451,8 +1456,8 @@ fs_visitor::compact_virtual_grfs() remap_table[i] = new_index; virtual_grf_sizes[new_index] = virtual_grf_sizes[i]; if (live_intervals_valid) { - virtual_grf_use[new_index] = virtual_grf_use[i]; - virtual_grf_def[new_index] = virtual_grf_def[i]; + virtual_grf_start[new_index] = virtual_grf_start[i]; + virtual_grf_end[new_index] = virtual_grf_end[i]; } ++new_index; } @@ -1528,9 +1533,6 @@ fs_visitor::remove_dead_constants() if (remapped == -1) continue; - /* We've already done setup_paramvalues_refs() so no need to worry - * about param_index and param_offset. - */ c->prog_data.param[remapped] = c->prog_data.param[i]; } @@ -1615,15 +1617,13 @@ fs_visitor::move_uniform_array_access_to_pull_constants() base_ir = inst->ir; current_annotation = inst->annotation; - fs_reg offset = fs_reg(this, glsl_type::int_type); - inst->insert_before(ADD(offset, *inst->src[i].reladdr, - fs_reg(pull_constant_loc[uniform] + - inst->src[i].reg_offset))); - fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER); fs_reg temp = fs_reg(this, glsl_type::float_type); exec_list list = VARYING_PULL_CONSTANT_LOAD(temp, - surf_index, offset); + surf_index, + *inst->src[i].reladdr, + pull_constant_loc[uniform] + + inst->src[i].reg_offset); inst->insert_before(&list); inst->src[i].file = temp.file; @@ -1708,8 +1708,6 @@ fs_visitor::setup_pull_constants() dst, index, offset); pull->ir = inst->ir; pull->annotation = inst->annotation; - pull->base_mrf = 14; - pull->mlen = 1; inst->insert_before(pull); @@ -1773,10 +1771,8 @@ fs_visitor::opt_algebraic() } /** - * Must be called after calculate_live_intervales() 
to remove unused - * writes to registers -- register allocation will fail otherwise - * because something deffed but not used won't be considered to - * interfere with other regs. + * Removes any instructions writing a VGRF where that VGRF is not used by any + * later instruction. */ bool fs_visitor::dead_code_eliminate() @@ -1789,9 +1785,12 @@ fs_visitor::dead_code_eliminate() foreach_list_safe(node, &this->instructions) { fs_inst *inst = (fs_inst *)node; - if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { - inst->remove(); - progress = true; + if (inst->dst.file == GRF) { + assert(this->virtual_grf_end[inst->dst.reg] >= pc); + if (this->virtual_grf_end[inst->dst.reg] == pc) { + inst->remove(); + progress = true; + } } pc++; @@ -1803,6 +1802,164 @@ fs_visitor::dead_code_eliminate() return progress; } +struct dead_code_hash_key +{ + int vgrf; + int reg_offset; +}; + +static bool +dead_code_hash_compare(const void *a, const void *b) +{ + return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0; +} + +static void +clear_dead_code_hash(struct hash_table *ht) +{ + struct hash_entry *entry; + + hash_table_foreach(ht, entry) { + _mesa_hash_table_remove(ht, entry); + } +} + +static void +insert_dead_code_hash(struct hash_table *ht, + int vgrf, int reg_offset, fs_inst *inst) +{ + /* We don't bother freeing keys, because they'll be GCed with the ht. 
+ */ + struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key); + + key->vgrf = vgrf; + key->reg_offset = reg_offset; + + _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst); +} + +static struct hash_entry * +get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset) +{ + struct dead_code_hash_key key; + + key.vgrf = vgrf; + key.reg_offset = reg_offset; + + return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key); +} + +static void +remove_dead_code_hash(struct hash_table *ht, + int vgrf, int reg_offset) +{ + struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset); + if (!entry) + return; + + _mesa_hash_table_remove(ht, entry); +} + +/** + * Walks basic blocks, removing any regs that are written but not read before + * being redefined. + * + * The dead_code_eliminate() function implements a global dead code + * elimination, but it only handles removing the last write to a register + * if it's never read. This one can handle intermediate writes, but only + * within a basic block. + */ +bool +fs_visitor::dead_code_eliminate_local() +{ + struct hash_table *ht; + bool progress = false; + + ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare); + + foreach_list_safe(node, &this->instructions) { + fs_inst *inst = (fs_inst *)node; + + /* At a basic block boundary, empty the HT since we don't understand dataflow + * here. + */ + if (inst->is_control_flow()) { + clear_dead_code_hash(ht); + continue; + } + + /* Clear the HT of any instructions that got read. 
+ */ + for (int i = 0; i < 3; i++) { + fs_reg src = inst->src[i]; + if (src.file != GRF) + continue; + + int read = 1; + if (inst->is_send_from_grf()) + read = virtual_grf_sizes[src.reg] - src.reg_offset; + + for (int reg_offset = src.reg_offset; + reg_offset < src.reg_offset + read; + reg_offset++) { + remove_dead_code_hash(ht, src.reg, reg_offset); + } + } + + /* Add any update of a GRF to the HT, removing a previous write if it + * wasn't read. + */ + if (inst->dst.file == GRF) { + if (inst->regs_written > 1) { + /* We don't know how to trim channels from an instruction's + * writes, so we can't incrementally remove unread channels from + * it. Just remove whatever it overwrites from the table. + */ + for (int i = 0; i < inst->regs_written; i++) { + remove_dead_code_hash(ht, + inst->dst.reg, + inst->dst.reg_offset + i); + } + } else { + struct hash_entry *entry = + get_dead_code_hash_entry(ht, inst->dst.reg, + inst->dst.reg_offset); + + if (inst->is_partial_write()) { + /* For a partial write, we can't remove any previous dead code + * candidate, since we're just modifying their result, but we can + * be dead code eliminated ourselves. + */ + if (entry) { + entry->data = inst; + } else { + insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset, + inst); + } + } else { + if (entry) { + /* We're completely updating a channel, and there was a + * previous write to the channel that wasn't read. Kill it! 
+ */ + fs_inst *inst = (fs_inst *)entry->data; + inst->remove(); + progress = true; + _mesa_hash_table_remove(ht, entry); + } + + insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset, + inst); + } + } + } + } + + _mesa_hash_table_destroy(ht, NULL); + + if (progress) + live_intervals_valid = false; + + return progress; +} + /** * Implements a second type of register coalescing: This one checks if * the two regs involved in a raw move don't interfere, in which case @@ -1819,7 +1976,7 @@ fs_visitor::register_coalesce_2() fs_inst *inst = (fs_inst *)node; if (inst->opcode != BRW_OPCODE_MOV || - inst->predicate || + inst->is_partial_write() || inst->saturate || inst->src[0].file != GRF || inst->src[0].negate || @@ -1920,7 +2077,7 @@ fs_visitor::register_coalesce() continue; if (inst->opcode != BRW_OPCODE_MOV || - inst->predicate || + inst->is_partial_write() || inst->saturate || inst->dst.file != GRF || (inst->src[0].file != GRF && inst->src[0].file != UNIFORM)|| @@ -1929,6 +2086,7 @@ fs_visitor::register_coalesce() bool has_source_modifiers = (inst->src[0].abs || inst->src[0].negate || + inst->src[0].smear != -1 || inst->src[0].file == UNIFORM); /* Found a move of a GRF to a GRF. Let's see if we can coalesce @@ -2021,7 +2179,7 @@ fs_visitor::compute_to_mrf() next_ip++; if (inst->opcode != BRW_OPCODE_MOV || - inst->predicate || + inst->is_partial_write() || inst->dst.file != MRF || inst->src[0].file != GRF || inst->dst.type != inst->src[0].type || inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) @@ -2044,7 +2202,7 @@ fs_visitor::compute_to_mrf() /* Can't compute-to-MRF this GRF if someone else was going to * read it later. */ - if (this->virtual_grf_use[inst->src[0].reg] > ip) + if (this->virtual_grf_end[inst->src[0].reg] > ip) continue; /* Found a move of a GRF to a MRF. Let's see if we can go @@ -2060,32 +2218,25 @@ fs_visitor::compute_to_mrf() * into a compute-to-MRF. */ - /* SENDs can only write to GRFs, so no compute-to-MRF. 
*/ - if (scan_inst->mlen) { - break; - } - - /* If it's predicated, it (probably) didn't populate all - * the channels. We might be able to rewrite everything + /* If this one instruction didn't populate all the + * channels, bail. We might be able to rewrite everything * that writes that reg, but it would require smarter * tracking to delay the rewriting until complete success. */ - if (scan_inst->predicate) + if (scan_inst->is_partial_write()) break; - /* If it's half of register setup and not the same half as - * our MOV we're trying to remove, bail for now. - */ - if (scan_inst->force_uncompressed != inst->force_uncompressed || - scan_inst->force_sechalf != inst->force_sechalf) { - break; - } + /* Things returning more than one register would need us to + * understand coalescing out more than one MOV at a time. + */ + if (scan_inst->regs_written > 1) + break; /* SEND instructions can't have MRF as a destination. */ if (scan_inst->mlen) break; - if (intel->gen >= 6) { + if (intel->gen == 6) { /* gen6 math instructions must have the destination be * GRF, so no compute-to-MRF for them. */ @@ -2105,16 +2256,12 @@ fs_visitor::compute_to_mrf() break; } - /* We don't handle flow control here. Most computation of + /* We don't handle control flow here. Most computation of * values that end up in MRFs are shortly before the MRF * write anyway. */ - if (scan_inst->opcode == BRW_OPCODE_DO || - scan_inst->opcode == BRW_OPCODE_WHILE || - scan_inst->opcode == BRW_OPCODE_ELSE || - scan_inst->opcode == BRW_OPCODE_ENDIF) { + if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF) break; - } /* You can't read from an MRF, so if someone else reads our * MRF's source GRF that we wanted to rewrite, that stops us. 
@@ -2198,16 +2345,8 @@ fs_visitor::remove_duplicate_mrf_writes() foreach_list_safe(node, &this->instructions) { fs_inst *inst = (fs_inst *)node; - switch (inst->opcode) { - case BRW_OPCODE_DO: - case BRW_OPCODE_WHILE: - case BRW_OPCODE_IF: - case BRW_OPCODE_ELSE: - case BRW_OPCODE_ENDIF: + if (inst->is_control_flow()) { memset(last_mrf_move, 0, sizeof(last_mrf_move)); - continue; - default: - break; } if (inst->opcode == BRW_OPCODE_MOV && @@ -2247,7 +2386,7 @@ fs_visitor::remove_duplicate_mrf_writes() if (inst->opcode == BRW_OPCODE_MOV && inst->dst.file == MRF && inst->src[0].file == GRF && - !inst->predicate) { + !inst->is_partial_write()) { last_mrf_move[inst->dst.reg] = inst; } } @@ -2258,21 +2397,293 @@ fs_visitor::remove_duplicate_mrf_writes() return progress; } +static void +clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps, + int first_grf, int grf_len) +{ + bool inst_16wide = (dispatch_width > 8 && + !inst->force_uncompressed && + !inst->force_sechalf); + + /* Clear the flag for registers that actually got read (as expected). */ + for (int i = 0; i < 3; i++) { + int grf; + if (inst->src[i].file == GRF) { + grf = inst->src[i].reg; + } else if (inst->src[i].file == HW_REG && + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { + grf = inst->src[i].fixed_hw_reg.nr; + } else { + continue; + } + + if (grf >= first_grf && + grf < first_grf + grf_len) { + deps[grf - first_grf] = false; + if (inst_16wide) + deps[grf - first_grf + 1] = false; + } + } +} + +/** + * Implements this workaround for the original 965: + * + * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not + * check for post destination dependencies on this instruction, software + * must ensure that there is no destination hazard for the case of ‘write + * followed by a posted write’ shown in the following example. + * + * 1. mov r3 0 + * 2. send r3.xy + * 3. 
mov r2 r3 + * + * Due to no post-destination dependency check on the ‘send’, the above + * code sequence could have two instructions (1 and 2) in flight at the + * same time that both consider ‘r3’ as the target of their final writes. + */ +void +fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst) +{ + int reg_size = dispatch_width / 8; + int write_len = inst->regs_written * reg_size; + int first_write_grf = inst->dst.reg; + bool needs_dep[BRW_MAX_MRF]; + assert(write_len < (int)sizeof(needs_dep) - 1); + + memset(needs_dep, false, sizeof(needs_dep)); + memset(needs_dep, true, write_len); + + clear_deps_for_inst_src(inst, dispatch_width, + needs_dep, first_write_grf, write_len); + + /* Walk backwards looking for writes to registers we're writing which + * aren't read since being written. If we hit the start of the program, + * we assume that there are no outstanding dependencies on entry to the + * program. + */ + for (fs_inst *scan_inst = (fs_inst *)inst->prev; + scan_inst != NULL; + scan_inst = (fs_inst *)scan_inst->prev) { + + /* If we hit control flow, assume that there *are* outstanding + * dependencies, and force their cleanup before our instruction. + */ + if (scan_inst->is_control_flow()) { + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) { + inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i)); + } + } + return; + } + + bool scan_inst_16wide = (dispatch_width > 8 && + !scan_inst->force_uncompressed && + !scan_inst->force_sechalf); + + /* We insert our reads as late as possible on the assumption that any + * instruction but a MOV that might have left us an outstanding + * dependency has more latency than a MOV. 
+ */ + if (scan_inst->dst.file == GRF) { + for (int i = 0; i < scan_inst->regs_written; i++) { + int reg = scan_inst->dst.reg + i * reg_size; + + if (reg >= first_write_grf && + reg < first_write_grf + write_len && + needs_dep[reg - first_write_grf]) { + inst->insert_before(DEP_RESOLVE_MOV(reg)); + needs_dep[reg - first_write_grf] = false; + if (scan_inst_16wide) + needs_dep[reg - first_write_grf + 1] = false; + } + } + } + + /* Clear the flag for registers that actually got read (as expected). */ + clear_deps_for_inst_src(scan_inst, dispatch_width, + needs_dep, first_write_grf, write_len); + + /* Continue the loop only if we haven't resolved all the dependencies */ + int i; + for (i = 0; i < write_len; i++) { + if (needs_dep[i]) + break; + } + if (i == write_len) + return; + } +} + +/** + * Implements this workaround for the original 965: + * + * "[DevBW, DevCL] Errata: A destination register from a send can not be + * used as a destination register until after it has been sourced by an + * instruction with a different destination register. + */ +void +fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst) +{ + int write_len = inst->regs_written * dispatch_width / 8; + int first_write_grf = inst->dst.reg; + bool needs_dep[BRW_MAX_MRF]; + assert(write_len < (int)sizeof(needs_dep) - 1); + + memset(needs_dep, false, sizeof(needs_dep)); + memset(needs_dep, true, write_len); + /* Walk forwards looking for writes to registers we're writing which aren't + * read before being written. + */ + for (fs_inst *scan_inst = (fs_inst *)inst->next; + !scan_inst->is_tail_sentinel(); + scan_inst = (fs_inst *)scan_inst->next) { + /* If we hit control flow, force resolve all remaining dependencies. */ + if (scan_inst->is_control_flow()) { + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) + scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i)); + } + return; + } + + /* Clear the flag for registers that actually got read (as expected). 
*/ + clear_deps_for_inst_src(scan_inst, dispatch_width, + needs_dep, first_write_grf, write_len); + + /* We insert our reads as late as possible since they're reading the + * result of a SEND, which has massive latency. + */ + if (scan_inst->dst.file == GRF && + scan_inst->dst.reg >= first_write_grf && + scan_inst->dst.reg < first_write_grf + write_len && + needs_dep[scan_inst->dst.reg - first_write_grf]) { + scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg)); + needs_dep[scan_inst->dst.reg - first_write_grf] = false; + } + + /* Continue the loop only if we haven't resolved all the dependencies */ + int i; + for (i = 0; i < write_len; i++) { + if (needs_dep[i]) + break; + } + if (i == write_len) + return; + } + + /* If we hit the end of the program, resolve all remaining dependencies out + * of paranoia. + */ + fs_inst *last_inst = (fs_inst *)this->instructions.get_tail(); + assert(last_inst->eot); + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) + last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i)); + } +} + +void +fs_visitor::insert_gen4_send_dependency_workarounds() +{ + if (intel->gen != 4 || intel->is_g4x) + return; + + /* Note that we're done with register allocation, so GRF fs_regs always + * have a .reg_offset of 0. + */ + + foreach_list_safe(node, &this->instructions) { + fs_inst *inst = (fs_inst *)node; + + if (inst->mlen != 0 && inst->dst.file == GRF) { + insert_gen4_pre_send_dependency_workarounds(inst); + insert_gen4_post_send_dependency_workarounds(inst); + } + } +} + +/** + * Turns the generic expression-style uniform pull constant load instruction + * into a hardware-specific series of instructions for loading a pull + * constant. 
+ * + * The expression style allows the CSE pass before this to optimize out + * repeated loads from the same offset, and gives the pre-register-allocation + * scheduling full flexibility, while the conversion to native instructions + * allows the post-register-allocation scheduler the best information + * possible. + * + * Note that execution masking for setting up pull constant loads is special: + * the channels that need to be written are unrelated to the current execution + * mask, since a later instruction will use one of the result channels as a + * source operand for all 8 or 16 of its channels. + */ +void +fs_visitor::lower_uniform_pull_constant_loads() +{ + foreach_list(node, &this->instructions) { + fs_inst *inst = (fs_inst *)node; + + if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD) + continue; + + if (intel->gen >= 7) { + /* The offset arg before was a vec4-aligned byte offset. We need to + * turn it into a dword offset. + */ + fs_reg const_offset_reg = inst->src[1]; + assert(const_offset_reg.file == IMM && + const_offset_reg.type == BRW_REGISTER_TYPE_UD); + const_offset_reg.imm.u /= 4; + fs_reg payload = fs_reg(this, glsl_type::uint_type); + + /* This is actually going to be a MOV, but since only the first dword + * is accessed, we have a special opcode to do just that one. Note + * that this needs to be an operation that will be considered a def + * by live variable analysis, or register allocation will explode. + */ + fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET, + payload, const_offset_reg); + setup->force_writemask_all = true; + + setup->ir = inst->ir; + setup->annotation = inst->annotation; + inst->insert_before(setup); + + /* Similarly, this will only populate the first 4 channels of the + * result register (since we only use smear values from 0-3), but we + * don't tell the optimizer. 
+ */ + inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7; + inst->src[1] = payload; + + this->live_intervals_valid = false; + } else { + /* Before register allocation, we didn't tell the scheduler about the + * MRF we use. We know it's safe to use this MRF because nothing + * else does except for register spill/unspill, which generates and + * uses its MRF within a single IR instruction. + */ + inst->base_mrf = 14; + inst->mlen = 1; + } + } +} + void -fs_visitor::dump_instruction(fs_inst *inst) +fs_visitor::dump_instruction(backend_instruction *be_inst) { + fs_inst *inst = (fs_inst *)be_inst; + if (inst->predicate) { printf("(%cf0.%d) ", inst->predicate_inverse ? '-' : '+', inst->flag_subreg); } - if (inst->opcode < ARRAY_SIZE(opcode_descs) && - opcode_descs[inst->opcode].name) { - printf("%s", opcode_descs[inst->opcode].name); - } else { - printf("op%d", inst->opcode); - } + printf("%s", brw_instruction_name(inst->opcode)); if (inst->saturate) printf(".sat"); if (inst->conditional_mod) { @@ -2330,6 +2741,22 @@ fs_visitor::dump_instruction(fs_inst *inst) case BAD_FILE: printf("(null)"); break; + case IMM: + switch (inst->src[i].type) { + case BRW_REGISTER_TYPE_F: + printf("%ff", inst->src[i].imm.f); + break; + case BRW_REGISTER_TYPE_D: + printf("%dd", inst->src[i].imm.i); + break; + case BRW_REGISTER_TYPE_UD: + printf("%uu", inst->src[i].imm.u); + break; + default: + printf("???"); + break; + } + break; default: printf("???"); break; @@ -2352,17 +2779,6 @@ fs_visitor::dump_instruction(fs_inst *inst) printf("\n"); } -void -fs_visitor::dump_instructions() -{ - int ip = 0; - foreach_list(node, &this->instructions) { - fs_inst *inst = (fs_inst *)node; - printf("%d: ", ip++); - dump_instruction(inst); - } -} - /** * Possibly returns an instruction that set up @param reg. 
* @@ -2382,9 +2798,7 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start, fs_reg reg) { if (end == start || - end->predicate || - end->force_uncompressed || - end->force_sechalf || + end->is_partial_write() || reg.reladdr || !reg.equals(end->dst)) { return NULL; @@ -2396,9 +2810,8 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start, void fs_visitor::setup_payload_gen6() { - struct intel_context *intel = &brw->intel; bool uses_depth = - (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0; + (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0; unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes; assert(intel->gen >= 6); @@ -2499,14 +2912,12 @@ fs_visitor::run() if (failed) return false; - if (INTEL_DEBUG & DEBUG_SHADER_TIME) - emit_shader_time_end(); + emit(FS_OPCODE_PLACEHOLDER_HALT); emit_fb_writes(); split_virtual_grfs(); - setup_paramvalues_refs(); move_uniform_array_access_to_pull_constants(); setup_pull_constants(); @@ -2522,6 +2933,7 @@ fs_visitor::run() progress = opt_cse() || progress; progress = opt_copy_propagate() || progress; progress = dead_code_eliminate() || progress; + progress = dead_code_eliminate_local() || progress; progress = register_coalesce() || progress; progress = register_coalesce_2() || progress; progress = compute_to_mrf() || progress; @@ -2531,6 +2943,8 @@ fs_visitor::run() schedule_instructions(false); + lower_uniform_pull_constant_loads(); + assign_curb_setup(); assign_urb_setup(); @@ -2553,6 +2967,12 @@ fs_visitor::run() assert(force_uncompressed_stack == 0); assert(force_sechalf_stack == 0); + /* This must come after all optimization and register allocation, since + * it inserts dead code that happens to have side effects, and it does + * so based on the actual physical registers in use. 
+ */ + insert_gen4_send_dependency_workarounds(); + if (failed) return false; @@ -2588,7 +3008,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, bool start_busy = false; float start_time = 0; - if (unlikely(INTEL_DEBUG & DEBUG_PERF)) { + if (unlikely(intel->perf_debug)) { start_busy = (intel->batch.last_bo && drm_intel_bo_busy(intel->batch.last_bo)); start_time = get_time(); @@ -2599,7 +3019,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; if (unlikely(INTEL_DEBUG & DEBUG_WM)) { - if (shader) { + if (prog) { printf("GLSL IR for native fragment shader %d:\n", prog->Name); _mesa_print_ir(shader->ir, NULL); printf("\n\n"); @@ -2614,18 +3034,21 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, */ fs_visitor v(brw, c, prog, fp, 8); if (!v.run()) { - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, v.fail_msg); + if (prog) { + prog->LinkStatus = false; + ralloc_strcat(&prog->InfoLog, v.fail_msg); + } _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", - v.fail_msg); + v.fail_msg); return NULL; } exec_list *simd16_instructions = NULL; fs_visitor v2(brw, c, prog, fp, 16); - if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) { + bool no16 = INTEL_DEBUG & DEBUG_NO16; + if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) { v2.import_uniforms(&v); if (!v2.run()) { perf_debug("16-wide shader failed to compile, falling back to " @@ -2642,7 +3065,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, simd16_instructions, final_assembly_size); - if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) { + if (unlikely(intel->perf_debug) && shader) { if (shader->compiled_once) brw_wm_debug_recompile(brw, prog, &c->key); shader->compiled_once = true; @@ -2685,28 +3108,20 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; } - if 
(prog->Name != 0) - key.proj_attrib_mask = 0xffffffff; - if (intel->gen < 6) - key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS); + key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS); - for (int i = 0; i < FRAG_ATTRIB_MAX; i++) { + for (int i = 0; i < VARYING_SLOT_MAX; i++) { if (!(fp->Base.InputsRead & BITFIELD64_BIT(i))) continue; - if (prog->Name == 0) - key.proj_attrib_mask |= 1 << i; - if (intel->gen < 6) { - int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i); - - if (vp_index >= 0) - key.vp_outputs_written |= BITFIELD64_BIT(vp_index); + if (_mesa_varying_slot_in_fs((gl_varying_slot) i)) + key.input_slots_valid |= BITFIELD64_BIT(i); } } - key.clamp_fragment_color = true; + key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT; for (int i = 0; i < MAX_SAMPLERS; i++) { if (fp->Base.ShadowSamplers & (1 << i)) { @@ -2719,11 +3134,11 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) } } - if (fp->Base.InputsRead & FRAG_BIT_WPOS) { + if (fp->Base.InputsRead & VARYING_BIT_POS) { key.drawable_height = ctx->DrawBuffer->Height; } - if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) { + if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) { key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); }