X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_fs_generator.cpp;h=05a2db4e5ac07e993936ebeed74cd7cd92356807;hb=46c35c61e9c5c1b56fdd9fcd4eb45591dd16d21d;hp=6efd41cdea28f34c709ef67f6259daf0cd626495;hpb=a39622452069b76f7562472cc15ddefd2db4b503;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 6efd41cdea2..05a2db4e5ac 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -27,27 +27,114 @@ * native instructions. */ -extern "C" { #include "main/macros.h" #include "brw_context.h" #include "brw_eu.h" -} /* extern "C" */ - #include "brw_fs.h" #include "brw_cfg.h" +static uint32_t brw_file_from_reg(fs_reg *reg) +{ + switch (reg->file) { + case GRF: + return BRW_GENERAL_REGISTER_FILE; + case MRF: + return BRW_MESSAGE_REGISTER_FILE; + case IMM: + return BRW_IMMEDIATE_VALUE; + default: + unreachable("not reached"); + } +} + +static struct brw_reg +brw_reg_from_fs_reg(fs_reg *reg) +{ + struct brw_reg brw_reg; + + switch (reg->file) { + case GRF: + case MRF: + if (reg->stride == 0) { + brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0); + } else if (reg->width < 8) { + brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0); + brw_reg = stride(brw_reg, reg->width * reg->stride, + reg->width, reg->stride); + } else { + /* From the Haswell PRM: + * + * VertStride must be used to cross GRF register boundaries. This + * rule implies that elements within a 'Width' cannot cross GRF + * boundaries. + * + * So, for registers with width > 8, we have to use a width of 8 + * and trust the compression state to sort out the exec size. + */ + brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0); + brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride); + } + + brw_reg = retype(brw_reg, reg->type); + brw_reg = byte_offset(brw_reg, reg->subreg_offset); + break; + case IMM: + switch (reg->type) { + case BRW_REGISTER_TYPE_F: + brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f); + break; + case BRW_REGISTER_TYPE_D: + brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d); + break; + case BRW_REGISTER_TYPE_UD: + brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud); + break; + case BRW_REGISTER_TYPE_W: + brw_reg = brw_imm_w(reg->fixed_hw_reg.dw1.d); + break; + case BRW_REGISTER_TYPE_UW: + brw_reg = brw_imm_uw(reg->fixed_hw_reg.dw1.ud); + break; + case BRW_REGISTER_TYPE_VF: + brw_reg = brw_imm_vf(reg->fixed_hw_reg.dw1.ud); + break; + default: + unreachable("not reached"); + } + break; + case HW_REG: + assert(reg->type == reg->fixed_hw_reg.type); + brw_reg = reg->fixed_hw_reg; + break; + case BAD_FILE: + /* Probably unused. */ + brw_reg = brw_null_reg(); + break; + default: + unreachable("not reached"); + } + if (reg->abs) + brw_reg = brw_abs(brw_reg); + if (reg->negate) + brw_reg = negate(brw_reg); + + return brw_reg; +} + fs_generator::fs_generator(struct brw_context *brw, void *mem_ctx, - const struct brw_wm_prog_key *key, - struct brw_wm_prog_data *prog_data, - struct gl_shader_program *prog, - struct gl_fragment_program *fp, + const void *key, + struct brw_stage_prog_data *prog_data, + struct gl_program *prog, + unsigned promoted_constants, bool runtime_check_aads_emit, - bool debug_flag) + const char *stage_abbrev) - : brw(brw), key(key), prog_data(prog_data), prog(prog), fp(fp), - runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(debug_flag), - mem_ctx(mem_ctx) + : brw(brw), key(key), + prog_data(prog_data), + prog(prog), promoted_constants(promoted_constants), + runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false), + stage_abbrev(stage_abbrev), mem_ctx(mem_ctx) { ctx = &brw->ctx; @@ -59,6 +146,18 @@ fs_generator::~fs_generator() { } +class ip_record : public exec_node { +public: + DECLARE_RALLOC_CXX_OPERATORS(ip_record) + + ip_record(int ip) + { + this->ip = ip; + } + + int ip; +}; + bool fs_generator::patch_discard_jumps_to_fb_writes() { @@ -98,26 +197,31 @@ fs_generator::patch_discard_jumps_to_fb_writes() void fs_generator::fire_fb_write(fs_inst *inst, - GLuint base_reg, + struct brw_reg payload, struct brw_reg implied_header, GLuint nr) { uint32_t msg_control; + brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; + if (brw->gen < 6) { brw_push_insn_state(p); brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_MOV(p, - brw_message_reg(base_reg + 1), - brw_vec8_grf(1, 0)); + brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0)); brw_pop_insn_state(p); } - if (prog_data->dual_src_blend) - msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; - else if (dispatch_width == 16) + if (inst->opcode == FS_OPCODE_REP_FB_WRITE) + msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED; + else if (prog_data->dual_src_blend) { + if (dispatch_width == 8 || !inst->eot) + msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; + else + msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23; + } else if (dispatch_width == 16) msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; else msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; @@ -125,25 +229,39 @@ fs_generator::fire_fb_write(fs_inst *inst, uint32_t surf_index = prog_data->binding_table.render_target_start + inst->target; + bool last_render_target = inst->eot || + (prog_data->dual_src_blend && dispatch_width == 16); + + brw_fb_WRITE(p, dispatch_width, - base_reg, + payload, implied_header, msg_control, surf_index, nr, 0, inst->eot, + last_render_target, inst->header_present); brw_mark_surface_used(&prog_data->base, surf_index); } void -fs_generator::generate_fb_write(fs_inst *inst) +fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload) { + brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; + const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key; struct brw_reg implied_header; + if (brw->gen < 8 && !brw->is_haswell) { + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + } + + if (inst->base_mrf >= 0) + payload = brw_message_reg(inst->base_mrf); + /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied * move, here's g1. */ @@ -157,7 +275,7 @@ fs_generator::generate_fb_write(fs_inst *inst) /* On HSW, the GPU will use the predicate on SENDC, unless the header is * present. */ - if ((fp && fp->UsesKill) || key->alpha_test_func) { + if (prog_data->uses_kill) { struct brw_reg pixel_mask; if (brw->gen >= 6) @@ -171,7 +289,7 @@ fs_generator::generate_fb_write(fs_inst *inst) if (brw->gen >= 6) { brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); brw_MOV(p, - retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD), + retype(payload, BRW_REGISTER_TYPE_UD), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); @@ -180,16 +298,15 @@ fs_generator::generate_fb_write(fs_inst *inst) * header. */ brw_OR(p, - vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)), + vec1(retype(payload, BRW_REGISTER_TYPE_UD)), vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)), brw_imm_ud(0x1 << 11)); } if (inst->target > 0) { /* Set the render target index for choosing BLEND_STATE. */ - brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, - inst->base_mrf, 2), - BRW_REGISTER_TYPE_UD), + brw_MOV(p, retype(vec1(suboffset(payload, 2)), + BRW_REGISTER_TYPE_UD), brw_imm_ud(inst->target)); } @@ -204,7 +321,7 @@ fs_generator::generate_fb_write(fs_inst *inst) } if (!runtime_check_aads_emit) { - fire_fb_write(inst, inst->base_mrf, implied_header, inst->mlen); + fire_fb_write(inst, payload, implied_header, inst->mlen); } else { /* This can only happen in gen < 6 */ assert(brw->gen < 6); @@ -223,25 +340,47 @@ fs_generator::generate_fb_write(fs_inst *inst) brw_inst_set_exec_size(brw, brw_last_inst, BRW_EXECUTE_1); { /* Don't send AA data */ - fire_fb_write(inst, inst->base_mrf+1, implied_header, inst->mlen-1); + fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1); } brw_land_fwd_jump(p, jmp); - fire_fb_write(inst, inst->base_mrf, implied_header, inst->mlen); + fire_fb_write(inst, payload, implied_header, inst->mlen); } } +void +fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload) +{ + brw_inst *insn; + + insn = brw_next_insn(p, BRW_OPCODE_SEND); + + brw_set_dest(p, insn, brw_null_reg()); + brw_set_src0(p, insn, payload); + brw_set_src1(p, insn, brw_imm_d(0)); + + brw_inst_set_sfid(brw, insn, BRW_SFID_URB); + brw_inst_set_urb_opcode(brw, insn, GEN8_URB_OPCODE_SIMD8_WRITE); + + brw_inst_set_mlen(brw, insn, inst->mlen); + brw_inst_set_rlen(brw, insn, 0); + brw_inst_set_eot(brw, insn, inst->eot); + brw_inst_set_header_present(brw, insn, true); + brw_inst_set_urb_global_offset(brw, insn, inst->offset); +} + void fs_generator::generate_blorp_fb_write(fs_inst *inst) { brw_fb_WRITE(p, 16 /* dispatch_width */, - inst->base_mrf, + brw_message_reg(inst->base_mrf), brw_reg_from_fs_reg(&inst->src[0]), BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, inst->target, inst->mlen, 0, true, + true, inst->header_present); } @@ -304,12 +443,13 @@ fs_generator::generate_math_gen6(fs_inst *inst, struct brw_reg src1) { int op = brw_math_function(inst->opcode); - bool binop = src1.file == BRW_GENERAL_REGISTER_FILE; + bool binop = src1.file != BRW_ARCHITECTURE_REGISTER_FILE; - brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - gen6_math(p, dst, op, src0, src1); - - if (dispatch_width == 16) { + if (dispatch_width == 8) { + gen6_math(p, dst, op, src0, src1); + } else if (dispatch_width == 16) { + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + gen6_math(p, firsthalf(dst), op, firsthalf(src0), firsthalf(src1)); brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); gen6_math(p, sechalf(dst), op, sechalf(src0), binop ? sechalf(src1) : brw_null_reg()); @@ -326,19 +466,21 @@ fs_generator::generate_math_gen4(fs_inst *inst, assert(inst->mlen >= 1); - brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - gen4_math(p, dst, - op, - inst->base_mrf, src, - BRW_MATH_DATA_VECTOR, - BRW_MATH_PRECISION_FULL); - - if (dispatch_width == 16) { + if (dispatch_width == 8) { + gen4_math(p, dst, + op, + inst->base_mrf, src, + BRW_MATH_PRECISION_FULL); + } else if (dispatch_width == 16) { + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + gen4_math(p, firsthalf(dst), + op, + inst->base_mrf, firsthalf(src), + BRW_MATH_PRECISION_FULL); brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); gen4_math(p, sechalf(dst), op, inst->base_mrf + 1, sechalf(src), - BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL); brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); @@ -364,7 +506,6 @@ fs_generator::generate_math_g45(fs_inst *inst, gen4_math(p, dst, op, inst->base_mrf, src, - BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL); } @@ -374,7 +515,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src { int msg_type = -1; int rlen = 4; - uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; + uint32_t simd_mode; uint32_t return_format; switch (dst.type) { @@ -389,9 +530,16 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src break; } - if (dispatch_width == 16 && - !inst->force_uncompressed && !inst->force_sechalf) + switch (inst->exec_size) { + case 8: + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; + break; + case 16: simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; + break; + default: + unreachable("Invalid width for texture instruction"); + } if (brw->gen >= 5) { switch (inst->opcode) { @@ -528,27 +676,17 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src dst = vec16(dst); } - if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) { - /* The send-from-GRF for SIMD16 texturing with a header has an extra - * hardware register allocated to it, which we need to skip over (since - * our coordinates in the payload are in the even-numbered registers, - * and the header comes right before the first one). - */ - assert(src.file == BRW_GENERAL_REGISTER_FILE); - src.nr++; - } + assert(brw->gen < 7 || !inst->header_present || + src.file == BRW_GENERAL_REGISTER_FILE); - assert(sampler_index.file == BRW_IMMEDIATE_VALUE); assert(sampler_index.type == BRW_REGISTER_TYPE_UD); - uint32_t sampler = sampler_index.dw1.ud; - /* Load the message header if present. If there's a texture offset, * we need to set it up explicitly and load the offset bitfield. * Otherwise, we can use an implied move from g0 to the first message reg. */ if (inst->header_present) { - if (brw->gen < 6 && !inst->texture_offset) { + if (brw->gen < 6 && !inst->offset) { /* Set up an implied move from g0 to the MRF. */ src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); } else { @@ -567,51 +705,92 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src /* Explicitly set up the message header by copying g0 to the MRF. */ brw_MOV(p, header_reg, brw_vec8_grf(0, 0)); - if (inst->texture_offset) { + if (inst->offset) { /* Set the offset bits in DWord 2. */ brw_MOV(p, get_element_ud(header_reg, 2), - brw_imm_ud(inst->texture_offset)); + brw_imm_ud(inst->offset)); } - if (sampler >= 16) { - /* The "Sampler Index" field can only store values between 0 and 15. - * However, we can add an offset to the "Sampler State Pointer" - * field, effectively selecting a different set of 16 samplers. - * - * The "Sampler State Pointer" needs to be aligned to a 32-byte - * offset, and each sampler state is only 16-bytes, so we can't - * exclusively use the offset - we have to use both. - */ - assert(brw->gen >= 8 || brw->is_haswell); - const int sampler_state_size = 16; /* 16 bytes */ - brw_ADD(p, - get_element_ud(header_reg, 3), - get_element_ud(brw_vec8_grf(0, 0), 3), - brw_imm_ud(16 * (sampler / 16) * sampler_state_size)); - } + brw_adjust_sampler_state_pointer(p, header_reg, sampler_index); brw_pop_insn_state(p); } } - uint32_t surface_index = ((inst->opcode == SHADER_OPCODE_TG4 || - inst->opcode == SHADER_OPCODE_TG4_OFFSET) - ? prog_data->base.binding_table.gather_texture_start - : prog_data->base.binding_table.texture_start) + sampler; - - brw_SAMPLE(p, - retype(dst, BRW_REGISTER_TYPE_UW), - inst->base_mrf, - src, - surface_index, - sampler % 16, - msg_type, - rlen, - inst->mlen, - inst->header_present, - simd_mode, - return_format); - - brw_mark_surface_used(&prog_data->base, surface_index); + uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 || + inst->opcode == SHADER_OPCODE_TG4_OFFSET) + ? prog_data->binding_table.gather_texture_start + : prog_data->binding_table.texture_start; + + if (sampler_index.file == BRW_IMMEDIATE_VALUE) { + uint32_t sampler = sampler_index.dw1.ud; + + brw_SAMPLE(p, + retype(dst, BRW_REGISTER_TYPE_UW), + inst->base_mrf, + src, + sampler + base_binding_table_index, + sampler % 16, + msg_type, + rlen, + inst->mlen, + inst->header_present, + simd_mode, + return_format); + + brw_mark_surface_used(prog_data, sampler + base_binding_table_index); + } else { + /* Non-const sampler index */ + /* Note: this clobbers `dst` as a temporary before emitting the send */ + + struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); + struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD)); + + struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD)); + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + /* Some care required: `sampler` and `temp` may alias: + * addr = sampler & 0xff + * temp = (sampler << 8) & 0xf00 + * addr = addr | temp + */ + brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index)); + brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u)); + brw_AND(p, temp, temp, brw_imm_ud(0x0f00)); + brw_AND(p, addr, addr, brw_imm_ud(0x0ff)); + brw_OR(p, addr, addr, temp); + + /* a0.0 |= */ + brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR); + brw_set_sampler_message(p, insn_or, + 0 /* surface */, + 0 /* sampler */, + msg_type, + rlen, + inst->mlen /* mlen */, + inst->header_present /* header */, + simd_mode, + return_format); + brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1); + brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD); + brw_set_src0(p, insn_or, addr); + brw_set_dest(p, insn_or, addr); + + + /* dst = send(offset, a0.0) */ + brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, insn_send, dst); + brw_set_src0(p, insn_send, src); + brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr); + + brw_pop_insn_state(p); + + /* visitor knows more than we do about the surface limit required, + * so has already done marking. + */ + } } @@ -644,34 +823,30 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src * appropriate swizzling. */ void -fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src, - struct brw_reg quality) +fs_generator::generate_ddx(enum opcode opcode, + struct brw_reg dst, struct brw_reg src) { unsigned vstride, width; - assert(quality.file == BRW_IMMEDIATE_VALUE); - assert(quality.type == BRW_REGISTER_TYPE_D); - - int quality_value = quality.dw1.d; - if (quality_value == BRW_DERIVATIVE_FINE || - (key->high_quality_derivatives && quality_value != BRW_DERIVATIVE_COARSE)) { + if (opcode == FS_OPCODE_DDX_FINE) { /* produce accurate derivatives */ vstride = BRW_VERTICAL_STRIDE_2; width = BRW_WIDTH_2; - } - else { + } else { /* replicate the derivative at the top-left pixel to other pixels */ vstride = BRW_VERTICAL_STRIDE_4; width = BRW_WIDTH_4; } struct brw_reg src0 = brw_reg(src.file, src.nr, 1, + src.negate, src.abs, BRW_REGISTER_TYPE_F, vstride, width, BRW_HORIZONTAL_STRIDE_0, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); struct brw_reg src1 = brw_reg(src.file, src.nr, 0, + src.negate, src.abs, BRW_REGISTER_TYPE_F, vstride, width, @@ -685,16 +860,11 @@ fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src * left. */ void -fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src, - struct brw_reg quality, bool negate_value) +fs_generator::generate_ddy(enum opcode opcode, + struct brw_reg dst, struct brw_reg src, + bool negate_value) { - assert(quality.file == BRW_IMMEDIATE_VALUE); - assert(quality.type == BRW_REGISTER_TYPE_D); - - int quality_value = quality.dw1.d; - - if (quality_value == BRW_DERIVATIVE_FINE || - (key->high_quality_derivatives && quality_value != BRW_DERIVATIVE_COARSE)) { + if (opcode == FS_OPCODE_DDY_FINE) { /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register * Region Restrictions): * @@ -721,12 +891,14 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src /* produce accurate derivatives */ struct brw_reg src0 = brw_reg(src.file, src.nr, 0, + src.negate, src.abs, BRW_REGISTER_TYPE_F, BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4, BRW_HORIZONTAL_STRIDE_1, BRW_SWIZZLE_XYXY, WRITEMASK_XYZW); struct brw_reg src1 = brw_reg(src.file, src.nr, 0, + src.negate, src.abs, BRW_REGISTER_TYPE_F, BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4, @@ -734,17 +906,18 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW); brw_push_insn_state(p); brw_set_default_access_mode(p, BRW_ALIGN_16); - if (unroll_to_simd8) - brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - if (negate_value) - brw_ADD(p, dst, src1, negate(src0)); - else - brw_ADD(p, dst, src0, negate(src1)); if (unroll_to_simd8) { - brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); - src0 = sechalf(src0); - src1 = sechalf(src1); - dst = sechalf(dst); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + if (negate_value) { + brw_ADD(p, firsthalf(dst), firsthalf(src1), negate(firsthalf(src0))); + brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); + brw_ADD(p, sechalf(dst), sechalf(src1), negate(sechalf(src0))); + } else { + brw_ADD(p, firsthalf(dst), firsthalf(src0), negate(firsthalf(src1))); + brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); + brw_ADD(p, sechalf(dst), sechalf(src0), negate(sechalf(src1))); + } + } else { if (negate_value) brw_ADD(p, dst, src1, negate(src0)); else @@ -754,12 +927,14 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src } else { /* replicate the derivative at the top-left pixel to other pixels */ struct brw_reg src0 = brw_reg(src.file, src.nr, 0, + src.negate, src.abs, BRW_REGISTER_TYPE_F, BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4, BRW_HORIZONTAL_STRIDE_0, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); struct brw_reg src1 = brw_reg(src.file, src.nr, 2, + src.negate, src.abs, BRW_REGISTER_TYPE_F, BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4, @@ -795,10 +970,10 @@ fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src) assert(inst->mlen != 0); brw_MOV(p, - retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), + brw_uvec_mrf(inst->exec_size, (inst->base_mrf + 1), 0), retype(src, BRW_REGISTER_TYPE_UD)); brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), - dispatch_width / 8, inst->offset); + inst->exec_size / 8, inst->offset); } void @@ -807,13 +982,13 @@ fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst) assert(inst->mlen != 0); brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), - dispatch_width / 8, inst->offset); + inst->exec_size / 8, inst->offset); } void fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst) { - gen7_block_read_scratch(p, dst, dispatch_width / 8, inst->offset); + gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset); } void @@ -835,7 +1010,7 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), read_offset, surf_index); - brw_mark_surface_used(&prog_data->base, surf_index); + brw_mark_surface_used(prog_data, surf_index); } void @@ -845,39 +1020,108 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, struct brw_reg offset) { assert(inst->mlen == 0); - - assert(index.file == BRW_IMMEDIATE_VALUE && - index.type == BRW_REGISTER_TYPE_UD); - uint32_t surf_index = index.dw1.ud; + assert(index.type == BRW_REGISTER_TYPE_UD); assert(offset.file == BRW_GENERAL_REGISTER_FILE); /* Reference just the dword we need, to avoid angering validate_reg(). */ offset = brw_vec1_grf(offset.nr, 0); - brw_push_insn_state(p); - brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); - brw_pop_insn_state(p); - /* We use the SIMD4x2 mode because we want to end up with 4 components in * the destination loaded consecutively from the same offset (which appears * in the first component, and the rest are ignored). */ dst.width = BRW_WIDTH_4; - brw_set_dest(p, send, dst); - brw_set_src0(p, send, offset); - brw_set_sampler_message(p, send, - surf_index, - 0, /* LD message ignores sampler unit */ - GEN5_SAMPLER_MESSAGE_SAMPLE_LD, - 1, /* rlen */ - 1, /* mlen */ - false, /* no header */ - BRW_SAMPLER_SIMD_MODE_SIMD4X2, - 0); - brw_mark_surface_used(&prog_data->base, surf_index); + struct brw_reg src = offset; + bool header_present = false; + int mlen = 1; + + if (brw->gen >= 9) { + /* Skylake requires a message header in order to use SIMD4x2 mode. */ + src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD); + mlen = 2; + header_present = true; + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, src, retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD)); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + brw_MOV(p, get_element_ud(src, 2), + brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2)); + brw_pop_insn_state(p); + } + + if (index.file == BRW_IMMEDIATE_VALUE) { + + uint32_t surf_index = index.dw1.ud; + + brw_push_insn_state(p); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_pop_insn_state(p); + + brw_set_dest(p, send, dst); + brw_set_src0(p, send, src); + brw_set_sampler_message(p, send, + surf_index, + 0, /* LD message ignores sampler unit */ + GEN5_SAMPLER_MESSAGE_SAMPLE_LD, + 1, /* rlen */ + mlen, + header_present, + BRW_SAMPLER_SIMD_MODE_SIMD4X2, + 0); + + brw_mark_surface_used(prog_data, surf_index); + + } else { + + struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + /* a0.0 = surf_index & 0xff */ + brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); + brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1); + brw_set_dest(p, insn_and, addr); + brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD))); + brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); + + + /* a0.0 |= */ + brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR); + brw_set_sampler_message(p, insn_or, + 0 /* surface */, + 0 /* sampler */, + GEN5_SAMPLER_MESSAGE_SAMPLE_LD, + 1 /* rlen */, + mlen, + header_present, + BRW_SAMPLER_SIMD_MODE_SIMD4X2, + 0); + brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1); + brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD); + brw_set_src0(p, insn_or, addr); + brw_set_dest(p, insn_or, addr); + + + /* dst = send(offset, a0.0) */ + brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, insn_send, dst); + brw_set_src0(p, insn_send, src); + brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr); + + brw_pop_insn_state(p); + + /* visitor knows more than we do about the surface limit required, + * so has already done marking. + */ + + } } void @@ -944,7 +1188,7 @@ fs_generator::generate_varying_pull_constant_load(fs_inst *inst, simd_mode, return_format); - brw_mark_surface_used(&prog_data->base, surf_index); + brw_mark_surface_used(prog_data, surf_index); } void @@ -959,10 +1203,7 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst, */ assert(!inst->header_present); assert(!inst->mlen); - - assert(index.file == BRW_IMMEDIATE_VALUE && - index.type == BRW_REGISTER_TYPE_UD); - uint32_t surf_index = index.dw1.ud; + assert(index.type == BRW_REGISTER_TYPE_UD); uint32_t simd_mode, rlen, mlen; if (dispatch_width == 16) { @@ -975,20 +1216,70 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst, simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; } - brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); - brw_set_dest(p, send, dst); - brw_set_src0(p, send, offset); - brw_set_sampler_message(p, send, - surf_index, - 0, /* LD message ignores sampler unit */ - GEN5_SAMPLER_MESSAGE_SAMPLE_LD, - rlen, - mlen, - false, /* no header */ - simd_mode, - 0); + if (index.file == BRW_IMMEDIATE_VALUE) { - brw_mark_surface_used(&prog_data->base, surf_index); + uint32_t surf_index = index.dw1.ud; + + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW)); + brw_set_src0(p, send, offset); + brw_set_sampler_message(p, send, + surf_index, + 0, /* LD message ignores sampler unit */ + GEN5_SAMPLER_MESSAGE_SAMPLE_LD, + rlen, + mlen, + false, /* no header */ + simd_mode, + 0); + + brw_mark_surface_used(prog_data, surf_index); + + } else { + + struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); + + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + /* a0.0 = surf_index & 0xff */ + brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); + brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1); + brw_set_dest(p, insn_and, addr); + brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD))); + brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); + + + /* a0.0 |= */ + brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR); + brw_set_sampler_message(p, insn_or, + 0 /* surface */, + 0 /* sampler */, + GEN5_SAMPLER_MESSAGE_SAMPLE_LD, + rlen /* rlen */, + mlen /* mlen */, + false /* header */, + simd_mode, + 0); + brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1); + brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD); + brw_set_src0(p, insn_or, addr); + brw_set_dest(p, insn_or, addr); + + + /* dst = send(offset, a0.0) */ + brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, insn_send, retype(dst, BRW_REGISTER_TYPE_UW)); + brw_set_src0(p, insn_send, offset); + brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr); + + brw_pop_insn_state(p); + + /* visitor knows more than we do about the surface limit required, + * so has already done marking. + */ + } } /** @@ -1035,74 +1326,6 @@ fs_generator::generate_pixel_interpolator_query(fs_inst *inst, } -static uint32_t brw_file_from_reg(fs_reg *reg) -{ - switch (reg->file) { - case GRF: - return BRW_GENERAL_REGISTER_FILE; - case MRF: - return BRW_MESSAGE_REGISTER_FILE; - case IMM: - return BRW_IMMEDIATE_VALUE; - default: - unreachable("not reached"); - } -} - -struct brw_reg -brw_reg_from_fs_reg(fs_reg *reg) -{ - struct brw_reg brw_reg; - - switch (reg->file) { - case GRF: - case MRF: - if (reg->stride == 0) { - brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0); - } else { - brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0); - brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride); - } - - brw_reg = retype(brw_reg, reg->type); - brw_reg = byte_offset(brw_reg, reg->subreg_offset); - break; - case IMM: - switch (reg->type) { - case BRW_REGISTER_TYPE_F: - brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f); - break; - case BRW_REGISTER_TYPE_D: - brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d); - break; - case BRW_REGISTER_TYPE_UD: - brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud); - break; - default: - unreachable("not reached"); - } - break; - case HW_REG: - assert(reg->type == reg->fixed_hw_reg.type); - brw_reg = reg->fixed_hw_reg; - break; - case BAD_FILE: - /* Probably unused. */ - brw_reg = brw_null_reg(); - break; - case UNIFORM: - unreachable("not reached"); - default: - unreachable("not reached"); - } - if (reg->abs) - brw_reg = brw_abs(brw_reg); - if (reg->negate) - brw_reg = negate(brw_reg); - - return brw_reg; -} - /** * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant * sampler LD messages. @@ -1138,10 +1361,7 @@ fs_generator::generate_set_omask(fs_inst *inst, mask.width == BRW_WIDTH_8 && mask.hstride == BRW_HORIZONTAL_STRIDE_1); - bool stride_0_1_0 = - (mask.vstride == BRW_VERTICAL_STRIDE_0 && - mask.width == BRW_WIDTH_1 && - mask.hstride == BRW_HORIZONTAL_STRIDE_0); + bool stride_0_1_0 = has_scalar_region(mask); assert(stride_8_8_1 || stride_0_1_0); assert(dst.type == BRW_REGISTER_TYPE_UW); @@ -1178,33 +1398,15 @@ fs_generator::generate_set_sample_id(fs_inst *inst, brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); brw_set_default_mask_control(p, BRW_MASK_DISABLE); struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW); - brw_ADD(p, dst, src0, reg); - if (dispatch_width == 16) - brw_ADD(p, offset(dst, 1), offset(src0, 1), suboffset(reg, 2)); + if (dispatch_width == 8) { + brw_ADD(p, dst, src0, reg); + } else if (dispatch_width == 16) { + brw_ADD(p, firsthalf(dst), firsthalf(src0), reg); + brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2)); + } brw_pop_insn_state(p); } -/** - * Change the register's data type from UD to W, doubling the strides in order - * to compensate for halving the data type width. - */ -static struct brw_reg -ud_reg_to_w(struct brw_reg r) -{ - assert(r.type == BRW_REGISTER_TYPE_UD); - r.type = BRW_REGISTER_TYPE_W; - - /* The BRW_*_STRIDE enums are defined so that incrementing the field - * doubles the real stride. - */ - if (r.hstride != 0) - ++r.hstride; - if (r.vstride != 0) - ++r.vstride; - - return r; -} - void fs_generator::generate_pack_half_2x16_split(fs_inst *inst, struct brw_reg dst, @@ -1225,9 +1427,9 @@ fs_generator::generate_pack_half_2x16_split(fs_inst *inst, * (HorzStride) of 2. The 16-bit result is stored in the lower word of * each destination channel and the upper word is not modified. */ - struct brw_reg dst_w = ud_reg_to_w(dst); + struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2); - /* Give each 32-bit channel of dst the form below , where "." means + /* Give each 32-bit channel of dst the form below, where "." means * unchanged. * 0x....hhhh */ @@ -1259,7 +1461,7 @@ fs_generator::generate_unpack_half_2x16_split(fs_inst *inst, * the source data type must be Word (W). The destination type must be * F (Float). */ - struct brw_reg src_w = ud_reg_to_w(src); + struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2); /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll. * For the Y case, we wish to access only the upper word; therefore @@ -1305,15 +1507,16 @@ fs_generator::generate_shader_time_add(fs_inst *inst, brw_MOV(p, payload_offset, offset); brw_MOV(p, payload_value, value); brw_shader_time_add(p, payload, - prog_data->base.binding_table.shader_time_start); + prog_data->binding_table.shader_time_start); brw_pop_insn_state(p); - brw_mark_surface_used(&prog_data->base, - prog_data->base.binding_table.shader_time_start); + brw_mark_surface_used(prog_data, + prog_data->binding_table.shader_time_start); } void fs_generator::generate_untyped_atomic(fs_inst *inst, struct brw_reg dst, + struct brw_reg payload, struct brw_reg atomic_op, struct brw_reg surf_index) { @@ -1322,42 +1525,56 @@ fs_generator::generate_untyped_atomic(fs_inst *inst, struct brw_reg dst, surf_index.file == BRW_IMMEDIATE_VALUE && surf_index.type == BRW_REGISTER_TYPE_UD); - brw_untyped_atomic(p, dst, brw_message_reg(inst->base_mrf), - atomic_op.dw1.ud, surf_index.dw1.ud, - inst->mlen, dispatch_width / 8); + brw_untyped_atomic(p, dst, payload, atomic_op.dw1.ud, surf_index.dw1.ud, + inst->mlen, inst->exec_size / 8); - brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud); + brw_mark_surface_used(prog_data, surf_index.dw1.ud); } void fs_generator::generate_untyped_surface_read(fs_inst *inst, struct brw_reg dst, + struct brw_reg payload, struct brw_reg surf_index) { assert(surf_index.file == BRW_IMMEDIATE_VALUE && surf_index.type == BRW_REGISTER_TYPE_UD); - brw_untyped_surface_read(p, dst, brw_message_reg(inst->base_mrf), + brw_untyped_surface_read(p, dst, payload, surf_index.dw1.ud, - inst->mlen, dispatch_width / 8); + inst->mlen, inst->exec_size / 8); - brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud); + brw_mark_surface_used(prog_data, surf_index.dw1.ud); } void -fs_generator::generate_code(exec_list *instructions) +fs_generator::enable_debug(const char *shader_name) +{ + debug_flag = true; + this->shader_name = shader_name; +} + +int +fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) { + /* align to 64 byte boundary. */ + while (p->next_insn_offset % 64) + brw_NOP(p); + + this->dispatch_width = dispatch_width; + if (dispatch_width == 16) + brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); + int start_offset = p->next_insn_offset; + int spill_count = 0, fill_count = 0; + int loop_count = 0; struct annotation_info annotation; memset(&annotation, 0, sizeof(annotation)); - cfg_t *cfg = NULL; - if (unlikely(debug_flag)) - cfg = new(mem_ctx) cfg_t(instructions); - - foreach_in_list(fs_inst, inst, instructions) { + foreach_block_and_inst (block, fs_inst, inst, cfg) { struct brw_reg src[3], dst; unsigned int last_insn_offset = p->next_insn_offset; + bool multiple_instructions_emitted = false; if (unlikely(debug_flag)) annotate(brw, &annotation, cfg, inst, p->next_insn_offset); @@ -1384,12 +1601,25 @@ fs_generator::generate_code(exec_list *instructions) brw_set_default_mask_control(p, inst->force_writemask_all); brw_set_default_acc_write_control(p, inst->writes_accumulator); - if (inst->force_uncompressed || dispatch_width == 8) { - brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - } else if (inst->force_sechalf) { - brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); - } else { - brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); + switch (inst->exec_size) { + case 1: + case 2: + case 4: + assert(inst->force_writemask_all); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + break; + case 8: + if (inst->force_sechalf) { + brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); + } else { + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + } + break; + case 16: + brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); + break; + default: + unreachable(!"Invalid instruction width"); } switch (inst->opcode) { @@ -1409,15 +1639,25 @@ fs_generator::generate_code(exec_list *instructions) brw_MACH(p, dst, src[0], src[1]); break; + case BRW_OPCODE_LINE: + brw_LINE(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_MAD: assert(brw->gen >= 6); brw_set_default_access_mode(p, BRW_ALIGN_16); if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) { brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_MAD(p, dst, src[0], src[1], src[2]); + brw_inst *f = brw_MAD(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2])); brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); - brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2])); + brw_inst *s = brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2])); brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); + + if (inst->conditional_mod) { + brw_inst_set_cond_modifier(brw, f, inst->conditional_mod); + brw_inst_set_cond_modifier(brw, s, inst->conditional_mod); + multiple_instructions_emitted = true; + } } else { brw_MAD(p, dst, src[0], src[1], src[2]); } @@ -1429,10 +1669,16 @@ fs_generator::generate_code(exec_list *instructions) brw_set_default_access_mode(p, BRW_ALIGN_16); if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) { brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_LRP(p, dst, src[0], src[1], src[2]); + brw_inst *f = brw_LRP(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2])); brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); - brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2])); + brw_inst *s = brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2])); brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); + + if (inst->conditional_mod) { + brw_inst_set_cond_modifier(brw, f, inst->conditional_mod); + brw_inst_set_cond_modifier(brw, s, inst->conditional_mod); + multiple_instructions_emitted = true; + } } else { brw_LRP(p, dst, src[0], src[1], src[2]); } @@ -1482,7 +1728,42 @@ fs_generator::generate_code(exec_list *instructions) brw_F16TO32(p, dst, src[0]); break; case BRW_OPCODE_CMP: - brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); + /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says + * that when the destination is a GRF that the dependency-clear bit on + * the flag register is cleared early. + * + * Suggested workarounds are to disable coissuing CMP instructions + * or to split CMP(16) instructions into two CMP(8) instructions. + * + * We choose to split into CMP(8) instructions since disabling + * coissuing would affect CMP instructions not otherwise affected by + * the errata. + */ + if (dispatch_width == 16 && brw->gen == 7 && !brw->is_haswell) { + if (dst.file == BRW_GENERAL_REGISTER_FILE) { + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + brw_CMP(p, firsthalf(dst), inst->conditional_mod, + firsthalf(src[0]), firsthalf(src[1])); + brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); + brw_CMP(p, sechalf(dst), inst->conditional_mod, + sechalf(src[0]), sechalf(src[1])); + brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); + + multiple_instructions_emitted = true; + } else if (dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { + /* For unknown reasons, the aforementioned workaround is not + * sufficient. Overriding the type when the destination is the + * null register is necessary but not sufficient by itself. + */ + assert(dst.nr == BRW_ARF_NULL); + dst.type = BRW_REGISTER_TYPE_D; + brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); + } else { + unreachable("not reached"); + } + } else { + brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); + } break; case BRW_OPCODE_SEL: brw_SEL(p, dst, src[0], src[1]); @@ -1525,7 +1806,7 @@ fs_generator::generate_code(exec_list *instructions) brw_set_default_access_mode(p, BRW_ALIGN_16); if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) { brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_BFE(p, dst, src[0], src[1], src[2]); + brw_BFE(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2])); brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2])); brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); @@ -1544,7 +1825,7 @@ fs_generator::generate_code(exec_list *instructions) */ if (dispatch_width == 16 && brw->is_haswell) { brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_BFI1(p, dst, src[0], src[1]); + brw_BFI1(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1])); brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); brw_BFI1(p, sechalf(dst), sechalf(src[0]), sechalf(src[1])); brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); @@ -1563,9 +1844,9 @@ fs_generator::generate_code(exec_list *instructions) * Otherwise we would be able to emit compressed instructions like we * do for the other three-source instructions. */ - if (dispatch_width == 16) { + if (dispatch_width == 16 && brw->gen < 8) { brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_BFI2(p, dst, src[0], src[1], src[2]); + brw_BFI2(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2])); brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2])); brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); @@ -1607,6 +1888,7 @@ fs_generator::generate_code(exec_list *instructions) case BRW_OPCODE_WHILE: brw_WHILE(p); + loop_count++; break; case SHADER_OPCODE_RCP: @@ -1617,6 +1899,7 @@ fs_generator::generate_code(exec_list *instructions) case SHADER_OPCODE_SIN: case SHADER_OPCODE_COS: assert(brw->gen < 6 || inst->mlen == 0); + assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); if (brw->gen >= 7) { gen6_math(p, dst, brw_math_function(inst->opcode), src[0], brw_null_reg()); @@ -1632,6 +1915,7 @@ fs_generator::generate_code(exec_list *instructions) case SHADER_OPCODE_INT_REMAINDER: case SHADER_OPCODE_POW: assert(brw->gen < 6 || inst->mlen == 0); + assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); if (brw->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) { gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); } else if (brw->gen >= 6) { @@ -1666,27 +1950,33 @@ fs_generator::generate_code(exec_list *instructions) case SHADER_OPCODE_TG4_OFFSET: generate_tex(inst, dst, src[0], src[1]); break; - case FS_OPCODE_DDX: - generate_ddx(inst, dst, src[0], src[1]); - break; - case FS_OPCODE_DDY: - /* Make sure fp->UsesDFdy flag got set (otherwise there's no - * guarantee that key->render_to_fbo is set). - */ - assert(fp->UsesDFdy); - generate_ddy(inst, dst, src[0], src[1], key->render_to_fbo); + case FS_OPCODE_DDX_COARSE: + case FS_OPCODE_DDX_FINE: + generate_ddx(inst->opcode, dst, src[0]); + break; + case FS_OPCODE_DDY_COARSE: + case FS_OPCODE_DDY_FINE: + assert(src[1].file == BRW_IMMEDIATE_VALUE); + generate_ddy(inst->opcode, dst, src[0], src[1].dw1.ud); break; case SHADER_OPCODE_GEN4_SCRATCH_WRITE: generate_scratch_write(inst, src[0]); + spill_count++; break; case SHADER_OPCODE_GEN4_SCRATCH_READ: generate_scratch_read(inst, dst); + fill_count++; break; case SHADER_OPCODE_GEN7_SCRATCH_READ: generate_scratch_read_gen7(inst, dst); + fill_count++; + break; + + case SHADER_OPCODE_URB_WRITE_SIMD8: + generate_urb_write(inst, src[0]); break; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: @@ -1705,8 +1995,9 @@ fs_generator::generate_code(exec_list *instructions) generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]); break; + case FS_OPCODE_REP_FB_WRITE: case FS_OPCODE_FB_WRITE: - generate_fb_write(inst); + generate_fb_write(inst, src[0]); break; case FS_OPCODE_BLORP_FB_WRITE: @@ -1726,11 +2017,11 @@ fs_generator::generate_code(exec_list *instructions) break; case SHADER_OPCODE_UNTYPED_ATOMIC: - generate_untyped_atomic(inst, dst, src[0], src[1]); + generate_untyped_atomic(inst, dst, src[0], src[1], src[2]); break; case SHADER_OPCODE_UNTYPED_SURFACE_READ: - generate_untyped_surface_read(inst, dst, src[0]); + generate_untyped_surface_read(inst, dst, src[0], src[1]); break; case FS_OPCODE_SET_SIMD4X2_OFFSET: @@ -1787,10 +2078,11 @@ fs_generator::generate_code(exec_list *instructions) default: if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) { - _mesa_problem(ctx, "Unsupported opcode `%s' in FS", - opcode_descs[inst->opcode].name); + _mesa_problem(ctx, "Unsupported opcode `%s' in %s", + opcode_descs[inst->opcode].name, stage_abbrev); } else { - _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode); + _mesa_problem(ctx, "Unsupported opcode %d in %s", inst->opcode, + stage_abbrev); } abort(); @@ -1798,6 +2090,9 @@ fs_generator::generate_code(exec_list *instructions) unreachable("Should be lowered by lower_load_payload()"); } + if (multiple_instructions_emitted) + continue; + if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { assert(p->next_insn_offset == last_insn_offset + 16 || !"conditional_mod, no_dd_check, or no_dd_clear set for IR " @@ -1805,7 +2100,8 @@ fs_generator::generate_code(exec_list *instructions) brw_inst *last = &p->store[last_insn_offset / 16]; - brw_inst_set_cond_modifier(brw, last, inst->conditional_mod); + if (inst->conditional_mod) + brw_inst_set_cond_modifier(brw, last, inst->conditional_mod); brw_inst_set_no_dd_clear(brw, last, inst->no_dd_clear); brw_inst_set_no_dd_check(brw, last, inst->no_dd_check); } @@ -1820,57 +2116,32 @@ fs_generator::generate_code(exec_list *instructions) int after_size = p->next_insn_offset - start_offset; if (unlikely(debug_flag)) { - if (prog) { - fprintf(stderr, - "Native code for %s fragment shader %d (SIMD%d dispatch):\n", - prog->Label ? prog->Label : "unnamed", - prog->Name, dispatch_width); - } else if (fp) { - fprintf(stderr, - "Native code for fragment program %d (SIMD%d dispatch):\n", - fp->Base.Id, dispatch_width); - } else { - fprintf(stderr, "Native code for blorp program (SIMD%d dispatch):\n", - dispatch_width); - } - fprintf(stderr, "SIMD%d shader: %d instructions. Compacted %d to %d" - " bytes (%.0f%%)\n", - dispatch_width, before_size / 16, before_size, after_size, + fprintf(stderr, "Native code for %s\n" + "SIMD%d shader: %d instructions. %d loops. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d" + " bytes (%.0f%%)\n", + shader_name, dispatch_width, before_size / 16, loop_count, + spill_count, fill_count, promoted_constants, before_size, after_size, 100.0f * (before_size - after_size) / before_size); - const struct gl_program *prog = fp ? &fp->Base : NULL; - dump_assembly(p->store, annotation.ann_count, annotation.ann, brw, prog); ralloc_free(annotation.ann); } + + static GLuint msg_id = 0; + _mesa_gl_debug(&brw->ctx, &msg_id, + MESA_DEBUG_SOURCE_SHADER_COMPILER, + MESA_DEBUG_TYPE_OTHER, + MESA_DEBUG_SEVERITY_NOTIFICATION, + "%s SIMD%d shader: %d inst, %d loops, %d:%d spills:fills, " + "Promoted %u constants, compacted %d to %d bytes.\n", + stage_abbrev, dispatch_width, before_size / 16, loop_count, + spill_count, fill_count, promoted_constants, before_size, after_size); + + return start_offset; } const unsigned * -fs_generator::generate_assembly(exec_list *simd8_instructions, - exec_list *simd16_instructions, - unsigned *assembly_size) +fs_generator::get_assembly(unsigned int *assembly_size) { - assert(simd8_instructions || simd16_instructions); - - if (simd8_instructions) { - dispatch_width = 8; - generate_code(simd8_instructions); - } - - if (simd16_instructions) { - /* align to 64 byte boundary. */ - while (p->next_insn_offset % 64) { - brw_NOP(p); - } - - /* Save off the start of this SIMD16 program */ - prog_data->prog_offset_16 = p->next_insn_offset; - - brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); - - dispatch_width = 16; - generate_code(simd16_instructions); - } - return brw_get_program(p, assembly_size); }