X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_fs_generator.cpp;h=74c6cd3047411a7f9a113622e1acb0063e16f954;hb=90b6745bc80cf6dabb8f736dbf12d47c2a6602f5;hp=60944a97d4b4d2c04689f728025f1241521b0e8e;hpb=e7c9adca5726a8c96de20ae7c5f21a30061db392;p=mesa.git diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 60944a97d4b..74c6cd30474 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -30,6 +30,7 @@ #include "brw_eu.h" #include "brw_fs.h" #include "brw_cfg.h" +#include "util/mesa-sha1.h" static enum brw_reg_file brw_file_from_reg(fs_reg *reg) @@ -84,15 +85,24 @@ brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst, const unsigned phys_width = compressed ? inst->exec_size / 2 : inst->exec_size; + const unsigned max_hw_width = 16; + /* XXX - The equation above is strictly speaking not correct on * hardware that supports unbalanced GRF writes -- On Gen9+ * each decompressed chunk of the instruction may have a * different execution size when the number of components * written to each destination GRF is not the same. */ - const unsigned width = MIN2(reg_width, phys_width); - brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); - brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); + if (reg->stride > 4) { + assert(reg != &inst->dst); + assert(reg->stride * type_sz(reg->type) <= REG_SIZE); + brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, reg->stride, 1, 0); + } else { + const unsigned width = MIN3(reg_width, phys_width, max_hw_width); + brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); + } if (devinfo->gen == 7 && !devinfo->is_haswell) { /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13): @@ -175,16 +185,13 @@ brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst, fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, - const void *key, struct brw_stage_prog_data *prog_data, - unsigned promoted_constants, bool runtime_check_aads_emit, gl_shader_stage stage) : compiler(compiler), log_data(log_data), - devinfo(compiler->devinfo), key(key), + devinfo(compiler->devinfo), prog_data(prog_data), - promoted_constants(promoted_constants), runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false), stage(stage), mem_ctx(mem_ctx) { @@ -217,25 +224,27 @@ public: bool fs_generator::patch_discard_jumps_to_fb_writes() { - if (devinfo->gen < 6 || this->discard_halt_patches.is_empty()) + if (this->discard_halt_patches.is_empty()) return false; int scale = brw_jump_scale(p->devinfo); - /* There is a somewhat strange undocumented requirement of using - * HALT, according to the simulator. If some channel has HALTed to - * a particular UIP, then by the end of the program, every channel - * must have HALTed to that UIP. Furthermore, the tracking is a - * stack, so you can't do the final halt of a UIP after starting - * halting to a new UIP. - * - * Symptoms of not emitting this instruction on actual hardware - * included GPU hangs and sparkly rendering on the piglit discard - * tests. 
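The register-regioning arithmetic in the first hunk above can be sanity-checked in isolation. A minimal standalone sketch of that math, assuming the <vstride; width, hstride> region notation; region_width() is a made-up helper, not a mesa function:

    #include <stdio.h>

    /* Mirrors the MIN3(reg_width, phys_width, max_hw_width) clamp and the
     * stride(brw_reg, width * stride, width, stride) call above: a region
     * <v; w, h> reads w channels h elements apart, then hops v elements to
     * the next row.
     */
    static unsigned region_width(unsigned reg_width, unsigned phys_width)
    {
       const unsigned max_hw_width = 16;   /* hardware cap from the hunk */
       unsigned w = reg_width < phys_width ? reg_width : phys_width;
       return w < max_hw_width ? w : max_hw_width;
    }

    int main(void)
    {
       /* compressed SIMD16 destination with stride 2 */
       const unsigned stride = 2;
       const unsigned width = region_width(16, 8);
       printf("<%u; %u, %u>\n", width * stride, width, stride); /* <16; 8, 2> */
       return 0;
    }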
- */ - brw_inst *last_halt = gen6_HALT(p); - brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); - brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); + if (devinfo->gen >= 6) { + /* There is a somewhat strange undocumented requirement of using + * HALT, according to the simulator. If some channel has HALTed to + * a particular UIP, then by the end of the program, every channel + * must have HALTed to that UIP. Furthermore, the tracking is a + * stack, so you can't do the final halt of a UIP after starting + * halting to a new UIP. + * + * Symptoms of not emitting this instruction on actual hardware + * included GPU hangs and sparkly rendering on the piglit discard + * tests. + */ + brw_inst *last_halt = brw_HALT(p); + brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); + brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); + } int ip = p->nr_insn; @@ -243,22 +252,111 @@ fs_generator::patch_discard_jumps_to_fb_writes() brw_inst *patch = &p->store[patch_ip->ip]; assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT); - /* HALT takes a half-instruction distance from the pre-incremented IP. */ - brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); + if (devinfo->gen >= 6) { + /* HALT takes a half-instruction distance from the pre-incremented IP. */ + brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); + } else { + brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale)); + } } this->discard_halt_patches.make_empty(); + + if (devinfo->gen < 6) { + /* From the g965 PRM: + * + * "As DMask is not automatically reloaded into AMask upon completion + * of this instruction, software has to manually restore AMask upon + * completion." + * + * DMask lives in the bottom 16 bits of sr0.1. + */ + brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK), + retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW)); + brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1); + brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE); + brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE); + brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH); + } + + if (devinfo->gen == 4 && !devinfo->is_g4x) { + /* From the g965 PRM: + * + * "[DevBW, DevCL] Erratum: The subfields in mask stack register are + * reset to zero during graphics reset, however, they are not + * initialized at thread dispatch. These subfields will retain the + * values from the previous thread. Software should make sure the + * mask stack is empty (reset to zero) before terminating the thread. + * In case that this is not practical, software may have to reset the + * mask stack at the beginning of each kernel, which will impact the + * performance." + * + * Luckily we can rely on: + * + * "[DevBW, DevCL] This register access restriction is not + * applicable, hardware does ensure execution pipeline coherency, + * when a mask stack register is used as an explicit source and/or + * destination." + */ + brw_push_insn_state(p); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); + + brw_set_default_exec_size(p, BRW_EXECUTE_2); + brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0)); + + brw_set_default_exec_size(p, BRW_EXECUTE_16); + /* Reset the if stack. 
*/ + brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW), + brw_imm_uw(0)); + + brw_pop_insn_state(p); + } + return true; } +void +fs_generator::generate_send(fs_inst *inst, + struct brw_reg dst, + struct brw_reg desc, + struct brw_reg ex_desc, + struct brw_reg payload, + struct brw_reg payload2) +{ + const bool dst_is_null = dst.file == BRW_ARCHITECTURE_REGISTER_FILE && + dst.nr == BRW_ARF_NULL; + const unsigned rlen = dst_is_null ? 0 : inst->size_written / REG_SIZE; + + uint32_t desc_imm = inst->desc | + brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size); + + uint32_t ex_desc_imm = brw_message_ex_desc(devinfo, inst->ex_mlen); + + if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) { + /* If we have any sort of extended descriptor, then we need SENDS. This + * also covers the dual-payload case because ex_mlen goes in ex_desc. + */ + brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2, + desc, desc_imm, ex_desc, ex_desc_imm, + inst->eot); + if (inst->check_tdr) + brw_inst_set_opcode(p->devinfo, brw_last_inst, + devinfo->gen >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC); + } else { + brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm, + inst->eot); + if (inst->check_tdr) + brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC); + } +} + void fs_generator::fire_fb_write(fs_inst *inst, struct brw_reg payload, struct brw_reg implied_header, GLuint nr) { - uint32_t msg_control; - struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); if (devinfo->gen < 6) { @@ -267,21 +365,12 @@ fs_generator::fire_fb_write(fs_inst *inst, brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0)); + brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1), + offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1)); brw_pop_insn_state(p); } - if (inst->opcode == FS_OPCODE_REP_FB_WRITE) - msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED; - else if (prog_data->dual_src_blend) { - if (!inst->group) - msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; - else - msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23; - } else if (inst->exec_size == 16) - msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; - else - msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; + uint32_t msg_control = brw_fb_write_msg_control(inst, prog_data); /* We assume render targets start at 0, because headerless FB write * messages set "Render Target Index" to 0. 
Using a different binding @@ -289,107 +378,35 @@ fs_generator::fire_fb_write(fs_inst *inst, */ const uint32_t surf_index = inst->target; - bool last_render_target = inst->eot || - (prog_data->dual_src_blend && dispatch_width == 16); - + brw_inst *insn = brw_fb_WRITE(p, + payload, + retype(implied_header, BRW_REGISTER_TYPE_UW), + msg_control, + surf_index, + nr, + 0, + inst->eot, + inst->last_rt, + inst->header_size != 0); - brw_fb_WRITE(p, - payload, - implied_header, - msg_control, - surf_index, - nr, - 0, - inst->eot, - last_render_target, - inst->header_size != 0); - - brw_mark_surface_used(&prog_data->base, surf_index); + if (devinfo->gen >= 6) + brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16); } void fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload) { - struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); - const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key; - struct brw_reg implied_header; - if (devinfo->gen < 8 && !devinfo->is_haswell) { brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_default_flag_reg(p, 0, 0); } + const struct brw_reg implied_header = + devinfo->gen < 6 ? payload : brw_null_reg(); + if (inst->base_mrf >= 0) payload = brw_message_reg(inst->base_mrf); - /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied - * move, here's g1. - */ - if (inst->header_size != 0) { - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_set_default_exec_size(p, BRW_EXECUTE_1); - brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); - brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_set_default_flag_reg(p, 0, 0); - - /* On HSW, the GPU will use the predicate on SENDC, unless the header is - * present. - */ - if (prog_data->uses_kill) { - struct brw_reg pixel_mask; - - if (devinfo->gen >= 6) - pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); - else - pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); - - brw_MOV(p, pixel_mask, brw_flag_reg(0, 1)); - } - - if (devinfo->gen >= 6) { - brw_push_insn_state(p); - brw_set_default_exec_size(p, BRW_EXECUTE_16); - brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); - brw_MOV(p, - retype(payload, BRW_REGISTER_TYPE_UD), - retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - brw_pop_insn_state(p); - - if (inst->target > 0 && key->replicate_alpha) { - /* Set "Source0 Alpha Present to RenderTarget" bit in message - * header. - */ - brw_OR(p, - vec1(retype(payload, BRW_REGISTER_TYPE_UD)), - vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)), - brw_imm_ud(0x1 << 11)); - } - - if (inst->target > 0) { - /* Set the render target index for choosing BLEND_STATE. 
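The removed header-setup path above is mostly bit-twiddling on the header GRFs copied from g0/g1. A rough sketch of the idea with made-up values; the struct-of-dwords layout here is illustrative only, but the bit-11 flag and the render-target index in dword 2 are the ones used in the hunk:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       uint32_t header[16] = {0};  /* stands in for the header GRFs */
       const unsigned target = 2;  /* render target index, made up */

       header[0] |= 1u << 11;      /* "Source0 Alpha Present to RenderTarget" */
       header[2] = target;         /* RT index used to choose BLEND_STATE */

       printf("dw0=0x%08x dw2=%u\n", (unsigned)header[0], (unsigned)header[2]);
       return 0;
    }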
*/ - brw_MOV(p, retype(vec1(suboffset(payload, 2)), - BRW_REGISTER_TYPE_UD), - brw_imm_ud(inst->target)); - } - - /* Set computes stencil to render target */ - if (prog_data->computed_stencil) { - brw_OR(p, - vec1(retype(payload, BRW_REGISTER_TYPE_UD)), - vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)), - brw_imm_ud(0x1 << 14)); - } - - implied_header = brw_null_reg(); - } else { - implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); - } - - brw_pop_insn_state(p); - } else { - implied_header = brw_null_reg(); - } - if (!runtime_check_aads_emit) { fire_fb_write(inst, payload, implied_header, inst->mlen); } else { @@ -431,8 +448,6 @@ fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst, gen9_fb_READ(p, dst, payload, surf_index, inst->header_size, inst->size_written / REG_SIZE, prog_data->persample_dispatch); - - brw_mark_surface_used(&prog_data->base, surf_index); } void @@ -453,7 +468,15 @@ fs_generator::generate_mov_indirect(fs_inst *inst, reg.nr = imm_byte_offset / REG_SIZE; reg.subnr = imm_byte_offset % REG_SIZE; - brw_MOV(p, dst, reg); + if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) { + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), + subscript(reg, BRW_REGISTER_TYPE_D, 0)); + brw_set_default_swsb(p, tgl_swsb_null()); + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), + subscript(reg, BRW_REGISTER_TYPE_D, 1)); + } else { + brw_MOV(p, dst, reg); + } } else { /* Prior to Broadwell, there are only 8 address registers. */ assert(inst->exec_size <= 8 || devinfo->gen >= 8); @@ -461,6 +484,13 @@ fs_generator::generate_mov_indirect(fs_inst *inst, /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ struct brw_reg addr = vec8(brw_address_reg(0)); + /* Whether we can use destination dependency control without running the + * risk of a hang if an instruction gets shot down. + */ + const bool use_dep_ctrl = !inst->predicate && + inst->exec_size == dispatch_width; + brw_inst *insn; + /* The destination stride of an instruction (in bytes) must be greater * than or equal to the size of the rest of the instruction. Since the * address register is of type UW, we can't use a D-type instruction. @@ -493,12 +523,34 @@ fs_generator::generate_mov_indirect(fs_inst *inst, * In the end, while base_offset is nice to look at in the generated * code, using it saves us 0 instructions and would require quite a bit * of case-by-case work. It's just not worth it. + * + * Due to a hardware bug some platforms (particularly Gen11+) seem to + * require the address components of all channels to be valid whether or + * not they're active, which causes issues if we use VxH addressing + * under non-uniform control-flow. We can easily work around that by + * initializing the whole address register with a pipelined NoMask MOV + * instruction. 
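When the indirect offset is an immediate, the fast path above just folds the whole byte offset into a register number and sub-register offset. The same arithmetic in a standalone form (REG_SIZE is one 32-byte GRF; the offsets are made up):

    #include <stdio.h>

    #define REG_SIZE 32

    int main(void)
    {
       /* reg.nr = 4, reg.subnr = 8, plus an immediate offset of 72 bytes */
       const unsigned imm_byte_offset = 4 * REG_SIZE + 8 + 72;

       printf("nr=%u subnr=%u\n",
              imm_byte_offset / REG_SIZE,   /* 6 */
              imm_byte_offset % REG_SIZE);  /* 16 */
       return 0;
    }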
*/ - brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); + if (devinfo->gen >= 7) { + insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset)); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); + brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); + if (devinfo->gen >= 12) + brw_set_default_swsb(p, tgl_swsb_null()); + else + brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl); + } + + insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); + if (devinfo->gen >= 12) + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + else if (devinfo->gen >= 7) + brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl); if (type_sz(reg.type) > 4 && ((devinfo->gen == 7 && !devinfo->is_haswell) || - devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) { + devinfo->is_cherryview || gen_device_info_is_9lp(devinfo) || + !devinfo->has_64bit_float)) { /* IVB has an issue (which we found empirically) where it reads two * address register components per channel for indirectly addressed * 64-bit sources. @@ -516,6 +568,7 @@ fs_generator::generate_mov_indirect(fs_inst *inst, */ brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D)); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D)); } else { @@ -540,6 +593,181 @@ fs_generator::generate_mov_indirect(fs_inst *inst, } } +void +fs_generator::generate_shuffle(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg idx) +{ + /* Ivy bridge has some strange behavior that makes this a real pain to + * implement for 64-bit values so we just don't bother. + */ + assert(devinfo->gen >= 8 || devinfo->is_haswell || type_sz(src.type) <= 4); + + /* Because we're using the address register, we're limited to 8-wide + * execution on gen7. On gen8, we're limited to 16-wide by the address + * register file and 8-wide for 64-bit types. We could try and make this + * instruction splittable higher up in the compiler but that gets weird + * because it reads all of the channels regardless of execution size. It's + * easier just to split it here. + */ + const unsigned lower_width = + (devinfo->gen <= 7 || type_sz(src.type) > 4) ? + 8 : MIN2(16, inst->exec_size); + + brw_set_default_exec_size(p, cvt(lower_width) - 1); + for (unsigned group = 0; group < inst->exec_size; group += lower_width) { + brw_set_default_group(p, group); + + if ((src.vstride == 0 && src.hstride == 0) || + idx.file == BRW_IMMEDIATE_VALUE) { + /* Trivial, the source is already uniform or the index is a constant. + * We will typically not get here if the optimizer is doing its job, + * but asserting would be mean. + */ + const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0; + brw_MOV(p, suboffset(dst, group), stride(suboffset(src, i), 0, 1, 0)); + } else { + /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ + struct brw_reg addr = vec8(brw_address_reg(0)); + + struct brw_reg group_idx = suboffset(idx, group); + + if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) { + /* Things get grumpy if the register is too wide. */ + group_idx.width--; + group_idx.vstride--; + } + + assert(type_sz(group_idx.type) <= 4); + if (type_sz(group_idx.type) == 4) { + /* The destination stride of an instruction (in bytes) must be + * greater than or equal to the size of the rest of the + * instruction. Since the address register is of type UW, we + * can't use a D-type instruction. 
In order to get around this,
+             * we retype to UW and use a stride.
+             */
+            group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
+         }
+
+         /* Take into account the component size and horizontal stride. */
+         assert(src.vstride == src.hstride + src.width);
+         brw_SHL(p, addr, group_idx,
+                 brw_imm_uw(util_logbase2(type_sz(src.type)) +
+                            src.hstride - 1));
+
+         /* Add on the register start offset */
+         brw_set_default_swsb(p, tgl_swsb_regdist(1));
+         brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr));
+
+         if (type_sz(src.type) > 4 &&
+             ((devinfo->gen == 7 && !devinfo->is_haswell) ||
+              devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+            /* IVB has an issue (which we found empirically) where it reads
+             * two address register components per channel for indirectly
+             * addressed 64-bit sources.
+             *
+             * From the Cherryview PRM Vol 7. "Register Region Restrictions":
+             *
+             *    "When source or destination datatype is 64b or operation is
+             *    integer DWord multiply, indirect addressing must not be
+             *    used."
+             *
+             * To work around both of these, we do two integer MOVs instead of
+             * one 64-bit MOV. Because no double value should ever cross a
+             * register boundary, it's safe to use the immediate offset in the
+             * indirect here to handle adding 4 bytes to the offset and avoid
+             * the extra ADD to the register file.
+             */
+            struct brw_reg gdst = suboffset(dst, group);
+            struct brw_reg dst_d = retype(spread(gdst, 2),
+                                          BRW_REGISTER_TYPE_D);
+            assert(dst.hstride == 1);
+            brw_MOV(p, dst_d,
+                    retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
+            brw_set_default_swsb(p, tgl_swsb_null());
+            brw_MOV(p, byte_offset(dst_d, 4),
+                    retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
+         } else {
+            brw_MOV(p, suboffset(dst, group * dst.hstride),
+                    retype(brw_VxH_indirect(0, 0), src.type));
+         }
+      }
+
+      brw_set_default_swsb(p, tgl_swsb_null());
+   }
+}
+
+void
+fs_generator::generate_quad_swizzle(const fs_inst *inst,
+                                    struct brw_reg dst, struct brw_reg src,
+                                    unsigned swiz)
+{
+   /* Requires a quad.
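The SHL/ADD pair in generate_shuffle() above computes one byte address per channel. The same arithmetic in standalone form; the values are made up and util_logbase2() is re-implemented locally for self-containment:

    #include <stdio.h>

    #define REG_SIZE 32

    static unsigned util_logbase2(unsigned n)
    {
       unsigned log2 = 0;
       while ((1u << (log2 + 1)) <= n)
          log2++;
       return log2;
    }

    int main(void)
    {
       const unsigned idx = 5, type_size = 4;  /* channel 5 of a D-type src */
       const unsigned hstride = 1;             /* packed; encoded log2 + 1 */
       const unsigned nr = 10, subnr = 0;      /* source starts at g10.0 */

       const unsigned addr = (idx << (util_logbase2(type_size) + hstride - 1))
                           + nr * REG_SIZE + subnr;
       printf("byte address %u\n", addr);      /* 5*4 + 10*32 = 340 */
       return 0;
    }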
*/ + assert(inst->exec_size >= 4); + + if (src.file == BRW_IMMEDIATE_VALUE || + has_scalar_region(src)) { + /* The value is uniform across all channels */ + brw_MOV(p, dst, src); + + } else if (devinfo->gen < 11 && type_sz(src.type) == 4) { + /* This only works on 8-wide 32-bit values */ + assert(inst->exec_size == 8); + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); + assert(src.vstride == src.width + 1); + brw_set_default_access_mode(p, BRW_ALIGN_16); + struct brw_reg swiz_src = stride(src, 4, 4, 1); + swiz_src.swizzle = swiz; + brw_MOV(p, dst, swiz_src); + + } else { + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); + assert(src.vstride == src.width + 1); + const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0)); + + switch (swiz) { + case BRW_SWIZZLE_XXXX: + case BRW_SWIZZLE_YYYY: + case BRW_SWIZZLE_ZZZZ: + case BRW_SWIZZLE_WWWW: + brw_MOV(p, dst, stride(src_0, 4, 4, 0)); + break; + + case BRW_SWIZZLE_XXZZ: + case BRW_SWIZZLE_YYWW: + brw_MOV(p, dst, stride(src_0, 2, 2, 0)); + break; + + case BRW_SWIZZLE_XYXY: + case BRW_SWIZZLE_ZWZW: + assert(inst->exec_size == 4); + brw_MOV(p, dst, stride(src_0, 0, 2, 1)); + break; + + default: + assert(inst->force_writemask_all); + brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1); + + for (unsigned c = 0; c < 4; c++) { + brw_inst *insn = brw_MOV( + p, stride(suboffset(dst, c), + 4 * inst->dst.stride, 1, 4 * inst->dst.stride), + stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0)); + + if (devinfo->gen < 12) { + brw_inst_set_no_dd_clear(devinfo, insn, c < 3); + brw_inst_set_no_dd_check(devinfo, insn, c > 0); + } + + brw_set_default_swsb(p, tgl_swsb_null()); + } + + break; + } + } +} + void fs_generator::generate_urb_read(fs_inst *inst, struct brw_reg dst, @@ -552,7 +780,8 @@ fs_generator::generate_urb_read(fs_inst *inst, brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD)); brw_set_src0(p, send, header); - brw_set_src1(p, send, brw_imm_ud(0u)); + if (devinfo->gen < 12) + brw_set_src1(p, send, brw_imm_ud(0u)); brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB); brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ); @@ -588,7 +817,8 @@ fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload) brw_set_dest(p, insn, brw_null_reg()); brw_set_src0(p, insn, payload); - brw_set_src1(p, insn, brw_imm_d(0)); + if (devinfo->gen < 12) + brw_set_src1(p, insn, brw_imm_ud(0u)); brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB); brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE); @@ -617,7 +847,8 @@ fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW)); - brw_set_src1(p, insn, brw_imm_d(0)); + if (devinfo->gen < 12) + brw_set_src1(p, insn, brw_imm_ud(0u)); /* Terminate a compute shader by sending a message to the thread spawner. */ @@ -628,22 +859,30 @@ fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) brw_inst_set_header_present(devinfo, insn, false); brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */ - brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ - /* Note that even though the thread has a URB resource associated with it, - * we set the "do not dereference URB" bit, because the URB resource is - * managed by the fixed-function unit, so it will free it automatically. 
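The generic case of generate_quad_swizzle() earlier in this hunk lowers to four per-channel MOVs driven by a 2-bit-per-channel selector. A standalone model of that selection, assuming BRW_GET_SWZ's usual bit layout; the selector value is arbitrary:

    #include <stdio.h>

    static unsigned get_swz(unsigned swiz, unsigned c)
    {
       return (swiz >> (2 * c)) & 0x3;   /* 2-bit field c of the selector */
    }

    int main(void)
    {
       const float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
       const unsigned swiz = 0x1b;       /* selects W, Z, Y, X in each quad */
       float dst[8];

       for (unsigned quad = 0; quad < 8; quad += 4)
          for (unsigned c = 0; c < 4; c++)
             dst[quad + c] = src[quad + get_swz(swiz, c)];

       for (unsigned i = 0; i < 8; i++)
          printf("%g ", dst[i]);         /* 3 2 1 0 7 6 5 4 */
       printf("\n");
       return 0;
    }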
-    */
-   brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
+   if (devinfo->gen < 11) {
+      brw_inst_set_ts_request_type(devinfo, insn, 0);   /* Root thread */
+
+      /* Note that even though the thread has a URB resource associated with it,
+       * we set the "do not dereference URB" bit, because the URB resource is
+       * managed by the fixed-function unit, so it will free it automatically.
+       */
+      brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
+   }
 
    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
 }
 
 void
-fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
+fs_generator::generate_barrier(fs_inst *, struct brw_reg src)
 {
    brw_barrier(p, src);
-   brw_WAIT(p);
+   if (devinfo->gen >= 12) {
+      brw_set_default_swsb(p, tgl_swsb_null());
+      brw_SYNC(p, TGL_SYNC_BAR);
+   } else {
+      brw_WAIT(p);
+   }
 }
 
 bool
@@ -673,55 +912,69 @@ fs_generator::generate_linterp(fs_inst *inst,
    struct brw_reg delta_x = src[0];
    struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
    struct brw_reg interp = src[1];
-   brw_inst *i[4];
-
-   if (devinfo->gen >= 11) {
-      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_NF);
-      struct brw_reg dwP = suboffset(interp, 0);
-      struct brw_reg dwQ = suboffset(interp, 1);
-      struct brw_reg dwR = suboffset(interp, 3);
+   brw_inst *i[2];
 
-      brw_set_default_exec_size(p, BRW_EXECUTE_8);
+   /* nir_lower_interpolation() will do the lowering to MAD instructions for
+    * us on gen11+
+    */
+   assert(devinfo->gen < 11);
 
-      if (inst->exec_size == 8) {
-         i[0] = brw_MAD(p, acc, dwR, offset(delta_x, 0), dwP);
-         i[1] = brw_MAD(p, offset(dst, 0), acc, offset(delta_y, 0), dwQ);
+   if (devinfo->has_pln) {
+      if (devinfo->gen <= 6 && (delta_x.nr & 1) != 0) {
+         /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane":
+          *
+          *    "[DevSNB]: <src1> must be even register aligned.
+          *
+          * This restriction is lifted on Ivy Bridge.
+          *
+          * This means that we need to split PLN into LINE+MAC on-the-fly.
+          * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so
+          * we have to split into SIMD8 pieces. For gen4 (!has_pln), the
+          * coordinate registers are laid out differently so we leave it as a
+          * SIMD16 instruction.
+          */
+         assert(inst->exec_size == 8 || inst->exec_size == 16);
+         assert(inst->group % 16 == 0);
 
-         brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod);
+         brw_push_insn_state(p);
+         brw_set_default_exec_size(p, BRW_EXECUTE_8);
 
-         /* brw_set_default_saturate() is called before emitting instructions,
-          * so the saturate bit is set in each instruction, so we need to unset
-          * it on the first instruction of each pair.
+         /* Thanks to two accumulators, we can emit all the LINEs and then all
+          * the MACs. This improves parallelism a bit.
          */
-         brw_inst_set_saturate(p->devinfo, i[0], false);
-      } else {
-         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
-         i[0] = brw_MAD(p, acc, dwR, offset(delta_x, 0), dwP);
-         i[1] = brw_MAD(p, offset(dst, 0), acc, offset(delta_x, 1), dwQ);
+         for (unsigned g = 0; g < inst->exec_size / 8; g++) {
+            brw_inst *line = brw_LINE(p, brw_null_reg(), interp,
+                                      offset(delta_x, g * 2));
+            brw_inst_set_group(devinfo, line, inst->group + g * 8);
 
-         brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
-         i[2] = brw_MAD(p, acc, dwR, offset(delta_y, 0), dwP);
-         i[3] = brw_MAD(p, offset(dst, 1), acc, offset(delta_y, 1), dwQ);
+            /* LINE writes the accumulator automatically on gen4-5. On Sandy
+             * Bridge and later, we have to explicitly enable it.
+ */ + if (devinfo->gen >= 6) + brw_inst_set_acc_wr_control(p->devinfo, line, true); - brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); + /* brw_set_default_saturate() is called before emitting + * instructions, so the saturate bit is set in each instruction, + * so we need to unset it on the LINE instructions. + */ + brw_inst_set_saturate(p->devinfo, line, false); + } - brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod); - brw_inst_set_cond_modifier(p->devinfo, i[3], inst->conditional_mod); + for (unsigned g = 0; g < inst->exec_size / 8; g++) { + brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1), + offset(delta_x, g * 2 + 1)); + brw_inst_set_group(devinfo, mac, inst->group + g * 8); + brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod); + } - /* brw_set_default_saturate() is called before emitting instructions, - * so the saturate bit is set in each instruction, so we need to unset - * it on the first instruction of each pair. - */ - brw_inst_set_saturate(p->devinfo, i[0], false); - brw_inst_set_saturate(p->devinfo, i[2], false); - } + brw_pop_insn_state(p); - return true; - } else if (devinfo->has_pln && - (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) { - brw_PLN(p, dst, interp, delta_x); + return true; + } else { + brw_PLN(p, dst, interp, delta_x); - return false; + return false; + } } else { i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x); i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y); @@ -778,20 +1031,23 @@ fs_generator::generate_get_buffer_size(fs_inst *inst, inst->header_size > 0, simd_mode, BRW_SAMPLER_RETURN_FORMAT_SINT32); - - brw_mark_surface_used(prog_data, surf_index.ud); } void -fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src, +fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg surface_index, struct brw_reg sampler_index) { + assert(devinfo->gen < 7); assert(inst->size_written % REG_SIZE == 0); int msg_type = -1; uint32_t simd_mode; uint32_t return_format; - bool is_combined_send = inst->eot; + + /* Sampler EOT message of less than the dispatch width would kill the + * thread prematurely. + */ + assert(!inst->eot || inst->exec_size == dispatch_width); switch (dst.type) { case BRW_REGISTER_TYPE_D: @@ -850,70 +1106,26 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; } break; - case SHADER_OPCODE_TXL_LZ: - assert(devinfo->gen >= 9); - if (inst->shadow_compare) { - msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ; - } else { - msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ; - } - break; case SHADER_OPCODE_TXS: msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO; break; case SHADER_OPCODE_TXD: - if (inst->shadow_compare) { - /* Gen7.5+. 
Otherwise, lowered in NIR */ - assert(devinfo->gen >= 8 || devinfo->is_haswell); - msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE; - } else { - msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS; - } + assert(!inst->shadow_compare); + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS; break; case SHADER_OPCODE_TXF: msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; break; - case SHADER_OPCODE_TXF_LZ: - assert(devinfo->gen >= 9); - msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ; - break; - case SHADER_OPCODE_TXF_CMS_W: - assert(devinfo->gen >= 9); - msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; - break; case SHADER_OPCODE_TXF_CMS: - if (devinfo->gen >= 7) - msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS; - else - msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; - break; - case SHADER_OPCODE_TXF_UMS: - assert(devinfo->gen >= 7); - msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS; - break; - case SHADER_OPCODE_TXF_MCS: - assert(devinfo->gen >= 7); - msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; + msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; break; case SHADER_OPCODE_LOD: msg_type = GEN5_SAMPLER_MESSAGE_LOD; break; case SHADER_OPCODE_TG4: - if (inst->shadow_compare) { - assert(devinfo->gen >= 7); - msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C; - } else { - assert(devinfo->gen >= 6); - msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4; - } - break; - case SHADER_OPCODE_TG4_OFFSET: - assert(devinfo->gen >= 7); - if (inst->shadow_compare) { - msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C; - } else { - msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; - } + assert(devinfo->gen == 6); + assert(!inst->shadow_compare); + msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4; break; case SHADER_OPCODE_SAMPLEINFO: msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; @@ -992,29 +1204,30 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src dst = vec16(dst); } - assert(devinfo->gen < 7 || inst->header_size == 0 || - src.file == BRW_GENERAL_REGISTER_FILE); - assert(sampler_index.type == BRW_REGISTER_TYPE_UD); /* Load the message header if present. If there's a texture offset, * we need to set it up explicitly and load the offset bitfield. * Otherwise, we can use an implied move from g0 to the first message reg. */ - if (inst->header_size != 0 && devinfo->gen < 7) { + struct brw_reg src = brw_null_reg(); + if (inst->header_size != 0) { if (devinfo->gen < 6 && !inst->offset) { /* Set up an implied move from g0 to the MRF. */ src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); } else { + const tgl_swsb swsb = brw_get_default_swsb(p); assert(inst->base_mrf != -1); struct brw_reg header_reg = brw_message_reg(inst->base_mrf); brw_push_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); brw_set_default_exec_size(p, BRW_EXECUTE_8); brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); /* Explicitly set up the message header by copying g0 to the MRF. */ brw_MOV(p, header_reg, brw_vec8_grf(0, 0)); + brw_set_default_swsb(p, tgl_swsb_regdist(1)); brw_set_default_exec_size(p, BRW_EXECUTE_1); if (inst->offset) { @@ -1024,83 +1237,35 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src } brw_pop_insn_state(p); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); } } - uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 || - inst->opcode == SHADER_OPCODE_TG4_OFFSET) - ? 
prog_data->binding_table.gather_texture_start
-      : prog_data->binding_table.texture_start;
-
-   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
-       sampler_index.file == BRW_IMMEDIATE_VALUE) {
-      uint32_t surface = surface_index.ud;
-      uint32_t sampler = sampler_index.ud;
-
-      brw_SAMPLE(p,
-                 retype(dst, BRW_REGISTER_TYPE_UW),
-                 inst->base_mrf,
-                 src,
-                 surface + base_binding_table_index,
-                 sampler % 16,
-                 msg_type,
-                 inst->size_written / REG_SIZE,
-                 inst->mlen,
-                 inst->header_size != 0,
-                 simd_mode,
-                 return_format);
-
-      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
-   } else {
-      /* Non-const sampler index */
-
-      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
-      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
-      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
-
-      brw_push_insn_state(p);
-      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-      brw_set_default_access_mode(p, BRW_ALIGN_1);
-      brw_set_default_exec_size(p, BRW_EXECUTE_1);
-
-      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
-         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
-      } else {
-         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
-            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
-         } else {
-            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
-            brw_OR(p, addr, addr, surface_reg);
-         }
-      }
-      if (base_binding_table_index)
-         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
-      brw_AND(p, addr, addr, brw_imm_ud(0xfff));
-
-      brw_pop_insn_state(p);
-
-      /* dst = send(offset, a0.0 | <descriptor>) */
-      brw_inst *insn = brw_send_indirect_message(
-         p, BRW_SFID_SAMPLER, dst, src, addr);
-      brw_set_sampler_message(p, insn,
-                              0 /* surface */,
-                              0 /* sampler */,
-                              msg_type,
-                              inst->size_written / REG_SIZE,
-                              inst->mlen /* mlen */,
-                              inst->header_size != 0 /* header */,
-                              simd_mode,
-                              return_format);
-
-      /* visitor knows more than we do about the surface limit required,
-       * so has already done marking.
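The removed variable-sampler path above packs the surface and sampler indices into a0.0 before the indirect send; when both indices live in the same register, a single multiply by 0x101 replicates the value into both byte fields. The packing in isolation, with made-up index values (the base is added conditionally in the original):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t pack_sampler_addr(uint32_t surface, uint32_t sampler,
                                      uint32_t base_binding_table_index)
    {
       uint32_t addr;
       if (surface == sampler)
          addr = sampler * 0x101;          /* sampler << 8 | sampler */
       else
          addr = (sampler << 8) | surface;
       return (addr + base_binding_table_index) & 0xfff;
    }

    int main(void)
    {
       printf("0x%03x\n", (unsigned)pack_sampler_addr(3, 3, 0)); /* 0x303 */
       printf("0x%03x\n", (unsigned)pack_sampler_addr(4, 1, 8)); /* 0x10c */
       return 0;
    }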
-    */
+   uint32_t base_binding_table_index;
+   switch (inst->opcode) {
+   case SHADER_OPCODE_TG4:
+      base_binding_table_index = prog_data->binding_table.gather_texture_start;
+      break;
+   default:
+      base_binding_table_index = prog_data->binding_table.texture_start;
+      break;
    }
 
-   if (is_combined_send) {
-      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
-      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
-   }
+   assert(surface_index.file == BRW_IMMEDIATE_VALUE);
+   assert(sampler_index.file == BRW_IMMEDIATE_VALUE);
+
+   brw_SAMPLE(p,
+              retype(dst, BRW_REGISTER_TYPE_UW),
+              inst->base_mrf,
+              src,
+              surface_index.ud + base_binding_table_index,
+              sampler_index.ud % 16,
+              msg_type,
+              inst->size_written / REG_SIZE,
+              inst->mlen,
+              inst->header_size != 0,
+              simd_mode,
+              return_format);
 }
 
 
@@ -1138,28 +1303,50 @@ fs_generator::generate_ddx(const fs_inst *inst,
 {
    unsigned vstride, width;
 
-   if (inst->opcode == FS_OPCODE_DDX_FINE) {
-      /* produce accurate derivatives */
-      vstride = BRW_VERTICAL_STRIDE_2;
-      width = BRW_WIDTH_2;
-   } else {
-      /* replicate the derivative at the top-left pixel to other pixels */
-      vstride = BRW_VERTICAL_STRIDE_4;
-      width = BRW_WIDTH_4;
-   }
+   if (devinfo->gen >= 8) {
+      if (inst->opcode == FS_OPCODE_DDX_FINE) {
+         /* produce accurate derivatives */
+         vstride = BRW_VERTICAL_STRIDE_2;
+         width = BRW_WIDTH_2;
+      } else {
+         /* replicate the derivative at the top-left pixel to other pixels */
+         vstride = BRW_VERTICAL_STRIDE_4;
+         width = BRW_WIDTH_4;
+      }
+
+      struct brw_reg src0 = byte_offset(src, type_sz(src.type));
+      struct brw_reg src1 = src;
 
-   struct brw_reg src0 = src;
-   struct brw_reg src1 = src;
+      src0.vstride = vstride;
+      src0.width = width;
+      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
+      src1.vstride = vstride;
+      src1.width = width;
+      src1.hstride = BRW_HORIZONTAL_STRIDE_0;
 
-   src0.subnr = sizeof(float);
-   src0.vstride = vstride;
-   src0.width = width;
-   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
-   src1.vstride = vstride;
-   src1.width = width;
-   src1.hstride = BRW_HORIZONTAL_STRIDE_0;
+      brw_ADD(p, dst, src0, negate(src1));
+   } else {
+      /* On Haswell and earlier, the region used above appears to not work
+       * correctly for compressed instructions. At least on Haswell and
+       * Iron Lake, compressed ALIGN16 instructions do work. Since we
+       * would have to split to SIMD8 no matter which method we choose, we
+       * may as well use ALIGN16 on all platforms gen7 and earlier.
+       */
+      struct brw_reg src0 = stride(src, 4, 4, 1);
+      struct brw_reg src1 = stride(src, 4, 4, 1);
+      if (inst->opcode == FS_OPCODE_DDX_FINE) {
+         src0.swizzle = BRW_SWIZZLE_XXZZ;
+         src1.swizzle = BRW_SWIZZLE_YYWW;
+      } else {
+         src0.swizzle = BRW_SWIZZLE_XXXX;
+         src1.swizzle = BRW_SWIZZLE_YYYY;
+      }
 
-   brw_ADD(p, dst, src0, negate(src1));
+      brw_push_insn_state(p);
+      brw_set_default_access_mode(p, BRW_ALIGN_16);
+      brw_ADD(p, dst, negate(src0), src1);
+      brw_pop_insn_state(p);
+   }
 }
 
 /* The negate_value boolean is used to negate the derivative computation for
@@ -1170,35 +1357,35 @@ void
 fs_generator::generate_ddy(const fs_inst *inst,
                            struct brw_reg dst, struct brw_reg src)
 {
+   const uint32_t type_size = type_sz(src.type);
+
    if (inst->opcode == FS_OPCODE_DDY_FINE) {
-      /* produce accurate derivatives */
-      if (devinfo->gen >= 11) {
+      /* produce accurate derivatives.
+       *
+       * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU)
+       * "Register Region Restrictions", Section "1.
Special Restrictions": + * + * "In Align16 mode, the channel selects and channel enables apply to + * a pair of half-floats, because these parameters are defined for + * DWord elements ONLY. This is applicable when both source and + * destination are half-floats." + * + * So for half-float operations we use the Gen11+ Align1 path. CHV + * inherits its FP16 hardware from SKL, so it is not affected. + */ + if (devinfo->gen >= 11 || + (devinfo->is_broadwell && src.type == BRW_REGISTER_TYPE_HF)) { src = stride(src, 0, 2, 1); - struct brw_reg src_0 = byte_offset(src, 0 * sizeof(float)); - struct brw_reg src_2 = byte_offset(src, 2 * sizeof(float)); - struct brw_reg src_4 = byte_offset(src, 4 * sizeof(float)); - struct brw_reg src_6 = byte_offset(src, 6 * sizeof(float)); - struct brw_reg src_8 = byte_offset(src, 8 * sizeof(float)); - struct brw_reg src_10 = byte_offset(src, 10 * sizeof(float)); - struct brw_reg src_12 = byte_offset(src, 12 * sizeof(float)); - struct brw_reg src_14 = byte_offset(src, 14 * sizeof(float)); - - struct brw_reg dst_0 = byte_offset(dst, 0 * sizeof(float)); - struct brw_reg dst_4 = byte_offset(dst, 4 * sizeof(float)); - struct brw_reg dst_8 = byte_offset(dst, 8 * sizeof(float)); - struct brw_reg dst_12 = byte_offset(dst, 12 * sizeof(float)); brw_push_insn_state(p); brw_set_default_exec_size(p, BRW_EXECUTE_4); - - brw_ADD(p, dst_0, negate(src_0), src_2); - brw_ADD(p, dst_4, negate(src_4), src_6); - - if (inst->exec_size == 16) { - brw_ADD(p, dst_8, negate(src_8), src_10); - brw_ADD(p, dst_12, negate(src_12), src_14); + for (uint32_t g = 0; g < inst->exec_size; g += 4) { + brw_set_default_group(p, inst->group + g); + brw_ADD(p, byte_offset(dst, g * type_size), + negate(byte_offset(src, g * type_size)), + byte_offset(src, (g + 2) * type_size)); + brw_set_default_swsb(p, tgl_swsb_null()); } - brw_pop_insn_state(p); } else { struct brw_reg src0 = stride(src, 4, 4, 1); @@ -1213,26 +1400,40 @@ fs_generator::generate_ddy(const fs_inst *inst, } } else { /* replicate the derivative at the top-left pixel to other pixels */ - struct brw_reg src0 = stride(src, 4, 4, 0); - struct brw_reg src1 = stride(src, 4, 4, 0); - src0.subnr = 0 * sizeof(float); - src1.subnr = 2 * sizeof(float); + if (devinfo->gen >= 8) { + struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size); + struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size); - brw_ADD(p, dst, negate(src0), src1); + brw_ADD(p, dst, negate(src0), src1); + } else { + /* On Haswell and earlier, the region used above appears to not work + * correctly for compressed instructions. At least on Haswell and + * Iron Lake, compressed ALIGN16 instructions do work. Since we + * would have to split to SIMD8 no matter which method we choose, we + * may as well use ALIGN16 on all platforms gen7 and earlier. + */ + struct brw_reg src0 = stride(src, 4, 4, 1); + struct brw_reg src1 = stride(src, 4, 4, 1); + src0.swizzle = BRW_SWIZZLE_XXXX; + src1.swizzle = BRW_SWIZZLE_ZZZZ; + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, dst, negate(src0), src1); + brw_pop_insn_state(p); + } } } void -fs_generator::generate_discard_jump(fs_inst *inst) +fs_generator::generate_discard_jump(fs_inst *) { - assert(devinfo->gen >= 6); - /* This HALT will be patched up at FB write time to point UIP at the end of * the program, and at brw_uip_jip() JIP will be set to the end of the * current block (or the program). 
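The fine-derivative path in generate_ddy() above amounts to a bottom-minus-top subtraction within each 2x2 subspan, with the result replicated to all four channels. A standalone model, assuming the usual subspan channel layout (channels 0-1 are the top row, 2-3 the bottom); the input values are made up:

    #include <stdio.h>

    int main(void)
    {
       const float src[8] = {0, 1, 4, 9, 2, 3, 8, 15};
       float dst[8];

       for (unsigned g = 0; g < 8; g += 4) {        /* one 2x2 subspan per pass */
          for (unsigned c = 0; c < 2; c++) {
             const float d = src[g + c + 2] - src[g + c];
             dst[g + c] = dst[g + c + 2] = d;       /* replicated to both rows */
          }
       }

       for (unsigned i = 0; i < 8; i++)
          printf("%g ", dst[i]);                    /* 4 8 4 8 6 12 6 12 */
       printf("\n");
       return 0;
    }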
*/
    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
-   gen6_HALT(p);
+   brw_HALT(p);
 }
 
 void
@@ -1246,6 +1447,7 @@ fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
    const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
                                MIN2(16, inst->exec_size);
    const unsigned block_size = 4 * lower_size / REG_SIZE;
+   const tgl_swsb swsb = brw_get_default_swsb(p);
    assert(inst->mlen != 0);
 
    brw_push_insn_state(p);
@@ -1255,9 +1457,17 @@ fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
    for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
       brw_set_default_group(p, inst->group + lower_size * i);
 
+      if (i > 0) {
+         assert(swsb.mode & TGL_SBID_SET);
+         brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid));
+      } else {
+         brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
+      }
+
       brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
               retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
 
+      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
       brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
                                     block_size,
                                     inst->offset + block_size * REG_SIZE * i);
@@ -1323,23 +1533,26 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
       brw_pop_insn_state(p);
 
+      brw_inst_set_sfid(devinfo, send, GEN6_SFID_DATAPORT_CONSTANT_CACHE);
       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
-      brw_set_dp_read_message(p, send, surf_index,
-                              BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
-                              GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
-                              GEN6_SFID_DATAPORT_CONSTANT_CACHE,
-                              1, /* mlen */
-                              true, /* header */
-                              DIV_ROUND_UP(inst->size_written, REG_SIZE));
+      brw_set_desc(p, send,
+                   brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written,
+                                                             REG_SIZE), true) |
+                   brw_dp_read_desc(devinfo, surf_index,
+                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
+                                    GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                                    BRW_DATAPORT_READ_TARGET_DATA_CACHE));
    } else {
+      const tgl_swsb swsb = brw_get_default_swsb(p);
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
 
       brw_push_insn_state(p);
       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 
       /* a0.0 = surf_index & 0xff */
+      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
       brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
       brw_set_dest(p, insn_and, addr);
@@ -1347,17 +1560,18 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
 
       /* dst = send(payload, a0.0 | <descriptor>) */
-      brw_inst *insn = brw_send_indirect_message(
+      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
+      brw_send_indirect_message(
          p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
         retype(dst, BRW_REGISTER_TYPE_UD),
-         retype(payload, BRW_REGISTER_TYPE_UD), addr);
-      brw_set_dp_read_message(p, insn, 0 /* surface */,
-                              BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
-                              GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
-                              GEN6_SFID_DATAPORT_CONSTANT_CACHE,
-                              1, /* mlen */
-                              true, /* header */
-                              DIV_ROUND_UP(inst->size_written, REG_SIZE));
+         retype(payload, BRW_REGISTER_TYPE_UD), addr,
+         brw_message_desc(devinfo, 1,
+                          DIV_ROUND_UP(inst->size_written, REG_SIZE), true) |
+         brw_dp_read_desc(devinfo, 0 /* surface */,
+                          BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
+                          GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                          BRW_DATAPORT_READ_TARGET_DATA_CACHE),
+         false /* EOT */);
 
       brw_pop_insn_state(p);
    }
@@ -1404,6 +1618,7 @@
fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
    brw_inst_set_compression(devinfo, send, false);
+   brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER);
    brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
    brw_set_src0(p, send, header);
    if (devinfo->gen < 6)
@@ -1413,114 +1628,11 @@ fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
     * stored in it.
     */
    uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
-   brw_set_sampler_message(p, send,
-                           surf_index,
-                           0, /* sampler (unused) */
-                           msg_type,
-                           rlen,
-                           inst->mlen,
-                           inst->header_size != 0,
-                           simd_mode,
-                           return_format);
-}
-
-void
-fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
-                                                       struct brw_reg dst,
-                                                       struct brw_reg index,
-                                                       struct brw_reg offset)
-{
-   assert(devinfo->gen >= 7);
-   /* Varying-offset pull constant loads are treated as a normal expression on
-    * gen7, so the fact that it's a send message is hidden at the IR level.
-    */
-   assert(inst->header_size == 0);
-   assert(!inst->mlen);
-   assert(index.type == BRW_REGISTER_TYPE_UD);
-
-   uint32_t simd_mode, rlen, mlen;
-   if (inst->exec_size == 16) {
-      mlen = 2;
-      rlen = 8;
-      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
-   } else {
-      assert(inst->exec_size == 8);
-      mlen = 1;
-      rlen = 4;
-      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
-   }
-
-   if (index.file == BRW_IMMEDIATE_VALUE) {
-
-      uint32_t surf_index = index.ud;
-
-      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
-      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
-      brw_set_src0(p, send, offset);
-      brw_set_sampler_message(p, send,
-                              surf_index,
-                              0, /* LD message ignores sampler unit */
-                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                              rlen,
-                              mlen,
-                              false, /* no header */
-                              simd_mode,
-                              0);
-
-   } else {
-
-      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
-
-      brw_push_insn_state(p);
-      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-
-      /* a0.0 = surf_index & 0xff */
-      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
-      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
-      brw_set_dest(p, insn_and, addr);
-      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
-      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
-
-      brw_pop_insn_state(p);
-
-      /* dst = send(offset, a0.0 | <descriptor>) */
-      brw_inst *insn = brw_send_indirect_message(
-         p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
-         offset, addr);
-      brw_set_sampler_message(p, insn,
-                              0 /* surface */,
-                              0 /* sampler */,
-                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                              rlen /* rlen */,
-                              mlen /* mlen */,
-                              false /* header */,
-                              simd_mode,
-                              0);
-   }
-}
-
-/**
- * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
- * into the flags register (f0.0).
- *
- * Used only on Gen6 and above.
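The brw_set_desc()/brw_message_desc() calls introduced above replace the per-message helpers with explicit descriptor packing. A rough sketch of the common fields, assuming the classic Gen SEND descriptor layout (mlen in bits 28:25, rlen in bits 24:20, header-present at bit 19):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t message_desc(unsigned mlen, unsigned rlen, bool header)
    {
       return (uint32_t)mlen << 25 |
              (uint32_t)rlen << 20 |
              (header ? 1u : 0u) << 19;
    }

    int main(void)
    {
       /* The SIMD16 sampler LD case above: mlen 2, rlen 8, no header. */
       printf("0x%08x\n", (unsigned)message_desc(2, 8, false)); /* 0x04800000 */
       return 0;
    }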
- */ -void -fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst) -{ - struct brw_reg flags = brw_flag_subreg(inst->flag_subreg); - struct brw_reg dispatch_mask; - - if (devinfo->gen >= 6) - dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); - else - dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); - - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_set_default_exec_size(p, BRW_EXECUTE_1); - brw_MOV(p, flags, dispatch_mask); - brw_pop_insn_state(p); + brw_set_desc(p, send, + brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) | + brw_sampler_desc(devinfo, surf_index, + 0, /* sampler (unused) */ + msg_type, simd_mode, return_format)); } void @@ -1530,16 +1642,18 @@ fs_generator::generate_pixel_interpolator_query(fs_inst *inst, struct brw_reg msg_data, unsigned msg_type) { - assert(inst->size_written % REG_SIZE == 0); + const bool has_payload = inst->src[0].file != BAD_FILE; assert(msg_data.type == BRW_REGISTER_TYPE_UD); + assert(inst->size_written % REG_SIZE == 0); brw_pixel_interpolator_query(p, retype(dst, BRW_REGISTER_TYPE_UW), - src, + /* If we don't have a payload, what we send doesn't matter */ + has_payload ? src : brw_vec8_grf(0, 0), inst->pi_noperspective, msg_type, msg_data, - inst->mlen, + has_payload ? 2 * inst->exec_size / 8 : 1, inst->size_written / REG_SIZE); } @@ -1557,22 +1671,25 @@ fs_generator::generate_set_sample_id(fs_inst *inst, assert(src0.type == BRW_REGISTER_TYPE_D || src0.type == BRW_REGISTER_TYPE_UD); - struct brw_reg reg = stride(src1, 1, 4, 0); - if (devinfo->gen >= 8 || inst->exec_size == 8) { - brw_ADD(p, dst, src0, reg); - } else if (inst->exec_size == 16) { - brw_push_insn_state(p); - brw_set_default_exec_size(p, BRW_EXECUTE_8); - brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); - brw_ADD(p, firsthalf(dst), firsthalf(src0), reg); - brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); - brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2)); - brw_pop_insn_state(p); + const struct brw_reg reg = stride(src1, 1, 4, 0); + const unsigned lower_size = MIN2(inst->exec_size, + devinfo->gen >= 8 ? 16 : 8); + + for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { + brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8), + offset(src0, (src0.vstride == 0 ? 
0 : (1 << (src0.vstride - 1)) * + (i * lower_size / (1 << src0.width))) * + type_sz(src0.type) / REG_SIZE), + suboffset(reg, i * lower_size / 4)); + brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1); + brw_inst_set_group(devinfo, insn, inst->group + lower_size * i); + brw_inst_set_compression(devinfo, insn, lower_size > 8); + brw_set_default_swsb(p, tgl_swsb_null()); } } void -fs_generator::generate_pack_half_2x16_split(fs_inst *inst, +fs_generator::generate_pack_half_2x16_split(fs_inst *, struct brw_reg dst, struct brw_reg x, struct brw_reg y) @@ -1602,6 +1719,7 @@ fs_generator::generate_pack_half_2x16_split(fs_inst *inst, /* Now the form: * 0xhhhh0000 */ + brw_set_default_swsb(p, tgl_swsb_regdist(1)); brw_SHL(p, dst, dst, brw_imm_ud(16u)); /* And, finally the form of packHalf2x16's output: @@ -1611,43 +1729,17 @@ fs_generator::generate_pack_half_2x16_split(fs_inst *inst, } void -fs_generator::generate_unpack_half_2x16_split(fs_inst *inst, - struct brw_reg dst, - struct brw_reg src) -{ - assert(devinfo->gen >= 7); - assert(dst.type == BRW_REGISTER_TYPE_F); - assert(src.type == BRW_REGISTER_TYPE_UD); - - /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: - * - * Because this instruction does not have a 16-bit floating-point type, - * the source data type must be Word (W). The destination type must be - * F (Float). - */ - struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2); - - /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll. - * For the Y case, we wish to access only the upper word; therefore - * a 16-bit subregister offset is needed. - */ - assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X || - inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y); - if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y) - src_w.subnr += 2; - - brw_F16TO32(p, dst, src_w); -} - -void -fs_generator::generate_shader_time_add(fs_inst *inst, +fs_generator::generate_shader_time_add(fs_inst *, struct brw_reg payload, struct brw_reg offset, struct brw_reg value) { + const tgl_swsb swsb = brw_get_default_swsb(p); + assert(devinfo->gen >= 7); brw_push_insn_state(p); brw_set_default_mask_control(p, true); + brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); assert(payload.file == BRW_GENERAL_REGISTER_FILE); struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0), @@ -1669,13 +1761,12 @@ fs_generator::generate_shader_time_add(fs_inst *inst, * out of this path, so we just emit the MOVs from here. */ brw_MOV(p, payload_offset, offset); + brw_set_default_swsb(p, tgl_swsb_null()); brw_MOV(p, payload_value, value); + brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); brw_shader_time_add(p, payload, prog_data->binding_table.shader_time_start); brw_pop_insn_state(p); - - brw_mark_surface_used(prog_data, - prog_data->binding_table.shader_time_start); } void @@ -1686,22 +1777,35 @@ fs_generator::enable_debug(const char *shader_name) } int -fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) +fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, + struct shader_stats shader_stats, + const brw::performance &perf, + struct brw_compile_stats *stats) { /* align to 64 byte boundary. */ - while (p->next_insn_offset % 64) - brw_NOP(p); + brw_realign(p, 64); this->dispatch_width = dispatch_width; int start_offset = p->next_insn_offset; + + /* `send_count` explicitly does not include spills or fills, as we'd + * like to use it as a metric for intentional memory access or other + * shared function use. 
Otherwise, subtle changes to scheduling or + * register allocation could cause it to fluctuate wildly - and that + * effect is already counted in spill/fill counts. + */ int spill_count = 0, fill_count = 0; - int loop_count = 0; + int loop_count = 0, send_count = 0, nop_count = 0; + bool is_accum_used = false; struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg); foreach_block_and_inst (block, fs_inst, inst, cfg) { - struct brw_reg src[3], dst; + if (inst->opcode == SHADER_OPCODE_UNDEF) + continue; + + struct brw_reg src[4], dst; unsigned int last_insn_offset = p->next_insn_offset; bool multiple_instructions_emitted = false; @@ -1722,6 +1826,29 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) inst->dst.component_size(inst->exec_size) > REG_SIZE) { brw_NOP(p); last_insn_offset = p->next_insn_offset; + + /* In order to avoid spurious instruction count differences when the + * instruction schedule changes, keep track of the number of inserted + * NOPs. + */ + nop_count++; + } + + /* GEN:BUG:14010017096: + * + * Clear accumulator register before end of thread. + */ + if (inst->eot && is_accum_used && devinfo->gen >= 12) { + brw_set_default_exec_size(p, BRW_EXECUTE_16); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f)); + last_insn_offset = p->next_insn_offset; + } + + if (!is_accum_used && !inst->eot) { + is_accum_used = inst->writes_accumulator_implicitly(devinfo) || + inst->dst.is_accumulator(); } if (unlikely(debug_flag)) @@ -1764,10 +1891,17 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) brw_set_default_access_mode(p, BRW_ALIGN_1); brw_set_default_predicate_control(p, inst->predicate); brw_set_default_predicate_inverse(p, inst->predicate_inverse); - brw_set_default_flag_reg(p, inst->flag_subreg / 2, inst->flag_subreg % 2); + /* On gen7 and above, hardware automatically adds the group onto the + * flag subregister number. On Sandy Bridge and older, we have to do it + * ourselves. + */ + const unsigned flag_subreg = inst->flag_subreg + + (devinfo->gen >= 7 ? 
       brw_set_default_saturate(p, inst->saturate);
       brw_set_default_mask_control(p, inst->force_writemask_all);
       brw_set_default_acc_write_control(p, inst->writes_accumulator);
+      brw_set_default_swsb(p, inst->sched);
 
       unsigned exec_size = inst->exec_size;
       if (devinfo->gen == 7 && !devinfo->is_haswell &&
@@ -1783,6 +1917,10 @@
       assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
 
       switch (inst->opcode) {
+      case BRW_OPCODE_SYNC:
+         assert(src[0].file == BRW_IMMEDIATE_VALUE);
+         brw_SYNC(p, tgl_sync_function(src[0].ud));
+         break;
       case BRW_OPCODE_MOV:
          brw_MOV(p, dst, src[0]);
          break;
@@ -1851,6 +1989,16 @@
       case BRW_OPCODE_SHL:
          brw_SHL(p, dst, src[0], src[1]);
          break;
+      case BRW_OPCODE_ROL:
+         assert(devinfo->gen >= 11);
+         assert(src[0].type == dst.type);
+         brw_ROL(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_ROR:
+         assert(devinfo->gen >= 11);
+         assert(src[0].type == dst.type);
+         brw_ROR(p, dst, src[0], src[1]);
+         break;
       case BRW_OPCODE_F32TO16:
          assert(devinfo->gen >= 7);
          brw_F32TO16(p, dst, src[0]);
          break;
@@ -1875,6 +2023,12 @@
       case BRW_OPCODE_SEL:
          brw_SEL(p, dst, src[0], src[1]);
          break;
+      case BRW_OPCODE_CSEL:
+         assert(devinfo->gen >= 8);
+         if (devinfo->gen < 10)
+            brw_set_default_access_mode(p, BRW_ALIGN_16);
+         brw_CSEL(p, dst, src[0], src[1], src[2]);
+         break;
       case BRW_OPCODE_BFREV:
          assert(devinfo->gen >= 7);
          brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
@@ -1933,7 +2087,7 @@
             assert(devinfo->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
-           brw_IF(p, brw_inst_exec_size(devinfo, p->current));
+           brw_IF(p, brw_get_default_exec_size(p));
         }
         break;
 
@@ -1945,7 +2099,7 @@
         break;
 
      case BRW_OPCODE_DO:
-        brw_DO(p, brw_inst_exec_size(devinfo, p->current));
+        brw_DO(p, brw_get_default_exec_size(p));
         break;
 
      case BRW_OPCODE_BREAK:
@@ -1980,6 +2134,7 @@
                      brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
+         send_count++;
         }
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
@@ -1997,11 +2152,9 @@
           gen4_math(p, dst,
                     brw_math_function(inst->opcode),
                     inst->base_mrf, src[0],
                     BRW_MATH_PRECISION_FULL);
+         send_count++;
        }
        break;
-      case FS_OPCODE_CINTERP:
-        brw_MOV(p, dst, src[0]);
-        break;
      case FS_OPCODE_LINTERP:
        multiple_instructions_emitted = generate_linterp(inst, dst, src);
        break;
@@ -2015,27 +2168,40 @@
          src[0].subnr = 4 * type_sz(src[0].type);
          brw_MOV(p, dst, stride(src[0], 8, 4, 1));
          break;
+
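The new ROL/ROR cases above map bit-rotate opcodes straight onto the Gen11+ instructions; the asserts encode that rotates only exist there and that source and destination types must agree. A hedged sketch with illustrative operands (dst and src0 are ours):

    /* Rotate each dword channel left by 3 bits. */
    assert(devinfo->gen >= 11);
    brw_ROL(p, retype(dst, BRW_REGISTER_TYPE_UD),
            retype(src0, BRW_REGISTER_TYPE_UD), brw_imm_ud(3));
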
+      case SHADER_OPCODE_SEND:
+         generate_send(inst, dst, src[0], src[1], src[2],
+                       inst->ex_mlen > 0 ? src[3] : brw_null_reg());
+         if ((inst->desc & 0xff) == BRW_BTI_STATELESS ||
+             (inst->desc & 0xff) == GEN8_BTI_STATELESS_NON_COHERENT) {
+            if (inst->size_written)
+               fill_count++;
+            else
+               spill_count++;
+         } else {
+            send_count++;
+         }
+         break;
+
       case SHADER_OPCODE_GET_BUFFER_SIZE:
          generate_get_buffer_size(inst, dst, src[0], src[1]);
+         send_count++;
          break;
       case SHADER_OPCODE_TEX:
       case FS_OPCODE_TXB:
       case SHADER_OPCODE_TXD:
       case SHADER_OPCODE_TXF:
-      case SHADER_OPCODE_TXF_LZ:
       case SHADER_OPCODE_TXF_CMS:
-      case SHADER_OPCODE_TXF_CMS_W:
-      case SHADER_OPCODE_TXF_UMS:
-      case SHADER_OPCODE_TXF_MCS:
       case SHADER_OPCODE_TXL:
-      case SHADER_OPCODE_TXL_LZ:
       case SHADER_OPCODE_TXS:
       case SHADER_OPCODE_LOD:
       case SHADER_OPCODE_TG4:
-      case SHADER_OPCODE_TG4_OFFSET:
       case SHADER_OPCODE_SAMPLEINFO:
-         generate_tex(inst, dst, src[0], src[1], src[2]);
-         break;
+         assert(inst->src[0].file == BAD_FILE);
+         generate_tex(inst, dst, src[1], src[2]);
+         send_count++;
+         break;
+
       case FS_OPCODE_DDX_COARSE:
       case FS_OPCODE_DDX_FINE:
          generate_ddx(inst, dst, src[0]);
          break;
@@ -2067,6 +2233,7 @@
       case SHADER_OPCODE_URB_READ_SIMD8:
       case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
          generate_urb_read(inst, dst, src[0]);
+         send_count++;
          break;
 
       case SHADER_OPCODE_URB_WRITE_SIMD8:
@@ -2074,37 +2241,35 @@
       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
          generate_urb_write(inst, src[0]);
+         send_count++;
          break;
 
       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
          assert(inst->force_writemask_all);
          generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
+         send_count++;
          break;
 
       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
          assert(inst->force_writemask_all);
          generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+         send_count++;
          break;
 
       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
          generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
-         break;
-
-      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
-         generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+         send_count++;
          break;
 
       case FS_OPCODE_REP_FB_WRITE:
       case FS_OPCODE_FB_WRITE:
          generate_fb_write(inst, src[0]);
+         send_count++;
          break;
 
       case FS_OPCODE_FB_READ:
         generate_fb_read(inst, dst, src[0]);
-        break;
-
-      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
-        generate_mov_dispatch_to_flags(inst);
+        send_count++;
        break;
 
      case FS_OPCODE_DISCARD_JUMP:
@@ -2115,63 +2280,50 @@
        generate_shader_time_add(inst, src[0], src[1], src[2]);
        break;
 
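In the SHADER_OPCODE_SEND case above, a message aimed at one of the stateless binding-table indices is bookkept as scratch traffic rather than as an intentional send: if it writes data back it is a fill, otherwise a spill. The same heuristic as a standalone predicate, purely for illustration (the helper name is ours, not the driver's):

    static bool
    send_is_scratch_access(const fs_inst *inst)
    {
       /* The binding-table index lives in the low byte of the descriptor. */
       const unsigned bti = inst->desc & 0xff;
       return bti == BRW_BTI_STATELESS ||
              bti == GEN8_BTI_STATELESS_NON_COHERENT;
    }
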
-      case SHADER_OPCODE_UNTYPED_ATOMIC:
+      case SHADER_OPCODE_INTERLOCK:
+      case SHADER_OPCODE_MEMORY_FENCE: {
+         assert(src[1].file == BRW_IMMEDIATE_VALUE);
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
-                            inst->mlen, !inst->dst.is_null(),
-                            inst->header_size);
-         break;
-
-      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
-         assert(!inst->header_size);
-         assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_untyped_surface_read(p, dst, src[0], src[1],
-                                  inst->mlen, src[2].ud);
-         break;
+         const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
+            BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
 
-      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
-         assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_untyped_surface_write(p, src[0], src[1],
-                                   inst->mlen, src[2].ud,
-                                   inst->header_size);
-         break;
-
-      case SHADER_OPCODE_BYTE_SCATTERED_READ:
-         assert(!inst->header_size);
-         assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_byte_scattered_read(p, dst, src[0], src[1],
-                                 inst->mlen, src[2].ud);
-         break;
-
-      case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
-         assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_byte_scattered_write(p, src[0], src[1],
-                                  inst->mlen, src[2].ud,
-                                  inst->header_size);
+         brw_memory_fence(p, dst, src[0], send_op,
+                          brw_message_target(inst->sfid),
+                          /* commit_enable */ src[1].ud,
+                          /* bti */ src[2].ud);
+         send_count++;
          break;
+      }
 
-      case SHADER_OPCODE_TYPED_ATOMIC:
-         assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_typed_atomic(p, dst, src[0], src[1],
-                          src[2].ud, inst->mlen, !inst->dst.is_null(),
-                          inst->header_size);
-         break;
+      case FS_OPCODE_SCHEDULING_FENCE:
+         if (inst->sources == 0 && inst->sched.regdist == 0 &&
+             inst->sched.mode == TGL_SBID_NULL) {
+            if (unlikely(debug_flag))
+               disasm_info->use_tail = true;
+            break;
+         }
 
-      case SHADER_OPCODE_TYPED_SURFACE_READ:
-         assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_typed_surface_read(p, dst, src[0], src[1],
-                                inst->mlen, src[2].ud,
-                                inst->header_size);
-         break;
+         if (devinfo->gen >= 12) {
+            /* Use the available SWSB information to stall.  A single SYNC is
+             * sufficient since if there were multiple dependencies, the
+             * scoreboard algorithm already injected other SYNCs before this
+             * instruction.
+             */
+            brw_SYNC(p, TGL_SYNC_NOP);
+         } else {
+            for (unsigned i = 0; i < inst->sources; i++) {
+               /* Emit a MOV to force a stall until the instruction producing the
+                * registers finishes.
+                */
+               brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
+                       retype(src[i], BRW_REGISTER_TYPE_UW));
+            }
 
-      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
-         assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud,
-                                 inst->header_size);
-         break;
+            if (inst->sources > 1)
+               multiple_instructions_emitted = true;
+         }
 
-      case SHADER_OPCODE_MEMORY_FENCE:
-         brw_memory_fence(p, dst);
          break;
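FS_OPCODE_SCHEDULING_FENCE above shows both stall mechanisms side by side. On Gen12+ a single SYNC suffices because its SWSB annotation carries the dependencies; on older, in-order hardware the stall is synthesized by reading each fenced register into the null register, which blocks until the producer retires. A condensed sketch (fence_src stands in for one fenced source):

    if (devinfo->gen >= 12)
       brw_SYNC(p, TGL_SYNC_NOP);            /* SWSB carries the wait */
    else
       brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
               retype(fence_src, BRW_REGISTER_TYPE_UW));
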
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
@@ -2183,12 +2335,89 @@
          brw_find_live_channel(p, dst, mask);
          break;
       }
-
+      case FS_OPCODE_LOAD_LIVE_CHANNELS: {
+         assert(devinfo->gen >= 8);
+         assert(inst->force_writemask_all && inst->group == 0);
+         assert(inst->dst.file == BAD_FILE);
+         brw_set_default_exec_size(p, BRW_EXECUTE_1);
+         brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg),
+                           BRW_REGISTER_TYPE_UD),
+                 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));
+         break;
+      }
       case SHADER_OPCODE_BROADCAST:
          assert(inst->force_writemask_all);
          brw_broadcast(p, dst, src[0], src[1]);
          break;
 
+      case SHADER_OPCODE_SHUFFLE:
+         generate_shuffle(inst, dst, src[0], src[1]);
+         break;
+
+      case SHADER_OPCODE_SEL_EXEC:
+         assert(inst->force_writemask_all);
+         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+         brw_MOV(p, dst, src[1]);
+         brw_set_default_mask_control(p, BRW_MASK_ENABLE);
+         brw_set_default_swsb(p, tgl_swsb_null());
+         brw_MOV(p, dst, src[0]);
+         break;
+
+      case SHADER_OPCODE_QUAD_SWIZZLE:
+         assert(src[1].file == BRW_IMMEDIATE_VALUE);
+         assert(src[1].type == BRW_REGISTER_TYPE_UD);
+         generate_quad_swizzle(inst, dst, src[0], src[1].ud);
+         break;
+
+      case SHADER_OPCODE_CLUSTER_BROADCAST: {
+         assert(!src[0].negate && !src[0].abs);
+         assert(src[1].file == BRW_IMMEDIATE_VALUE);
+         assert(src[1].type == BRW_REGISTER_TYPE_UD);
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         assert(src[2].type == BRW_REGISTER_TYPE_UD);
+         const unsigned component = src[1].ud;
+         const unsigned cluster_size = src[2].ud;
+         unsigned vstride = cluster_size;
+         unsigned width = cluster_size;
+
+         /* The maximum exec_size is 32, but the maximum width is only 16. */
+         if (inst->exec_size == width) {
+            vstride = 0;
+            width = 1;
+         }
+
+         struct brw_reg strided = stride(suboffset(src[0], component),
+                                         vstride, width, 0);
+         if (type_sz(src[0].type) > 4 &&
+             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+            /* IVB has an issue (which we found empirically) where it reads
+             * two address register components per channel for indirectly
+             * addressed 64-bit sources.
+             *
+             * From the Cherryview PRM Vol 7. "Register Region Restrictions":
+             *
+             *    "When source or destination datatype is 64b or operation is
+             *     integer DWord multiply, indirect addressing must not be
+             *     used."
+             *
+             * To work around both of these, we do two integer MOVs instead of
+             * one 64-bit MOV.  Because no double value should ever cross a
+             * register boundary, it's safe to use the immediate offset in the
+             * indirect here to handle adding 4 bytes to the offset and avoid
+             * the extra ADD to the register file.
+             */
+            assert(src[0].type == dst.type);
+            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
+                    subscript(strided, BRW_REGISTER_TYPE_D, 0));
+            brw_set_default_swsb(p, tgl_swsb_null());
+            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
+                    subscript(strided, BRW_REGISTER_TYPE_D, 1));
+         } else {
+            brw_MOV(p, dst, strided);
+         }
+         break;
+      }
+
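A worked example of the region math in the SHADER_OPCODE_CLUSTER_BROADCAST case above, with numbers that are ours: for a SIMD16 instruction with cluster_size == 4 and component == 2, vstride and width both stay 4, giving a <4;4,0> region based at subcomponent 2, so channels 0-3 all read element 2, channels 4-7 read element 6, and so on. When exec_size equals the cluster size there is only one cluster, and the region degenerates to a scalar broadcast:

    /* cluster_size == 4, component == 2, exec_size == 16: */
    struct brw_reg r16 = stride(suboffset(src0, 2), 4, 4, 0);  /* <4;4,0> */

    /* cluster_size == exec_size: a single cluster, scalar region. */
    struct brw_reg r1 = stride(suboffset(src0, 2), 0, 1, 0);   /* <0;1,0> */
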
       case FS_OPCODE_SET_SAMPLE_ID:
          generate_set_sample_id(inst, dst, src[0], src[1]);
          break;
@@ -2197,11 +2426,6 @@
          generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
          break;
 
-      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
-      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
-         generate_unpack_half_2x16_split(inst, dst, src[0]);
-         break;
-
       case FS_OPCODE_PLACEHOLDER_HALT:
          /* This is the place where the final HALT needs to be inserted if
           * we've emitted any discards.  If not, this will emit no code.
@@ -2216,24 +2440,29 @@
       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                            GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
+         send_count++;
          break;
 
       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                            GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
+         send_count++;
          break;
 
       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                            GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
+         send_count++;
          break;
 
       case CS_OPCODE_CS_TERMINATE:
          generate_cs_terminate(inst, src[0]);
+         send_count++;
          break;
 
       case SHADER_OPCODE_BARRIER:
          generate_barrier(inst, src[0]);
+         send_count++;
          break;
 
       case BRW_OPCODE_DIM:
@@ -2243,9 +2472,22 @@
          brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
          break;
 
-      case SHADER_OPCODE_RND_MODE:
+      case SHADER_OPCODE_RND_MODE: {
          assert(src[0].file == BRW_IMMEDIATE_VALUE);
-         brw_rounding_mode(p, (brw_rnd_mode) src[0].d);
+         /* Changes the floating-point rounding mode by updating the control
+          * register field defined at cr0.0 bits [5:6].
+          */
+         enum brw_rnd_mode mode =
+            (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT);
+         brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK);
+      }
+      break;
+
+      case SHADER_OPCODE_FLOAT_CONTROL_MODE:
+         assert(src[0].file == BRW_IMMEDIATE_VALUE);
+         assert(src[1].file == BRW_IMMEDIATE_VALUE);
+         brw_float_controls_mode(p, src[0].d, src[1].d);
          break;
 
       default:
@@ -2267,8 +2509,10 @@
          if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
 
-        brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
-        brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
+         if (devinfo->gen < 12) {
+            brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
+            brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
+         }
       }
    }
 
@@ -2292,33 +2536,77 @@
    int after_size = p->next_insn_offset - start_offset;
 
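For the SHADER_OPCODE_RND_MODE case above, note that the immediate operand is pre-shifted into the cr0.0 rounding-mode field before being handed to brw_float_controls_mode(), which rewrites only the masked control-register bits. A sketch assuming a round-to-zero request (the mode value is illustrative):

    enum brw_rnd_mode mode =
       (enum brw_rnd_mode) (BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT);
    brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK);
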
    if (unlikely(debug_flag)) {
-      fprintf(stderr, "Native code for %s\n"
-              "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
-              " bytes (%.0f%%)\n",
-              shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
-              spill_count, fill_count, promoted_constants, before_size, after_size,
+      unsigned char sha1[21];
+      char sha1buf[41];
+
+      _mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst),
+                         after_size, sha1);
+      _mesa_sha1_format(sha1buf, sha1);
+
+      fprintf(stderr, "Native code for %s (sha1 %s)\n"
+              "SIMD%d shader: %d instructions. %d loops. %u cycles. "
+              "%d:%d spills:fills, %u sends, "
+              "scheduled with mode %s. "
+              "Promoted %u constants. "
+              "Compacted %d to %d bytes (%.0f%%)\n",
+              shader_name, sha1buf,
+              dispatch_width, before_size / 16,
+              loop_count, perf.latency,
+              spill_count, fill_count, send_count,
+              shader_stats.scheduler_mode,
+              shader_stats.promoted_constants,
+              before_size, after_size,
               100.0f * (before_size - after_size) / before_size);
 
-      dump_assembly(p->store, disasm_info);
+      /* overriding the shader makes disasm_info invalid */
+      if (!brw_try_override_assembly(p, start_offset, sha1buf)) {
+         dump_assembly(p->store, start_offset, p->next_insn_offset,
+                       disasm_info, perf.block_latency);
+      } else {
+         fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
+      }
    }
    ralloc_free(disasm_info);
    assert(validated);
 
    compiler->shader_debug_log(log_data,
                               "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
-                              "%d:%d spills:fills, Promoted %u constants, "
+                              "%d:%d spills:fills, %u sends, "
+                              "scheduled with mode %s, "
+                              "Promoted %u constants, "
                               "compacted %d to %d bytes.",
                               _mesa_shader_stage_to_abbrev(stage),
-                              dispatch_width, before_size / 16,
-                              loop_count, cfg->cycle_count, spill_count,
-                              fill_count, promoted_constants, before_size,
-                              after_size);
+                              dispatch_width, before_size / 16 - nop_count,
+                              loop_count, perf.latency,
+                              spill_count, fill_count, send_count,
+                              shader_stats.scheduler_mode,
+                              shader_stats.promoted_constants,
+                              before_size, after_size);
+   if (stats) {
+      stats->dispatch_width = dispatch_width;
+      stats->instructions = before_size / 16 - nop_count;
+      stats->sends = send_count;
+      stats->loops = loop_count;
+      stats->cycles = perf.latency;
+      stats->spills = spill_count;
+      stats->fills = fill_count;
+   }
 
    return start_offset;
 }
 
+void
+fs_generator::add_const_data(void *data, unsigned size)
+{
+   assert(prog_data->const_data_size == 0);
+   if (size > 0) {
+      prog_data->const_data_size = size;
+      prog_data->const_data_offset = brw_append_data(p, data, size, 32);
+   }
+}
+
 const unsigned *
-fs_generator::get_assembly(unsigned int *assembly_size)
+fs_generator::get_assembly()
 {
-   return brw_get_program(p, assembly_size);
+   return brw_get_program(p, &prog_data->program_size);
 }
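Taken together, the tail of the patch reshapes the generator's public interface: generate_code() now returns this variant's offset into the shared instruction store and fills the optional brw_compile_stats out-parameter, add_const_data() appends shader constants into the same store, and get_assembly() finalizes it while recording the total size in prog_data->program_size rather than through an out-parameter. A hedged caller-side sketch (variable names are ours):

    const int offset = g.generate_code(cfg, dispatch_width,
                                       shader_stats, perf, stats);
    const unsigned *assembly = g.get_assembly();
    /* The code for this variant starts at assembly + offset / 4;
     * prog_data->program_size covers everything appended so far. */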