fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
struct brw_stage_prog_data *prog_data,
- struct shader_stats shader_stats,
bool runtime_check_aads_emit,
gl_shader_stage stage)
: compiler(compiler), log_data(log_data),
devinfo(compiler->devinfo),
prog_data(prog_data),
- shader_stats(shader_stats),
runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
stage(stage), mem_ctx(mem_ctx)
{
bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
- if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
+ if (this->discard_halt_patches.is_empty())
return false;
int scale = brw_jump_scale(p->devinfo);
- /* There is a somewhat strange undocumented requirement of using
- * HALT, according to the simulator. If some channel has HALTed to
- * a particular UIP, then by the end of the program, every channel
- * must have HALTed to that UIP. Furthermore, the tracking is a
- * stack, so you can't do the final halt of a UIP after starting
- * halting to a new UIP.
- *
- * Symptoms of not emitting this instruction on actual hardware
- * included GPU hangs and sparkly rendering on the piglit discard
- * tests.
- */
- brw_inst *last_halt = gen6_HALT(p);
- brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
- brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+ if (devinfo->gen >= 6) {
+ /* There is a somewhat strange undocumented requirement of using
+ * HALT, according to the simulator. If some channel has HALTed to
+ * a particular UIP, then by the end of the program, every channel
+ * must have HALTed to that UIP. Furthermore, the tracking is a
+ * stack, so you can't do the final halt of a UIP after starting
+ * halting to a new UIP.
+ *
+ * Symptoms of not emitting this instruction on actual hardware
+ * included GPU hangs and sparkly rendering on the piglit discard
+ * tests.
+ */
+ brw_inst *last_halt = brw_HALT(p);
+ brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
+ brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+ }
int ip = p->nr_insn;
foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
brw_inst *patch = &p->store[patch_ip->ip];
assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
- /* HALT takes a half-instruction distance from the pre-incremented IP. */
- brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+ if (devinfo->gen >= 6) {
+ /* HALT takes a half-instruction distance from the pre-incremented IP. */
+ brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+ } else {
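+ /* Pre-Gen6 HALT has no UIP/JIP fields; the jump distance is encoded
+ * as an immediate in src1 instead.
+ */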
+ brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale));
+ }
}
this->discard_halt_patches.make_empty();
+
+ if (devinfo->gen < 6) {
+ /* From the g965 PRM:
+ *
+ * "As DMask is not automatically reloaded into AMask upon completion
+ * of this instruction, software has to manually restore AMask upon
+ * completion."
+ *
+ * DMask lives in the bottom 16 bits of sr0.1.
+ */
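+ /* The restore is done with a scalar NoMask MOV; the thread switch is
+ * presumably needed for the mask-register write to take effect before
+ * subsequent instructions execute.
+ */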
+ brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK),
+ retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW));
+ brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1);
+ brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE);
+ brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE);
+ brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH);
+ }
+
+ if (devinfo->gen == 4 && !devinfo->is_g4x) {
+ /* From the g965 PRM:
+ *
+ * "[DevBW, DevCL] Erratum: The subfields in mask stack register are
+ * reset to zero during graphics reset, however, they are not
+ * initialized at thread dispatch. These subfields will retain the
+ * values from the previous thread. Software should make sure the
+ * mask stack is empty (reset to zero) before terminating the thread.
+ * In case that this is not practical, software may have to reset the
+ * mask stack at the beginning of each kernel, which will impact the
+ * performance."
+ *
+ * Luckily we can rely on:
+ *
+ * "[DevBW, DevCL] This register access restriction is not
+ * applicable, hardware does ensure execution pipeline coherency,
+ * when a mask stack register is used as an explicit source and/or
+ * destination."
+ */
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+
+ brw_set_default_exec_size(p, BRW_EXECUTE_2);
+ brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0));
+
+ brw_set_default_exec_size(p, BRW_EXECUTE_16);
+ /* Reset the if stack. */
+ brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW),
+ brw_imm_uw(0));
+
+ brw_pop_insn_state(p);
+ }
+
return true;
}
/* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
struct brw_reg addr = vec8(brw_address_reg(0));
+ /* Whether we can use destination dependency control without running the
+ * risk of a hang if an instruction gets shot down.
+ */
+ const bool use_dep_ctrl = !inst->predicate &&
+ inst->exec_size == dispatch_width;
+ brw_inst *insn;
+
/* The destination stride of an instruction (in bytes) must be greater
* than or equal to the size of the rest of the instruction. Since the
* address register is of type UW, we can't use a D-type instruction.
* In the end, while base_offset is nice to look at in the generated
* code, using it saves us 0 instructions and would require quite a bit
* of case-by-case work. It's just not worth it.
+ *
+ * Due to a hardware bug some platforms (particularly Gen11+) seem to
+ * require the address components of all channels to be valid whether or
+ * not they're active, which causes issues if we use VxH addressing
+ * under non-uniform control-flow. We can easily work around that by
+ * initializing the whole address register with a pipelined NoMask MOV
+ * instruction.
*/
- brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
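+ /* On Gen7-11, when use_dep_ctrl allows it, the initializing MOV and
+ * the ADD below are paired via destination dependency control
+ * (NoDDClr on the MOV, NoDDChk on the ADD) so the back-to-back writes
+ * to the address register don't stall; on Gen12+ the same ordering is
+ * expressed with SWSB annotations instead.
+ */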
+ if (devinfo->gen >= 7) {
+ insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset));
+ brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
+ brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
+ if (devinfo->gen >= 12)
+ brw_set_default_swsb(p, tgl_swsb_null());
+ else
+ brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
+ }
+
+ insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
+ if (devinfo->gen >= 12)
+ brw_set_default_swsb(p, tgl_swsb_regdist(1));
+ else if (devinfo->gen >= 7)
+ brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);
if (type_sz(reg.type) > 4 &&
((devinfo->gen == 7 && !devinfo->is_haswell) ||
devinfo->is_cherryview || gen_device_info_is_9lp(devinfo) ||
- !devinfo->has_64bit_types)) {
+ !devinfo->has_64bit_float)) {
/* IVB has an issue (which we found empirically) where it reads two
* address register components per channel for indirectly addressed
* 64-bit sources.
*/
brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
+ brw_set_default_swsb(p, tgl_swsb_null());
brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
} else {
/* Take into account the component size and horizontal stride. */
assert(src.vstride == src.hstride + src.width);
brw_SHL(p, addr, group_idx,
- brw_imm_uw(_mesa_logbase2(type_sz(src.type)) +
+ brw_imm_uw(util_logbase2(type_sz(src.type)) +
src.hstride - 1));
/* Add on the register start offset */
+ brw_set_default_swsb(p, tgl_swsb_regdist(1));
brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr));
if (type_sz(src.type) > 4 &&
assert(dst.hstride == 1);
brw_MOV(p, dst_d,
retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
+ brw_set_default_swsb(p, tgl_swsb_null());
brw_MOV(p, byte_offset(dst_d, 4),
retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
} else {
retype(brw_VxH_indirect(0, 0), src.type));
}
}
+
+ brw_set_default_swsb(p, tgl_swsb_null());
}
}
brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
brw_inst_set_no_dd_check(devinfo, insn, c > 0);
}
+
+ brw_set_default_swsb(p, tgl_swsb_null());
}
break;
brw_inst_set_header_present(devinfo, insn, false);
brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
- brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
- /* Note that even though the thread has a URB resource associated with it,
- * we set the "do not dereference URB" bit, because the URB resource is
- * managed by the fixed-function unit, so it will free it automatically.
- */
- brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
+ if (devinfo->gen < 11) {
+ brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
+
+ /* Note that even though the thread has a URB resource associated with it,
+ * we set the "do not dereference URB" bit, because the URB resource is
+ * managed by the fixed-function unit, so it will free it automatically.
+ */
+ brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
+ }
brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}
fs_generator::generate_barrier(fs_inst *, struct brw_reg src)
{
brw_barrier(p, src);
- brw_WAIT(p);
+ if (devinfo->gen >= 12) {
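+ /* Gen12 dropped the notification register that WAIT polls;
+ * SYNC.BAR stalls until the barrier notification arrives instead.
+ */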
+ brw_set_default_swsb(p, tgl_swsb_null());
+ brw_SYNC(p, TGL_SYNC_BAR);
+ } else {
+ brw_WAIT(p);
+ }
}
bool
/* Set up an implied move from g0 to the MRF. */
src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
} else {
+ const tgl_swsb swsb = brw_get_default_swsb(p);
assert(inst->base_mrf != -1);
struct brw_reg header_reg = brw_message_reg(inst->base_mrf);
brw_push_insn_state(p);
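+ /* Splitting the send into header setup plus message means the
+ * original instruction's SWSB annotation has to be split too:
+ * source-side dependencies go on the first header MOV, and the
+ * destination-side ones (plus a regdist on the just-written header)
+ * go on the final send.
+ */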
+ brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
brw_set_default_exec_size(p, BRW_EXECUTE_8);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
/* Explicitly set up the message header by copying g0 to the MRF. */
brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
+ brw_set_default_swsb(p, tgl_swsb_regdist(1));
brw_set_default_exec_size(p, BRW_EXECUTE_1);
if (inst->offset) {
}
brw_pop_insn_state(p);
+ brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
}
}
brw_ADD(p, byte_offset(dst, g * type_size),
negate(byte_offset(src, g * type_size)),
byte_offset(src, (g + 2) * type_size));
+ brw_set_default_swsb(p, tgl_swsb_null());
}
brw_pop_insn_state(p);
} else {
void
fs_generator::generate_discard_jump(fs_inst *)
{
- assert(devinfo->gen >= 6);
-
/* This HALT will be patched up at FB write time to point UIP at the end of
* the program, and at brw_uip_jip() JIP will be set to the end of the
* current block (or the program).
*/
this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
- gen6_HALT(p);
+ brw_HALT(p);
}
void
const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
MIN2(16, inst->exec_size);
const unsigned block_size = 4 * lower_size / REG_SIZE;
+ const tgl_swsb swsb = brw_get_default_swsb(p);
assert(inst->mlen != 0);
brw_push_insn_state(p);
for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
brw_set_default_group(p, inst->group + lower_size * i);
+ if (i > 0) {
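+ /* Later iterations overwrite the payload of a message that may
+ * still be in flight, so wait until the previous send has read its
+ * sources before reusing the MRF.
+ */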
+ assert(swsb.mode & TGL_SBID_SET);
+ brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid));
+ } else {
+ brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
+ }
+
brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
+ brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
block_size,
inst->offset + block_size * REG_SIZE * i);
BRW_DATAPORT_READ_TARGET_DATA_CACHE));
} else {
+ const tgl_swsb swsb = brw_get_default_swsb(p);
struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
brw_push_insn_state(p);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
/* a0.0 = surf_index & 0xff */
+ brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
brw_set_dest(p, insn_and, addr);
brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
/* dst = send(payload, a0.0 | <descriptor>) */
+ brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
brw_send_indirect_message(
p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
retype(dst, BRW_REGISTER_TYPE_UD),
brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1);
brw_inst_set_group(devinfo, insn, inst->group + lower_size * i);
brw_inst_set_compression(devinfo, insn, lower_size > 8);
+ brw_set_default_swsb(p, tgl_swsb_null());
}
}
/* Now the form:
* 0xhhhh0000
*/
+ brw_set_default_swsb(p, tgl_swsb_regdist(1));
brw_SHL(p, dst, dst, brw_imm_ud(16u));
/* And, finally the form of packHalf2x16's output:
struct brw_reg offset,
struct brw_reg value)
{
+ const tgl_swsb swsb = brw_get_default_swsb(p);
+
assert(devinfo->gen >= 7);
brw_push_insn_state(p);
brw_set_default_mask_control(p, true);
+ brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
assert(payload.file == BRW_GENERAL_REGISTER_FILE);
struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
* out of this path, so we just emit the MOVs from here.
*/
brw_MOV(p, payload_offset, offset);
+ brw_set_default_swsb(p, tgl_swsb_null());
brw_MOV(p, payload_value, value);
+ brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
brw_shader_time_add(p, payload,
prog_data->binding_table.shader_time_start);
brw_pop_insn_state(p);
int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
+ struct shader_stats shader_stats,
+ const brw::performance &perf,
struct brw_compile_stats *stats)
{
/* align to 64 byte boundary. */
while (p->next_insn_offset % 64)
brw_NOP(p);
this->dispatch_width = dispatch_width;
int start_offset = p->next_insn_offset;
+
+ /* `send_count` explicitly does not include spills or fills, as we'd
+ * like to use it as a metric for intentional memory access or other
+ * shared function use. Otherwise, subtle changes to scheduling or
+ * register allocation could cause it to fluctuate wildly - and that
+ * effect is already counted in spill/fill counts.
+ */
int spill_count = 0, fill_count = 0;
- int loop_count = 0;
+ int loop_count = 0, send_count = 0, nop_count = 0;
+ bool is_accum_used = false;
struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
inst->dst.component_size(inst->exec_size) > REG_SIZE) {
brw_NOP(p);
last_insn_offset = p->next_insn_offset;
+
+ /* In order to avoid spurious instruction count differences when the
+ * instruction schedule changes, keep track of the number of inserted
+ * NOPs.
+ */
+ nop_count++;
+ }
+
+ /* GEN:BUG:14010017096:
+ *
+ * Clear accumulator register before end of thread.
+ */
+ if (inst->eot && is_accum_used && devinfo->gen >= 12) {
+ brw_set_default_exec_size(p, BRW_EXECUTE_16);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f));
+ last_insn_offset = p->next_insn_offset;
+ }
+
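+ /* Nothing executes after the end of thread, so stop tracking
+ * accumulator use at EOT.
+ */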
+ if (!is_accum_used && !inst->eot) {
+ is_accum_used = inst->writes_accumulator_implicitly(devinfo) ||
+ inst->dst.is_accumulator();
}
if (unlikely(debug_flag))
brw_set_default_saturate(p, inst->saturate);
brw_set_default_mask_control(p, inst->force_writemask_all);
brw_set_default_acc_write_control(p, inst->writes_accumulator);
+ brw_set_default_swsb(p, inst->sched);
unsigned exec_size = inst->exec_size;
if (devinfo->gen == 7 && !devinfo->is_haswell &&
brw_math_function(inst->opcode),
inst->base_mrf, src[0],
BRW_MATH_PRECISION_FULL);
+ send_count++;
}
break;
case SHADER_OPCODE_INT_QUOTIENT:
gen4_math(p, dst, brw_math_function(inst->opcode),
inst->base_mrf, src[0],
BRW_MATH_PRECISION_FULL);
+ send_count++;
}
break;
case FS_OPCODE_LINTERP:
case SHADER_OPCODE_SEND:
generate_send(inst, dst, src[0], src[1], src[2],
inst->ex_mlen > 0 ? src[3] : brw_null_reg());
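+ /* Sends targeting the stateless BTIs are scratch accesses: if the
+ * message writes the destination it's a fill, otherwise a spill.
+ * Everything else counts toward send_count (see the comment above
+ * its declaration).
+ */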
+ if ((inst->desc & 0xff) == BRW_BTI_STATELESS ||
+ (inst->desc & 0xff) == GEN8_BTI_STATELESS_NON_COHERENT) {
+ if (inst->size_written)
+ fill_count++;
+ else
+ spill_count++;
+ } else {
+ send_count++;
+ }
break;
case SHADER_OPCODE_GET_BUFFER_SIZE:
generate_get_buffer_size(inst, dst, src[0], src[1]);
+ send_count++;
break;
case SHADER_OPCODE_TEX:
case FS_OPCODE_TXB:
case SHADER_OPCODE_SAMPLEINFO:
assert(inst->src[0].file == BAD_FILE);
generate_tex(inst, dst, src[1], src[2]);
+ send_count++;
break;
case FS_OPCODE_DDX_COARSE:
case SHADER_OPCODE_URB_READ_SIMD8:
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
generate_urb_read(inst, dst, src[0]);
+ send_count++;
break;
case SHADER_OPCODE_URB_WRITE_SIMD8:
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
generate_urb_write(inst, src[0]);
+ send_count++;
break;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
assert(inst->force_writemask_all);
generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
+ send_count++;
break;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
assert(inst->force_writemask_all);
generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+ send_count++;
break;
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
+ send_count++;
break;
case FS_OPCODE_REP_FB_WRITE:
case FS_OPCODE_FB_WRITE:
generate_fb_write(inst, src[0]);
+ send_count++;
break;
case FS_OPCODE_FB_READ:
generate_fb_read(inst, dst, src[0]);
+ send_count++;
break;
case FS_OPCODE_DISCARD_JUMP:
generate_shader_time_add(inst, src[0], src[1], src[2]);
break;
- case SHADER_OPCODE_MEMORY_FENCE:
+ case SHADER_OPCODE_INTERLOCK:
+ case SHADER_OPCODE_MEMORY_FENCE: {
assert(src[1].file == BRW_IMMEDIATE_VALUE);
assert(src[2].file == BRW_IMMEDIATE_VALUE);
- brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud, src[2].ud);
+
+ const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
+ BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
+
+ brw_memory_fence(p, dst, src[0], send_op,
+ brw_message_target(inst->sfid),
+ /* commit_enable */ src[1].ud,
+ /* bti */ src[2].ud);
+ send_count++;
break;
+ }
+
+ case FS_OPCODE_SCHEDULING_FENCE:
+ if (inst->sources == 0 && inst->sched.regdist == 0 &&
+ inst->sched.mode == TGL_SBID_NULL) {
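+ /* A fence with no sources and no SWSB annotation has nothing left
+ * to order and can be dropped entirely; keep only its disassembly
+ * annotation when debugging.
+ */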
+ if (unlikely(debug_flag))
+ disasm_info->use_tail = true;
+ break;
+ }
+
+ if (devinfo->gen >= 12) {
+ /* Use the available SWSB information to stall. A single SYNC is
+ * sufficient since if there were multiple dependencies, the
+ * scoreboard algorithm already injected other SYNCs before this
+ * instruction.
+ */
+ brw_SYNC(p, TGL_SYNC_NOP);
+ } else {
+ for (unsigned i = 0; i < inst->sources; i++) {
+ /* Emit a MOV to force a stall until the instruction producing the
+ * registers finishes.
+ */
+ brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
+ retype(src[i], BRW_REGISTER_TYPE_UW));
+ }
+
+ if (inst->sources > 1)
+ multiple_instructions_emitted = true;
+ }
- case SHADER_OPCODE_INTERLOCK:
- assert(devinfo->gen >= 9);
- /* The interlock is basically a memory fence issued via sendc */
- brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0);
break;
case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
brw_find_live_channel(p, dst, mask);
break;
}
-
+ case FS_OPCODE_LOAD_LIVE_CHANNELS: {
+ assert(devinfo->gen >= 8);
+ assert(inst->force_writemask_all && inst->group == 0);
+ assert(inst->dst.file == BAD_FILE);
+ brw_set_default_exec_size(p, BRW_EXECUTE_1);
+ brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg),
+ BRW_REGISTER_TYPE_UD),
+ retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));
+ break;
+ }
case SHADER_OPCODE_BROADCAST:
assert(inst->force_writemask_all);
brw_broadcast(p, dst, src[0], src[1]);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
brw_MOV(p, dst, src[1]);
brw_set_default_mask_control(p, BRW_MASK_ENABLE);
+ brw_set_default_swsb(p, tgl_swsb_null());
brw_MOV(p, dst, src[0]);
break;
assert(src[0].type == dst.type);
brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
subscript(strided, BRW_REGISTER_TYPE_D, 0));
+ brw_set_default_swsb(p, tgl_swsb_null());
brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
subscript(strided, BRW_REGISTER_TYPE_D, 1));
} else {
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
generate_pixel_interpolator_query(inst, dst, src[0], src[1],
GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
+ send_count++;
break;
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
generate_pixel_interpolator_query(inst, dst, src[0], src[1],
GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
+ send_count++;
break;
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
generate_pixel_interpolator_query(inst, dst, src[0], src[1],
GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
+ send_count++;
break;
case CS_OPCODE_CS_TERMINATE:
generate_cs_terminate(inst, src[0]);
+ send_count++;
break;
case SHADER_OPCODE_BARRIER:
generate_barrier(inst, src[0]);
+ send_count++;
break;
case BRW_OPCODE_DIM:
fprintf(stderr, "Native code for %s (sha1 %s)\n"
"SIMD%d shader: %d instructions. %d loops. %u cycles. "
- "%d:%d spills:fills. "
+ "%d:%d spills:fills, %u sends, "
"scheduled with mode %s. "
"Promoted %u constants. "
"Compacted %d to %d bytes (%.0f%%)\n",
shader_name, sha1buf,
dispatch_width, before_size / 16,
- loop_count, cfg->cycle_count,
- spill_count, fill_count,
+ loop_count, perf.latency,
+ spill_count, fill_count, send_count,
shader_stats.scheduler_mode,
shader_stats.promoted_constants,
before_size, after_size,
/* overriding the shader makes disasm_info invalid */
if (!brw_try_override_assembly(p, start_offset, sha1buf)) {
- dump_assembly(p->store, disasm_info);
+ dump_assembly(p->store, disasm_info, perf.block_latency);
} else {
fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
}
compiler->shader_debug_log(log_data,
"%s SIMD%d shader: %d inst, %d loops, %u cycles, "
- "%d:%d spills:fills, "
+ "%d:%d spills:fills, %u sends, "
"scheduled with mode %s, "
"Promoted %u constants, "
"compacted %d to %d bytes.",
_mesa_shader_stage_to_abbrev(stage),
- dispatch_width, before_size / 16,
- loop_count, cfg->cycle_count,
- spill_count, fill_count,
+ dispatch_width, before_size / 16 - nop_count,
+ loop_count, perf.latency,
+ spill_count, fill_count, send_count,
shader_stats.scheduler_mode,
shader_stats.promoted_constants,
before_size, after_size);
if (stats) {
stats->dispatch_width = dispatch_width;
- stats->instructions = before_size / 16;
+ stats->instructions = before_size / 16 - nop_count;
+ stats->sends = send_count;
stats->loops = loop_count;
- stats->cycles = cfg->cycle_count;
+ stats->cycles = perf.latency;
stats->spills = spill_count;
stats->fills = fill_count;
}