desc, desc_imm, ex_desc, ex_desc_imm,
inst->eot);
if (inst->check_tdr)
- brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDSC);
+ brw_inst_set_opcode(p->devinfo, brw_last_inst,
+ devinfo->gen >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
} else {
brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm,
inst->eot);
* of case-by-case work. It's just not worth it.
*/
brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
+ brw_set_default_swsb(p, tgl_swsb_regdist(1));
if (type_sz(reg.type) > 4 &&
((devinfo->gen == 7 && !devinfo->is_haswell) ||
*/
brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
+ brw_set_default_swsb(p, tgl_swsb_null());
brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
} else {
src.hstride - 1));
/* Add on the register start offset */
+ brw_set_default_swsb(p, tgl_swsb_regdist(1));
brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr));
if (type_sz(src.type) > 4 &&
assert(dst.hstride == 1);
brw_MOV(p, dst_d,
retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
+ brw_set_default_swsb(p, tgl_swsb_null());
brw_MOV(p, byte_offset(dst_d, 4),
retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
} else {
retype(brw_VxH_indirect(0, 0), src.type));
}
}
+
+ brw_set_default_swsb(p, tgl_swsb_null());
}
}
4 * inst->dst.stride, 1, 4 * inst->dst.stride),
stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
- brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
- brw_inst_set_no_dd_check(devinfo, insn, c > 0);
+ if (devinfo->gen < 12) {
+ brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
+ brw_inst_set_no_dd_check(devinfo, insn, c > 0);
+ }
+
+ brw_set_default_swsb(p, tgl_swsb_null());
}
break;
brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
brw_set_src0(p, send, header);
- brw_set_src1(p, send, brw_imm_ud(0u));
+ if (devinfo->gen < 12)
+ brw_set_src1(p, send, brw_imm_ud(0u));
brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);
brw_set_dest(p, insn, brw_null_reg());
brw_set_src0(p, insn, payload);
- brw_set_src1(p, insn, brw_imm_ud(0u));
+ if (devinfo->gen < 12)
+ brw_set_src1(p, insn, brw_imm_ud(0u));
brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);
brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
- brw_set_src1(p, insn, brw_imm_ud(0u));
+ if (devinfo->gen < 12)
+ brw_set_src1(p, insn, brw_imm_ud(0u));
/* Terminate a compute shader by sending a message to the thread spawner.
*/
brw_inst_set_header_present(devinfo, insn, false);
brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
- brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
- /* Note that even though the thread has a URB resource associated with it,
- * we set the "do not dereference URB" bit, because the URB resource is
- * managed by the fixed-function unit, so it will free it automatically.
- */
- brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
+ if (devinfo->gen < 11) {
+ brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
+
+ /* Note that even though the thread has a URB resource associated with it,
+ * we set the "do not dereference URB" bit, because the URB resource is
+ * managed by the fixed-function unit, so it will free it automatically.
+ */
+ brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
+ }
brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}
fs_generator::generate_barrier(fs_inst *, struct brw_reg src)
{
brw_barrier(p, src);
- brw_WAIT(p);
+ if (devinfo->gen >= 12) { /* Follow-up to brw_barrier() now depends on the hardware generation. */
+ brw_set_default_swsb(p, tgl_swsb_null()); /* Reset default SWSB so the SYNC below is emitted with a null annotation. */
+ brw_SYNC(p, TGL_SYNC_BAR); /* Gen12+: SYNC with the BAR function is emitted in place of WAIT. */
+ } else {
+ brw_WAIT(p); /* Earlier generations keep the WAIT that used to be unconditional. */
+ }
}
bool
/* Set up an implied move from g0 to the MRF. */
src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
} else {
+ const tgl_swsb swsb = brw_get_default_swsb(p);
assert(inst->base_mrf != -1);
struct brw_reg header_reg = brw_message_reg(inst->base_mrf);
brw_push_insn_state(p);
+ brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
brw_set_default_exec_size(p, BRW_EXECUTE_8);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
/* Explicitly set up the message header by copying g0 to the MRF. */
brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
+ brw_set_default_swsb(p, tgl_swsb_regdist(1));
brw_set_default_exec_size(p, BRW_EXECUTE_1);
if (inst->offset) {
}
brw_pop_insn_state(p);
+ brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
}
}
brw_ADD(p, byte_offset(dst, g * type_size),
negate(byte_offset(src, g * type_size)),
byte_offset(src, (g + 2) * type_size));
+ brw_set_default_swsb(p, tgl_swsb_null());
}
brw_pop_insn_state(p);
} else {
const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
MIN2(16, inst->exec_size);
const unsigned block_size = 4 * lower_size / REG_SIZE;
+ const tgl_swsb swsb = brw_get_default_swsb(p);
assert(inst->mlen != 0);
brw_push_insn_state(p);
for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
brw_set_default_group(p, inst->group + lower_size * i);
+ if (i > 0) {
+ brw_set_default_swsb(p, tgl_swsb_null());
+ brw_SYNC(p, TGL_SYNC_ALLRD);
+ } else {
+ brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
+ }
+
brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
+ if (i + 1 < inst->exec_size / lower_size)
+ brw_set_default_swsb(p, tgl_swsb_regdist(1));
+ else
+ brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
+
brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
block_size,
inst->offset + block_size * REG_SIZE * i);
BRW_DATAPORT_READ_TARGET_DATA_CACHE));
} else {
+ const tgl_swsb swsb = brw_get_default_swsb(p);
struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
brw_push_insn_state(p);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
/* a0.0 = surf_index & 0xff */
+ brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
brw_set_dest(p, insn_and, addr);
brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
/* dst = send(payload, a0.0 | <descriptor>) */
+ brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
brw_send_indirect_message(
p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
retype(dst, BRW_REGISTER_TYPE_UD),
brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1);
brw_inst_set_group(devinfo, insn, inst->group + lower_size * i);
brw_inst_set_compression(devinfo, insn, lower_size > 8);
+ brw_set_default_swsb(p, tgl_swsb_null());
}
}
/* Now the form:
* 0xhhhh0000
*/
+ brw_set_default_swsb(p, tgl_swsb_regdist(1));
brw_SHL(p, dst, dst, brw_imm_ud(16u));
/* And, finally the form of packHalf2x16's output:
struct brw_reg offset,
struct brw_reg value)
{
+ const tgl_swsb swsb = brw_get_default_swsb(p);
+
assert(devinfo->gen >= 7);
brw_push_insn_state(p);
brw_set_default_mask_control(p, true);
+ brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
assert(payload.file == BRW_GENERAL_REGISTER_FILE);
struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
* out of this path, so we just emit the MOVs from here.
*/
brw_MOV(p, payload_offset, offset);
+ brw_set_default_swsb(p, tgl_swsb_null());
brw_MOV(p, payload_value, value);
+ brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
brw_shader_time_add(p, payload,
prog_data->binding_table.shader_time_start);
brw_pop_insn_state(p);
this->dispatch_width = dispatch_width;
int start_offset = p->next_insn_offset;
+
+ /* `send_count` explicitly does not include spills or fills, as we'd
+ * like to use it as a metric for intentional memory access or other
+ * shared function use. Otherwise, subtle changes to scheduling or
+ * register allocation could cause it to fluctuate wildly - and that
+ * effect is already counted in spill/fill counts.
+ */
int spill_count = 0, fill_count = 0;
- int loop_count = 0;
+ int loop_count = 0, send_count = 0;
struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
brw_set_default_saturate(p, inst->saturate);
brw_set_default_mask_control(p, inst->force_writemask_all);
brw_set_default_acc_write_control(p, inst->writes_accumulator);
+ brw_set_default_swsb(p, inst->sched);
unsigned exec_size = inst->exec_size;
if (devinfo->gen == 7 && !devinfo->is_haswell &&
assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
switch (inst->opcode) {
+ case BRW_OPCODE_SYNC:
+ assert(src[0].file == BRW_IMMEDIATE_VALUE);
+ brw_SYNC(p, tgl_sync_function(src[0].ud));
+ break;
case BRW_OPCODE_MOV:
brw_MOV(p, dst, src[0]);
break;
brw_math_function(inst->opcode),
inst->base_mrf, src[0],
BRW_MATH_PRECISION_FULL);
+ send_count++;
}
break;
case SHADER_OPCODE_INT_QUOTIENT:
gen4_math(p, dst, brw_math_function(inst->opcode),
inst->base_mrf, src[0],
BRW_MATH_PRECISION_FULL);
+ send_count++;
}
break;
case FS_OPCODE_LINTERP:
case SHADER_OPCODE_SEND:
generate_send(inst, dst, src[0], src[1], src[2],
inst->ex_mlen > 0 ? src[3] : brw_null_reg());
+ if ((inst->desc & 0xff) == BRW_BTI_STATELESS ||
+ (inst->desc & 0xff) == GEN8_BTI_STATELESS_NON_COHERENT) {
+ if (inst->size_written)
+ fill_count++;
+ else
+ spill_count++;
+ } else {
+ send_count++;
+ }
break;
case SHADER_OPCODE_GET_BUFFER_SIZE:
generate_get_buffer_size(inst, dst, src[0], src[1]);
+ send_count++;
break;
case SHADER_OPCODE_TEX:
case FS_OPCODE_TXB:
case SHADER_OPCODE_SAMPLEINFO:
assert(inst->src[0].file == BAD_FILE);
generate_tex(inst, dst, src[1], src[2]);
+ send_count++;
break;
case FS_OPCODE_DDX_COARSE:
case SHADER_OPCODE_URB_READ_SIMD8:
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
generate_urb_read(inst, dst, src[0]);
+ send_count++;
break;
case SHADER_OPCODE_URB_WRITE_SIMD8:
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
generate_urb_write(inst, src[0]);
+ send_count++;
break;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
assert(inst->force_writemask_all);
generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
+ send_count++;
break;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
assert(inst->force_writemask_all);
generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+ send_count++;
break;
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
+ send_count++;
break;
case FS_OPCODE_REP_FB_WRITE:
case FS_OPCODE_FB_WRITE:
generate_fb_write(inst, src[0]);
+ send_count++;
break;
case FS_OPCODE_FB_READ:
generate_fb_read(inst, dst, src[0]);
+ send_count++;
break;
case FS_OPCODE_DISCARD_JUMP:
assert(src[1].file == BRW_IMMEDIATE_VALUE);
assert(src[2].file == BRW_IMMEDIATE_VALUE);
brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud, src[2].ud);
+ send_count++;
+ break;
+
+ case FS_OPCODE_SCHEDULING_FENCE:
+ if (unlikely(debug_flag))
+ disasm_info->use_tail = true;
break;
case SHADER_OPCODE_INTERLOCK:
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
brw_MOV(p, dst, src[1]);
brw_set_default_mask_control(p, BRW_MASK_ENABLE);
+ brw_set_default_swsb(p, tgl_swsb_null());
brw_MOV(p, dst, src[0]);
break;
break;
case SHADER_OPCODE_CLUSTER_BROADCAST: {
- assert(src[0].type == dst.type);
assert(!src[0].negate && !src[0].abs);
assert(src[1].file == BRW_IMMEDIATE_VALUE);
assert(src[1].type == BRW_REGISTER_TYPE_UD);
* indirect here to handle adding 4 bytes to the offset and avoid
* the extra ADD to the register file.
*/
+ assert(src[0].type == dst.type);
brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
subscript(strided, BRW_REGISTER_TYPE_D, 0));
+ brw_set_default_swsb(p, tgl_swsb_null());
brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
subscript(strided, BRW_REGISTER_TYPE_D, 1));
} else {
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
generate_pixel_interpolator_query(inst, dst, src[0], src[1],
GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
+ send_count++;
break;
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
generate_pixel_interpolator_query(inst, dst, src[0], src[1],
GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
+ send_count++;
break;
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
generate_pixel_interpolator_query(inst, dst, src[0], src[1],
GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
+ send_count++;
break;
case CS_OPCODE_CS_TERMINATE:
generate_cs_terminate(inst, src[0]);
+ send_count++;
break;
case SHADER_OPCODE_BARRIER:
generate_barrier(inst, src[0]);
+ send_count++;
break;
case BRW_OPCODE_DIM:
if (inst->conditional_mod)
brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
- brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
- brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
+ if (devinfo->gen < 12) {
+ brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
+ brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
+ }
}
}
fprintf(stderr, "Native code for %s (sha1 %s)\n"
"SIMD%d shader: %d instructions. %d loops. %u cycles. "
- "%d:%d spills:fills. "
+ "%d:%d spills:fills, %u sends, "
"scheduled with mode %s. "
"Promoted %u constants. "
"Compacted %d to %d bytes (%.0f%%)\n",
shader_name, sha1buf,
dispatch_width, before_size / 16,
loop_count, cfg->cycle_count,
- spill_count, fill_count,
+ spill_count, fill_count, send_count,
shader_stats.scheduler_mode,
shader_stats.promoted_constants,
before_size, after_size,
compiler->shader_debug_log(log_data,
"%s SIMD%d shader: %d inst, %d loops, %u cycles, "
- "%d:%d spills:fills, "
+ "%d:%d spills:fills, %u sends, "
"scheduled with mode %s, "
"Promoted %u constants, "
"compacted %d to %d bytes.",
_mesa_shader_stage_to_abbrev(stage),
dispatch_width, before_size / 16,
loop_count, cfg->cycle_count,
- spill_count, fill_count,
+ spill_count, fill_count, send_count,
shader_stats.scheduler_mode,
shader_stats.promoted_constants,
before_size, after_size);