brw_inst_set_binding_table_index(devinfo, insn, bti);
}
-unsigned
+void
brw_memory_fence(struct brw_codegen *p,
struct brw_reg dst,
struct brw_reg src,
enum opcode send_op,
- bool stall,
+ enum brw_message_target sfid,
+ bool commit_enable,
unsigned bti)
{
const struct gen_device_info *devinfo = p->devinfo;
- const bool commit_enable = stall ||
- devinfo->gen >= 10 || /* HSD ES # 1404612949 */
- (devinfo->gen == 7 && !devinfo->is_haswell);
- struct brw_inst *insn;
- unsigned fences = 0;
-
- brw_push_insn_state(p);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_set_default_exec_size(p, BRW_EXECUTE_1);
dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
/* Set dst as the destination for dependency tracking; the MEMORY_FENCE
* message doesn't write anything back.
*/
- insn = next_insn(p, send_op);
+ struct brw_inst *insn = next_insn(p, send_op);
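+ /* The fence is sent as a single-channel (SIMD1) message with channel
+ * masking disabled.
+ */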
+ brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
+ brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
brw_set_dest(p, insn, dst);
brw_set_src0(p, insn, src);
- brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
- commit_enable, bti);
- fences++;
-
- if (devinfo->gen == 7 && !devinfo->is_haswell) {
- /* IVB does typed surface access through the render cache, so we need to
- * flush it too. Use a different register so both flushes can be
- * pipelined by the hardware.
- */
- insn = next_insn(p, send_op);
- brw_set_dest(p, insn, offset(dst, 1));
- brw_set_src0(p, insn, src);
- brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
- commit_enable, bti);
- fences++;
-
- /* Now write the response of the second message into the response of the
- * first to trigger a pipeline stall -- This way future render and data
- * cache messages will be properly ordered with respect to past data and
- * render cache messages.
- */
- brw_MOV(p, dst, offset(dst, 1));
- }
-
- if (stall) {
- brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_DST,
- brw_get_default_swsb(p).sbid));
-
- brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);
- }
-
- brw_pop_insn_state(p);
-
- return fences;
+ brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
}
void
generate_shader_time_add(inst, src[0], src[1], src[2]);
break;
+ case SHADER_OPCODE_INTERLOCK:
case SHADER_OPCODE_MEMORY_FENCE: {
assert(src[1].file == BRW_IMMEDIATE_VALUE);
assert(src[2].file == BRW_IMMEDIATE_VALUE);
- const unsigned sends =
- brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud,
- src[2].ud);
- send_count += sends;
+
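+ /* The interlock is basically a memory fence issued via SENDC rather
+ * than SEND.
+ */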
+ const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
+ BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
+
+ brw_memory_fence(p, dst, src[0], send_op,
+ brw_message_target(inst->sfid),
+ /* commit_enable */ src[1].ud,
+ /* bti */ src[2].ud);
+ send_count++;
break;
}
break;
- case SHADER_OPCODE_INTERLOCK:
- assert(devinfo->gen >= 9);
- /* The interlock is basically a memory fence issued via sendc */
- brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0);
- break;
-
case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
const struct brw_reg mask =
brw_stage_has_packed_dispatch(devinfo, stage,
case nir_intrinsic_memory_barrier_shared:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
- case nir_intrinsic_memory_barrier: {
+ case nir_intrinsic_memory_barrier:
+ case nir_intrinsic_begin_invocation_interlock:
+ case nir_intrinsic_end_invocation_interlock: {
bool l3_fence, slm_fence;
- if (instr->intrinsic == nir_intrinsic_scoped_memory_barrier) {
+ const enum opcode opcode =
+ instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
+ SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_scoped_memory_barrier: {
nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
l3_fence = modes & (nir_var_shader_out |
nir_var_mem_ssbo |
nir_var_mem_global);
slm_fence = modes & nir_var_mem_shared;
- } else {
+ break;
+ }
+
+ case nir_intrinsic_begin_invocation_interlock:
+ case nir_intrinsic_end_invocation_interlock:
+ /* For beginInvocationInterlockARB(), we will generate a memory fence
+ * but with a different opcode so that the generator can pick SENDC
+ * instead of SEND.
+ *
+ * For endInvocationInterlockARB(), we need to insert a memory fence
+ * which stalls in the shader until the memory transactions prior to
+ * that fence are complete. This ensures that the shader does not end
+ * before any writes from its critical section have landed. Otherwise,
+ * you can end up with a case where the next invocation on that pixel
+ * properly stalls for the previous FS invocation on its pixel to
+ * complete but doesn't actually wait for the dataport memory
+ * transactions from that thread to land before submitting its own.
+ *
+ * Handling both of them here allows the IVB render-cache logic (see
+ * below) to be reused.
+ */
+ l3_fence = true;
+ slm_fence = false;
+ break;
+
+ default:
l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier_shared;
+ break;
}
if (stage != MESA_SHADER_COMPUTE)
l3_fence = true;
}
+ /* IVB does typed surface access through the render cache, so we need
+ * to flush it too.
+ */
+ const bool needs_render_fence =
+ devinfo->gen == 7 && !devinfo->is_haswell;
+
/* Be conservative on Gen11+ and always stall in a fence, since there
* are two different fences and the shader might want to synchronize
* between them.
*
* TODO: When emitting more than one fence, it might help to emit all
* the fences first and then generate the stall moves.
*/
- const bool stall = devinfo->gen >= 11;
+ const bool stall = devinfo->gen >= 11 || needs_render_fence ||
+ instr->intrinsic == nir_intrinsic_end_invocation_interlock;
+
+ const bool commit_enable = stall ||
+ devinfo->gen >= 10; /* HSD ES # 1404612949 */
const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
if (l3_fence) {
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(stall),
- /* bti */ brw_imm_ud(0))
- ->size_written = 2 * REG_SIZE;
+ fs_inst *fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(/* bti */ 0));
+ fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+ if (needs_render_fence) {
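+ /* Use a separate destination register so that both flushes can be
+ * pipelined by the hardware.
+ */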
+ fs_inst *render_fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(/* bti */ 0));
+ render_fence->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+
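+ /* Wait on the responses of both fences so that future render and
+ * data cache messages are properly ordered with respect to past data
+ * and render cache messages.
+ */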
+ ubld.exec_all().group(1, 0).emit(
+ FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+ fence->dst, render_fence->dst);
+ } else if (stall) {
+ ubld.exec_all().group(1, 0).emit(
+ FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+ fence->dst);
+ }
}
if (slm_fence) {
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(stall),
- brw_imm_ud(GEN7_BTI_SLM))
- ->size_written = 2 * REG_SIZE;
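+ /* Interlocks only ever fence the data cache; slm_fence is never set
+ * for them above.
+ */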
+ assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
+ fs_inst *fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(GEN7_BTI_SLM));
+ fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+ if (stall) {
+ ubld.exec_all().group(1, 0).emit(
+ FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+ fence->dst);
+ }
}
if (!l3_fence && !slm_fence)
break;
}
- case nir_intrinsic_begin_invocation_interlock: {
- const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-
- ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0))
- ->size_written = 2 * REG_SIZE;
- break;
- }
-
- case nir_intrinsic_end_invocation_interlock: {
- /* For endInvocationInterlock(), we need to insert a memory fence which
- * stalls in the shader until the memory transactions prior to that
- * fence are complete. This ensures that the shader does not end before
- * any writes from its critical section have landed. Otherwise, you can
- * end up with a case where the next invocation on that pixel properly
- * stalls for previous FS invocation on its pixel to complete but
- * doesn't actually wait for the dataport memory transactions from that
- * thread to land before submitting its own.
- */
- const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0))
- ->size_written = 2 * REG_SIZE;
- break;
- }
-
default:
unreachable("unknown intrinsic");
}