assert(uniforms == prog_data->nr_params);
uint32_t *param;
- if (brw_cs_prog_data(prog_data)->uses_variable_group_size) {
+ if (nir->info.cs.local_size_variable &&
+ compiler->lower_variable_group_size) {
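+ /* The driver is expected to fill these builtin params with the actual
+ * workgroup size at dispatch time, so a variable-group-size shader can
+ * read its size back as push constants.
+ */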
param = brw_stage_prog_data_add_params(prog_data, 3);
for (unsigned i = 0; i < 3; i++) {
param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
static fs_reg
fetch_render_target_array_index(const fs_builder &bld)
{
- if (bld.shader->devinfo->gen >= 6) {
+ if (bld.shader->devinfo->gen >= 12) {
+ /* The render target array index is provided in the thread payload as
+ * bits 26:16 of r1.1.
+ */
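+ /* Reading UW subregister 3 of r1 gives bits 31:16 of that dword, so the
+ * 0x7ff mask below keeps exactly bits 26:16.
+ */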
+ const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3),
+ brw_imm_uw(0x7ff));
+ return idx;
+ } else if (bld.shader->devinfo->gen >= 6) {
/* The render target array index is provided in the thread payload as
* bits 26:16 of r0.0.
*/
* invocations are already executed lock-step. Instead of an actual
* barrier, just emit a scheduling fence, which will generate no code.
*/
- if (!cs_prog_data->uses_variable_group_size &&
+ if (!nir->info.cs.local_size_variable &&
workgroup_size() <= dispatch_width) {
bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
break;
}
case nir_intrinsic_load_local_group_size: {
+ assert(compiler->lower_variable_group_size);
+ assert(nir->info.cs.local_size_variable);
for (unsigned i = 0; i < 3; i++) {
bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
group_size[i]);
}
break;
}
+ case nir_intrinsic_load_simd_width_intel: {
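+ /* The SIMD width is a compile-time constant here, so just materialize an
+ * immediate. The NIR lowering for variable workgroup sizes emits this
+ * intrinsic when the group size is not known at compile time.
+ */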
+ bld.MOV(dest, brw_imm_ud(cs_prog_data->simd_size));
+ break;
+ }
+
default:
nir_emit_intrinsic(bld, instr);
break;
case nir_intrinsic_memory_barrier_shared:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
- case nir_intrinsic_memory_barrier: {
+ case nir_intrinsic_memory_barrier:
+ case nir_intrinsic_begin_invocation_interlock:
+ case nir_intrinsic_end_invocation_interlock: {
bool l3_fence, slm_fence;
- if (instr->intrinsic == nir_intrinsic_scoped_memory_barrier) {
+ const enum opcode opcode =
+ instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
+ SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_scoped_memory_barrier: {
nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
l3_fence = modes & (nir_var_shader_out |
nir_var_mem_ssbo |
nir_var_mem_global);
slm_fence = modes & nir_var_mem_shared;
- } else {
+ break;
+ }
+
+ case nir_intrinsic_begin_invocation_interlock:
+ case nir_intrinsic_end_invocation_interlock:
+ /* For beginInvocationInterlockARB(), we will generate a memory fence
+ * but with a different opcode so that the generator can pick SENDC
+ * instead of SEND.
+ *
+ * For endInvocationInterlockARB(), we need to insert a memory fence which
+ * stalls in the shader until the memory transactions prior to that
+ * fence are complete. This ensures that the shader does not end before
+ * any writes from its critical section have landed. Otherwise, you can
+ * end up with a case where the next invocation on that pixel properly
+ * stalls for the previous FS invocation on its pixel to complete but
+ * doesn't actually wait for the dataport memory transactions from that
+ * thread to land before submitting its own.
+ *
+ * Handling them here allows the logic for the IVB render cache (see
+ * below) to be reused.
+ */
+ l3_fence = true;
+ slm_fence = false;
+ break;
+
+ default:
l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier_shared;
+ break;
}
if (stage != MESA_SHADER_COMPUTE)
*
* TODO: Check if this applies when many HW threads share the same Data Port.
*/
- if (!brw_cs_prog_data(prog_data)->uses_variable_group_size &&
+ if (!nir->info.cs.local_size_variable &&
slm_fence && workgroup_size() <= dispatch_width)
slm_fence = false;
l3_fence = true;
}
+ /* IVB does typed surface access through the render cache, so we need
+ * to flush it too.
+ */
+ const bool needs_render_fence =
+ devinfo->gen == 7 && !devinfo->is_haswell;
+
/* Be conservative in Gen11+ and always stall in a fence, since there
* are two different fences and the shader might want to synchronize
* between them.
*
- * TODO: Improve NIR so that scope and visibility information for the
- * barriers is available here to make a better decision.
- *
- * TODO: When emitting more than one fence, it might help emit all
- * the fences first and then generate the stall moves.
+ * TODO: Use scope and visibility information for the barriers from NIR
+ * to make a better decision on whether we need to stall.
*/
- const bool stall = devinfo->gen >= 11;
+ const bool stall = devinfo->gen >= 11 || needs_render_fence ||
+ instr->intrinsic == nir_intrinsic_end_invocation_interlock;
+
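+ /* With commit enable set, the fence message does not write back its
+ * destination register until the fence has completed, so a later read of
+ * that register stalls until the fence is done.
+ */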
+ const bool commit_enable = stall ||
+ devinfo->gen >= 10; /* HSD ES # 1404612949 */
+
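+ /* Collect the response registers of the fences emitted below; passing
+ * them as sources to the scheduling fence makes the generated code stall
+ * until the fences have completed.
+ */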
+ unsigned fence_regs_count = 0;
+ fs_reg fence_regs[2] = {};
const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
if (l3_fence) {
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(stall),
- /* bti */ brw_imm_ud(0))
- ->size_written = 2 * REG_SIZE;
+ fs_inst *fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(/* bti */ 0));
+ fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+ fence_regs[fence_regs_count++] = fence->dst;
+
+ if (needs_render_fence) {
+ fs_inst *render_fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(/* bti */ 0));
+ render_fence->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+
+ fence_regs[fence_regs_count++] = render_fence->dst;
+ }
}
if (slm_fence) {
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(stall),
- brw_imm_ud(GEN7_BTI_SLM))
- ->size_written = 2 * REG_SIZE;
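+ /* The interlock intrinsics never target SLM (they force slm_fence to
+ * false above), so only a plain memory fence can reach this point.
+ */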
+ assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
+ fs_inst *fence =
+ ubld.emit(opcode,
+ ubld.vgrf(BRW_REGISTER_TYPE_UD),
+ brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(GEN7_BTI_SLM));
+ fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+ fence_regs[fence_regs_count++] = fence->dst;
}
- if (!l3_fence && !slm_fence)
- ubld.emit(FS_OPCODE_SCHEDULING_FENCE);
+ assert(fence_regs_count <= 2);
+
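+ /* Stall on the fence responses collected above if needed. When no
+ * hardware fence was emitted at all, still emit the scheduling fence so
+ * the scheduler does not move memory accesses across the barrier; it
+ * generates no code in that case.
+ */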
+ if (stall || fence_regs_count == 0) {
+ ubld.exec_all().group(1, 0).emit(
+ FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+ fence_regs, fence_regs_count);
+ }
break;
}
break;
}
- case nir_intrinsic_begin_invocation_interlock: {
- const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-
- ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0))
- ->size_written = 2 * REG_SIZE;
- break;
- }
-
- case nir_intrinsic_end_invocation_interlock: {
- /* For endInvocationInterlock(), we need to insert a memory fence which
- * stalls in the shader until the memory transactions prior to that
- * fence are complete. This ensures that the shader does not end before
- * any writes from its critical section have landed. Otherwise, you can
- * end up with a case where the next invocation on that pixel properly
- * stalls for previous FS invocation on its pixel to complete but
- * doesn't actually wait for the dataport memory transactions from that
- * thread to land before submitting its own.
- */
- const fs_builder ubld = bld.group(8, 0);
- const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
- brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0))
- ->size_written = 2 * REG_SIZE;
- break;
- }
-
default:
unreachable("unknown intrinsic");
}