uniforms = nir->num_uniforms / 4;
if (stage == MESA_SHADER_COMPUTE) {
- /* Add a uniform for the thread local id. It must be the last uniform
- * on the list.
- */
+ /* Add uniforms for builtins after regular NIR uniforms. */
assert(uniforms == prog_data->nr_params);
- uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
+
+ uint32_t *param;
+ if (brw_cs_prog_data(prog_data)->uses_variable_group_size) {
+ param = brw_stage_prog_data_add_params(prog_data, 3);
+ for (unsigned i = 0; i < 3; i++) {
+ param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
+ group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
+ }
+ }
+
+ /* Subgroup ID must be the last uniform on the list. This will make
+ * it easier later to split between cross thread and per thread
+ * uniforms.
+ */
+ param = brw_stage_prog_data_add_params(prog_data, 1);
*param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
}
case nir_op_b32any_inequal4:
unreachable("Lowered by nir_lower_alu_reductions");
- case nir_op_fnoise1_1:
- case nir_op_fnoise1_2:
- case nir_op_fnoise1_3:
- case nir_op_fnoise1_4:
- case nir_op_fnoise2_1:
- case nir_op_fnoise2_2:
- case nir_op_fnoise2_3:
- case nir_op_fnoise2_4:
- case nir_op_fnoise3_1:
- case nir_op_fnoise3_2:
- case nir_op_fnoise3_3:
- case nir_op_fnoise3_4:
- case nir_op_fnoise4_1:
- case nir_op_fnoise4_2:
- case nir_op_fnoise4_3:
- case nir_op_fnoise4_4:
- unreachable("not reached: should be handled by lower_noise");
-
case nir_op_ldexp:
unreachable("not reached: should be handled by ldexp_to_arith()");
* invocations are already executed lock-step. Instead of an actual
* barrier just emit a scheduling fence, that will generate no code.
*/
- if (workgroup_size() <= dispatch_width) {
+ if (!cs_prog_data->uses_variable_group_size &&
+ workgroup_size() <= dispatch_width) {
bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
break;
}
break;
}
+ case nir_intrinsic_load_local_group_size: {
+ for (unsigned i = 0; i < 3; i++) {
+ bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
+ group_size[i]);
+ }
+ break;
+ }
+
default:
nir_emit_intrinsic(bld, instr);
break;
*
* TODO: Check if applies for many HW threads sharing same Data Port.
*/
- if (slm_fence && workgroup_size() <= dispatch_width)
+ if (!brw_cs_prog_data(prog_data)->uses_variable_group_size &&
+ slm_fence && workgroup_size() <= dispatch_width)
slm_fence = false;
/* Prior to Gen11, there's only L3 fence, so emit that instead. */