X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_fs_nir.cpp;h=1a8306c9fda6fdf8728e270af01deac265299e0f;hb=7c43b8ce1b82f41e03147f824e87195ca8f1cb49;hp=f1d17a322e959e98489fd935df5e38dfd03226e3;hpb=c6439792287f11f25cb2b62d699f52daefe54a44;p=mesa.git diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index f1d17a322e9..1a8306c9fda 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -101,11 +101,23 @@ fs_visitor::nir_setup_uniforms() uniforms = nir->num_uniforms / 4; if (stage == MESA_SHADER_COMPUTE) { - /* Add a uniform for the thread local id. It must be the last uniform - * on the list. - */ + /* Add uniforms for builtins after regular NIR uniforms. */ assert(uniforms == prog_data->nr_params); - uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1); + + uint32_t *param; + if (brw_cs_prog_data(prog_data)->uses_variable_group_size) { + param = brw_stage_prog_data_add_params(prog_data, 3); + for (unsigned i = 0; i < 3; i++) { + param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i); + group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); + } + } + + /* Subgroup ID must be the last uniform on the list. This will make + * easier later to split between cross thread and per thread + * uniforms. + */ + param = brw_stage_prog_data_add_params(prog_data, 1); *param = BRW_PARAM_BUILTIN_SUBGROUP_ID; subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); } @@ -1594,24 +1606,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_b32any_inequal4: unreachable("Lowered by nir_lower_alu_reductions"); - case nir_op_fnoise1_1: - case nir_op_fnoise1_2: - case nir_op_fnoise1_3: - case nir_op_fnoise1_4: - case nir_op_fnoise2_1: - case nir_op_fnoise2_2: - case nir_op_fnoise2_3: - case nir_op_fnoise2_4: - case nir_op_fnoise3_1: - case nir_op_fnoise3_2: - case nir_op_fnoise3_3: - case nir_op_fnoise3_4: - case nir_op_fnoise4_1: - case nir_op_fnoise4_2: - case nir_op_fnoise4_3: - case nir_op_fnoise4_4: - unreachable("not reached: should be handled by lower_noise"); - case nir_op_ldexp: unreachable("not reached: should be handled by ldexp_to_arith()"); @@ -3814,7 +3808,8 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, * invocations are already executed lock-step. Instead of an actual * barrier just emit a scheduling fence, that will generate no code. */ - if (workgroup_size() <= dispatch_width) { + if (!cs_prog_data->uses_variable_group_size && + workgroup_size() <= dispatch_width) { bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE); break; } @@ -3949,6 +3944,14 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, break; } + case nir_intrinsic_load_local_group_size: { + for (unsigned i = 0; i < 3; i++) { + bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), + group_size[i]); + } + break; + } + default: nir_emit_intrinsic(bld, instr); break; @@ -4337,7 +4340,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr * * TODO: Check if applies for many HW threads sharing same Data Port. */ - if (slm_fence && workgroup_size() <= dispatch_width) + if (!brw_cs_prog_data(prog_data)->uses_variable_group_size && + slm_fence && workgroup_size() <= dispatch_width) slm_fence = false; /* Prior to Gen11, there's only L3 fence, so emit that instead. */