From 6132992cdb858268af0e985727d80e4140be389c Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 31 Aug 2017 21:56:43 -0700 Subject: [PATCH] intel/compiler/fs: Set up subgroup invocation as a system value Subgroup invocation is computed using a vector immediate and some dispatch-aware arithmetic. Unfortunately, due to the vector arithmetic, and the fact that it's frequently read 16-wide, it's not something that can easily be CSEd by the back-end compiler. There are a few different possible approaches to this problem: 1) Emit the code to calculate the subgroup invocation on-the-fly and trust NIR to do the CSE. This is what we were doing. 2) Add a back-end instruction for the subgroup ID. This has the advantage of helping the back-end compiler with CSE but has the downside of very poor scheduling for the calculation because it has to be emitted in the back-end. 3) Emit the calculation at the top of the program and re-use the result. This gets rid of the CSE problem but comes at the cost of an extra live register. This commit switches us from 1) to 3). We choose to store the subgroup invocation values as a W type to reduce the impact of the extra live register. Trusting NIR and using 1) was fine but we're soon going to want to use the subgroup invocation value for other things in the back-end compiler and this makes it much easier to do without having to worry about CSE problems. Reviewed-by: Iago Toral Quiroga --- src/intel/compiler/brw_fs_nir.cpp | 34 +++++++++++++++++++------------ 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 39e7e692874..35fae180285 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -231,6 +231,24 @@ fs_visitor::nir_emit_system_values() nir_system_values[i] = fs_reg(); } + /* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we + * never end up using it. 
+ */ + { + const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL); + fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + reg = abld.vgrf(BRW_REGISTER_TYPE_W); + + const fs_builder allbld8 = abld.group(8, 0).exec_all(); + allbld8.MOV(reg, brw_imm_v(0x76543210)); + if (dispatch_width > 8) + allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u)); + if (dispatch_width > 16) { + const fs_builder allbld16 = abld.group(16, 0).exec_all(); + allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u)); + } + } + nir_foreach_function(function, nir) { assert(strcmp(function->name, "main") == 0); assert(function->impl); @@ -4170,20 +4188,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width)); break; - case nir_intrinsic_load_subgroup_invocation: { - fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW); - dest = retype(dest, BRW_REGISTER_TYPE_UD); - const fs_builder allbld8 = bld.group(8, 0).exec_all(); - allbld8.MOV(tmp, brw_imm_v(0x76543210)); - if (dispatch_width > 8) - allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u)); - if (dispatch_width > 16) { - const fs_builder allbld16 = bld.group(16, 0).exec_all(); - allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u)); - } - bld.MOV(dest, tmp); + case nir_intrinsic_load_subgroup_invocation: + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), + nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]); break; - } case nir_intrinsic_load_subgroup_eq_mask: case nir_intrinsic_load_subgroup_ge_mask: -- 2.30.2