From 03255da225a85315241ea514e5f9e06090b16abe Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 4 Jun 2019 11:39:25 -0500 Subject: [PATCH] intel/fs: Do 8-bit subgroup scan operations in 16 bits Reviewed-by: Paulo Zanoni --- src/intel/compiler/brw_fs_nir.cpp | 42 ++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index abce29c92f5..f9e81e14f3a 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4925,10 +4925,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr opcode brw_op = brw_op_for_nir_reduction_op(redop); brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); + /* There are a couple of register region issues that make things + * complicated for 8-bit types: + * + * 1. Only raw moves are allowed to write to a packed 8-bit + * destination. + * 2. If we use a strided destination, the efficient way to do scan + * operations ends up using strides that are too big to encode in + * an instruction. + * + * To get around these issues, we just do all 8-bit scan operations in + * 16 bits. It's actually fewer instructions than what we'd have to do + * if we were trying to do it in native 8-bit types and the results are + * the same once we truncate to 8 bits at the end. + */ + brw_reg_type scan_type = src.type; + if (type_sz(scan_type) == 1) + scan_type = brw_reg_type_from_bit_size(16, src.type); + /* Set up a register for all of our scratching around and initialize it * to reduction operation's identity value. */ - fs_reg scan = bld.vgrf(src.type); + fs_reg scan = bld.vgrf(scan_type); bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); bld.emit_scan(brw_op, scan, cluster_size, cond_mod); @@ -4971,10 +4989,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr opcode brw_op = brw_op_for_nir_reduction_op(redop); brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); + /* There are a couple of register region issues that make things + * complicated for 8-bit types: + * + * 1. Only raw moves are allowed to write to a packed 8-bit + * destination. + * 2. If we use a strided destination, the efficient way to do scan + * operations ends up using strides that are too big to encode in + * an instruction. + * + * To get around these issues, we just do all 8-bit scan operations in + * 16 bits. It's actually fewer instructions than what we'd have to do + * if we were trying to do it in native 8-bit types and the results are + * the same once we truncate to 8 bits at the end. + */ + brw_reg_type scan_type = src.type; + if (type_sz(scan_type) == 1) + scan_type = brw_reg_type_from_bit_size(16, src.type); + /* Set up a register for all of our scratching around and initialize it * to reduction operation's identity value. */ - fs_reg scan = bld.vgrf(src.type); + fs_reg scan = bld.vgrf(scan_type); const fs_builder allbld = bld.exec_all(); allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); @@ -4983,7 +5019,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr * shift of the contents before we can begin. To make things worse, * we can't do this with a normal stride; we have to use indirects. */ - fs_reg shifted = bld.vgrf(src.type); + fs_reg shifted = bld.vgrf(scan_type); fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], brw_imm_w(-1)); -- 2.30.2