From: Jason Ekstrand
Date: Fri, 1 Sep 2017 22:18:02 +0000 (-0700)
Subject: intel/fs: Add support for subgroup quad operations
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=8b4a5e641bc3cb9cf0cfe7d0487926127fc25de7;p=mesa.git

intel/fs: Add support for subgroup quad operations

NIR has code to lower these away for us but we can do significantly
better in many cases with register regioning and SIMD4x2.

Acked-by: Lionel Landwerlin
Reviewed-by: Iago Toral Quiroga
---

diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 3c4c538ac17..332d627bc37 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -467,6 +467,11 @@ enum opcode {
     */
    SHADER_OPCODE_SEL_EXEC,
 
+   /* This turns into an align16 mov from src0 to dst with a swizzle
+    * provided as an immediate in src1.
+    */
+   SHADER_OPCODE_QUAD_SWIZZLE,
+
    /* Take every Nth element in src0 and broadcast it to the group of N
    * channels in which it lives in the destination. The offset within the
    * cluster is given by src1 and the cluster size is given by src2.
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 9f1b8d0b184..53ba94ccccb 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5233,6 +5233,9 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
       return MIN2(8, inst->exec_size);
 
+   case SHADER_OPCODE_QUAD_SWIZZLE:
+      return 8;
+
    case SHADER_OPCODE_MOV_INDIRECT: {
       /* From IVB and HSW PRMs:
        *
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 382548f5c36..5371246fd24 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -2301,6 +2301,26 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          brw_MOV(p, dst, src[0]);
          break;
 
+      case SHADER_OPCODE_QUAD_SWIZZLE:
+         /* This only works on 8-wide 32-bit values */
+         assert(inst->exec_size == 8);
+         assert(type_sz(src[0].type) == 4);
+         assert(inst->force_writemask_all);
+         assert(src[1].file == BRW_IMMEDIATE_VALUE);
+         assert(src[1].type == BRW_REGISTER_TYPE_UD);
+
+         if (src[0].file == BRW_IMMEDIATE_VALUE ||
+             (src[0].vstride == 0 && src[0].hstride == 0)) {
+            /* The value is uniform across all channels */
+            brw_MOV(p, dst, src[0]);
+         } else {
+            brw_set_default_access_mode(p, BRW_ALIGN_16);
+            struct brw_reg swiz_src = stride(src[0], 4, 4, 1);
+            swiz_src.swizzle = inst->src[1].ud;
+            brw_MOV(p, dst, swiz_src);
+         }
+         break;
+
       case SHADER_OPCODE_CLUSTER_BROADCAST: {
          assert(src[0].type == dst.type);
         assert(!src[0].negate && !src[0].abs);
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index e25fd38af91..dbd2105f7e9 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4588,6 +4588,100 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_quad_broadcast: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      nir_const_value *index = nir_src_as_const_value(instr->src[1]);
+      assert(nir_src_bit_size(instr->src[1]) == 32);
+
+      bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
+               value, brw_imm_ud(index->u32[0]), brw_imm_ud(4));
+      break;
+   }
+
+   case nir_intrinsic_quad_swap_horizontal: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      const fs_reg tmp = bld.vgrf(value.type);
+      const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
+
+      const fs_reg src_left = horiz_stride(value, 2);
+      const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
+      const fs_reg tmp_left = horiz_stride(tmp, 2);
+      const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
+
+      /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
+       *
+       *    "When source or destination datatype is 64b or operation is
+       *    integer DWord multiply, regioning in Align1 must follow
+       *    these rules:
+       *
+       *    [...]
+       *
+       *    3. Source and Destination offset must be the same, except
+       *       the case of scalar source."
+       *
+       * In order to work around this, we have to emit two 32-bit MOVs instead
+       * of a single 64-bit MOV to do the shuffle.
+       */
+      if (type_sz(value.type) > 4 &&
+          (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+         ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 0),
+                  subscript(src_right, BRW_REGISTER_TYPE_D, 0));
+         ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 1),
+                  subscript(src_right, BRW_REGISTER_TYPE_D, 1));
+         ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 0),
+                  subscript(src_left, BRW_REGISTER_TYPE_D, 0));
+         ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 1),
+                  subscript(src_left, BRW_REGISTER_TYPE_D, 1));
+      } else {
+         ubld.MOV(tmp_left, src_right);
+         ubld.MOV(tmp_right, src_left);
+      }
+      bld.MOV(retype(dest, value.type), tmp);
+      break;
+   }
+
+   case nir_intrinsic_quad_swap_vertical: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      if (nir_src_bit_size(instr->src[0]) == 32) {
+         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
+         const fs_reg tmp = bld.vgrf(value.type);
+         const fs_builder ubld = bld.exec_all();
+         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
+                   brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
+         bld.MOV(retype(dest, value.type), tmp);
+      } else {
+         /* For larger data types, we have to either emit dispatch_width many
+          * MOVs or else fall back to doing indirects.
+          */
+         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+                 brw_imm_w(0x2));
+         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
+      }
+      break;
+   }
+
+   case nir_intrinsic_quad_swap_diagonal: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      if (nir_src_bit_size(instr->src[0]) == 32) {
+         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
+         const fs_reg tmp = bld.vgrf(value.type);
+         const fs_builder ubld = bld.exec_all();
+         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
+                   brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
+         bld.MOV(retype(dest, value.type), tmp);
+      } else {
+         /* For larger data types, we have to either emit dispatch_width many
+          * MOVs or else fall back to doing indirects.
+          */
+         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+                 brw_imm_w(0x3));
+         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
+      }
+      break;
+   }
+
    case nir_intrinsic_reduce: {
       fs_reg src = get_nir_src(instr->src[0]);
       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index e822c100e9f..ffe8a7403da 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -334,6 +334,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
       return "shuffle";
    case SHADER_OPCODE_SEL_EXEC:
       return "sel_exec";
+   case SHADER_OPCODE_QUAD_SWIZZLE:
+      return "quad_swizzle";
    case SHADER_OPCODE_CLUSTER_BROADCAST:
       return "cluster_broadcast";
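
For reference, the quad operations handled by this patch have a simple scalar model: within each aligned group of four subgroup invocations, quad_swap_horizontal, quad_swap_vertical and quad_swap_diagonal read from the lane whose index within the quad differs by XOR with 1, 2 and 3 respectively, and quad_broadcast reads a fixed lane of the quad (hence the lowering to CLUSTER_BROADCAST with a cluster size of 4). The sketch below is not part of the patch; it is a standalone C++ reference model under those assumptions, and the helper names quad_swap and quad_broadcast_ref are illustrative only. It shows why feeding BRW_SWIZZLE4(2,3,0,1) and BRW_SWIZZLE4(3,2,1,0) to SHADER_OPCODE_QUAD_SWIZZLE implements the vertical and diagonal swaps for 32-bit values, and why the wide-type fallback can XOR the subgroup invocation index with 0x2 or 0x3.

// Scalar reference model for the subgroup quad operations above.
// Illustrative only; these helpers do not exist in Mesa.
#include <array>
#include <cassert>
#include <cstdio>

// A "quad" is an aligned group of four subgroup invocations.  quad_swap_*
// exchanges values between lanes whose indices within the quad differ by
// xor_mask: 1 = horizontal, 2 = vertical, 3 = diagonal.
static std::array<float, 8> quad_swap(const std::array<float, 8> &in,
                                      unsigned xor_mask)
{
   std::array<float, 8> out{};
   for (unsigned lane = 0; lane < in.size(); lane++) {
      const unsigned quad_base = lane & ~3u;
      const unsigned src_lane = quad_base | ((lane & 3u) ^ xor_mask);
      out[lane] = in[src_lane];
   }
   return out;
}

// quad_broadcast: every lane of a quad reads the value of lane `index`
// (0..3) within its own quad, matching CLUSTER_BROADCAST with cluster
// size 4 in the patch.
static std::array<float, 8> quad_broadcast_ref(const std::array<float, 8> &in,
                                               unsigned index)
{
   assert(index < 4);
   std::array<float, 8> out{};
   for (unsigned lane = 0; lane < in.size(); lane++)
      out[lane] = in[(lane & ~3u) | index];
   return out;
}

int main()
{
   const std::array<float, 8> v = {0, 1, 2, 3, 4, 5, 6, 7};

   // xor_mask == 2 permutes each quad as (2,3,0,1), the swizzle the patch
   // feeds to SHADER_OPCODE_QUAD_SWIZZLE for quad_swap_vertical;
   // xor_mask == 3 gives (3,2,1,0), the quad_swap_diagonal swizzle.
   const std::array<float, 8> vert = quad_swap(v, 2);
   const std::array<float, 8> diag = quad_swap(v, 3);
   const std::array<float, 8> bcast = quad_broadcast_ref(v, 1);

   for (unsigned i = 0; i < 8; i++)
      printf("%u: vert=%g diag=%g bcast=%g\n", i, vert[i], diag[i], bcast[i]);
   return 0;
}

With lane values 0..7 the vertical swap produces 2,3,0,1,6,7,4,5 and the diagonal swap produces 3,2,1,0,7,6,5,4, i.e. exactly the per-quad (2,3,0,1) and (3,2,1,0) permutations encoded by the swizzle immediates.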