From cb6f16dce90b4737f62588f8ea5083ee6544787e Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 1 Aug 2017 05:10:49 +0100 Subject: [PATCH] radeon/ac: use ds_swizzle for derivs on si/cik. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This looks like it's supported since llvm 3.9 at least, so switch over radeonsi and radv to using it, -pro also uses this. We can now drop creating lds for these operations as the ds_swizzle operation doesn't actually write to lds at all. Acked-by: Marek Olšák (stable requested due to fixing radv CIK conformance tests) Cc: mesa-stable@lists.freedesktop.org Signed-off-by: Dave Airlie --- src/amd/common/ac_llvm_build.c | 57 +++++++++++++++++------- src/amd/common/ac_llvm_build.h | 1 - src/amd/common/ac_nir_to_llvm.c | 9 +--- src/gallium/drivers/radeonsi/si_shader.c | 16 +------ 4 files changed, 44 insertions(+), 39 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 9b939c148e5..a38aad68f72 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -796,21 +796,21 @@ ac_build_ddxy(struct ac_llvm_context *ctx, bool has_ds_bpermute, uint32_t mask, int idx, - LLVMValueRef lds, LLVMValueRef val) { - LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2]; + LLVMValueRef tl, trbl, args[2]; LLVMValueRef result; - thread_id = ac_get_thread_id(ctx); + if (has_ds_bpermute) { + LLVMValueRef thread_id, tl_tid, trbl_tid; + thread_id = ac_get_thread_id(ctx); - tl_tid = LLVMBuildAnd(ctx->builder, thread_id, - LLVMConstInt(ctx->i32, mask, false), ""); + tl_tid = LLVMBuildAnd(ctx->builder, thread_id, + LLVMConstInt(ctx->i32, mask, false), ""); - trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid, - LLVMConstInt(ctx->i32, idx, false), ""); + trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid, + LLVMConstInt(ctx->i32, idx, false), ""); - if (has_ds_bpermute) { args[0] = LLVMBuildMul(ctx->builder, tl_tid, LLVMConstInt(ctx->i32, 4, false), ""); args[1] 
= val; @@ -828,15 +828,42 @@ ac_build_ddxy(struct ac_llvm_context *ctx, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); } else { - LLVMValueRef store_ptr, load_ptr0, load_ptr1; + uint32_t masks[2]; + + switch (mask) { + case AC_TID_MASK_TOP_LEFT: + masks[0] = 0x8000; + if (idx == 1) + masks[1] = 0x8055; + else + masks[1] = 0x80aa; + + break; + case AC_TID_MASK_TOP: + masks[0] = 0x8044; + masks[1] = 0x80ee; + break; + case AC_TID_MASK_LEFT: + masks[0] = 0x80a0; + masks[1] = 0x80f5; + break; + } - store_ptr = ac_build_gep0(ctx, lds, thread_id); - load_ptr0 = ac_build_gep0(ctx, lds, tl_tid); - load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid); + args[0] = val; + args[1] = LLVMConstInt(ctx->i32, masks[0], false); - LLVMBuildStore(ctx->builder, val, store_ptr); - tl = LLVMBuildLoad(ctx->builder, load_ptr0, ""); - trbl = LLVMBuildLoad(ctx->builder, load_ptr1, ""); + tl = ac_build_intrinsic(ctx, + "llvm.amdgcn.ds.swizzle", ctx->i32, + args, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); + + args[1] = LLVMConstInt(ctx->i32, masks[1], false); + trbl = ac_build_intrinsic(ctx, + "llvm.amdgcn.ds.swizzle", ctx->i32, + args, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); } tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, ""); diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index 09fd585706a..ee27d3ca25c 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -174,7 +174,6 @@ ac_build_ddxy(struct ac_llvm_context *ctx, bool has_ds_bpermute, uint32_t mask, int idx, - LLVMValueRef lds, LLVMValueRef val); #define AC_SENDMSG_GS 2 diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index a05fd0e9cbe..3a6252395ba 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -68,8 +68,6 @@ struct ac_nir_context { int num_locals; LLVMValueRef *locals; - LLVMValueRef ddxy_lds; - struct nir_to_llvm_context *nctx; /* TODO get rid of this */ }; @@ -1463,11 +1461,6 @@ 
static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx, LLVMValueRef result; bool has_ds_bpermute = ctx->abi->chip_class >= VI; - if (!ctx->ddxy_lds && !has_ds_bpermute) - ctx->ddxy_lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, - LLVMArrayType(ctx->ac.i32, 64), - "ddxy_lds", LOCAL_ADDR_SPACE); - if (op == nir_op_fddx_fine || op == nir_op_fddx) mask = AC_TID_MASK_LEFT; else if (op == nir_op_fddy_fine || op == nir_op_fddy) @@ -1484,7 +1477,7 @@ static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx, idx = 2; result = ac_build_ddxy(&ctx->ac, has_ds_bpermute, - mask, idx, ctx->ddxy_lds, + mask, idx, src0); return result; } diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 628e6f80d3f..09053c355eb 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3591,7 +3591,7 @@ static void si_llvm_emit_ddxy( val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, ""); val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute, - mask, idx, ctx->lds, val); + mask, idx, val); emit_data->output[emit_data->chan] = val; } @@ -4635,20 +4635,6 @@ static void create_function(struct si_shader_context *ctx) assert(shader->info.num_input_vgprs >= num_prolog_vgprs); shader->info.num_input_vgprs -= num_prolog_vgprs; - if (!ctx->screen->has_ds_bpermute && - bld_base->info && - (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0)) - ctx->lds = - LLVMAddGlobalInAddressSpace(gallivm->module, - LLVMArrayType(ctx->i32, 64), - "ddxy_lds", - LOCAL_ADDR_SPACE); - if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL || /* GFX9 has the ESGS ring buffer in LDS. */ -- 2.30.2