From 7f77554b5b224217ec1b3ebbf0fab0913c42e269 Mon Sep 17 00:00:00 2001
From: Dave Airlie
Date: Sun, 23 Apr 2017 19:52:34 +0100
Subject: [PATCH] radv/ac: setup mrt exports then export them in one go. (v2)

Noticed while looking at Sascha Willems' deferred shaders.

This is a bit of an LLVM workaround; LLVM was producing this:

    v_cvt_pkrtz_f16_f32_e64 v4, v7, v8 ; D2960004 00021107
    v_cvt_pkrtz_f16_f32_e64 v6, v9, 1.0 ; D2960006 0001E509
    s_waitcnt vmcnt(0) ; BF8C0F70
    exp mrt0 v4, v4, v6, v6 compr ; C400040F 00000604
    s_waitcnt expcnt(0) ; BF8C0F0F
    v_cvt_pkrtz_f16_f32_e64 v4, v12, v5 ; D2960004 00020B0C
    v_cvt_pkrtz_f16_f32_e64 v5, v14, 1.0 ; D2960005 0001E50E
    exp mrt1 v4, v4, v5, v5 compr ; C400041F 00000504
    s_waitcnt expcnt(0) ; BF8C0F0F
    v_cvt_pkrtz_f16_f32_e64 v0, v0, v1 ; D2960000 00020300
    v_cvt_pkrtz_f16_f32_e64 v1, v2, v3 ; D2960001 00020702
    exp mrt2 v0, v0, v1, v1 done compr vm ; C4001C2F 00000100

After this change:

    v_cvt_pkrtz_f16_f32_e64 v4, v7, v8 ; D2960004 00021107
    s_waitcnt vmcnt(0) ; BF8C0F70
    v_cvt_pkrtz_f16_f32_e64 v0, v0, v1 ; D2960000 00020300
    v_cvt_pkrtz_f16_f32_e64 v6, v9, 1.0 ; D2960006 0001E509
    v_cvt_pkrtz_f16_f32_e64 v5, v12, v5 ; D2960005 00020B0C
    v_cvt_pkrtz_f16_f32_e64 v7, v14, 1.0 ; D2960007 0001E50E
    exp mrt0 v4, v4, v6, v6 compr ; C400040F 00000604
    v_cvt_pkrtz_f16_f32_e64 v1, v2, v3 ; D2960001 00020702
    exp mrt1 v5, v5, v7, v7 compr ; C400041F 00000705
    exp mrt2 v0, v0, v1, v1 done compr vm ; C4001C2F 00000100

No s_waitcnt expcnt(0) is emitted between the exports any more.

v2: fix up index->mrt mapping (Bas).

Reviewed-by: Bas Nieuwenhuizen Signed-off-by: Dave Airlie --- src/amd/common/ac_nir_to_llvm.c | 34 ++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index ab929bc81fe..97cd981ec47 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -5572,24 +5572,22 @@ handle_tcs_outputs_post(struct nir_to_llvm_context *ctx) write_tess_factors(ctx); } -static void +static bool si_export_mrt_color(struct nir_to_llvm_context *ctx, - LLVMValueRef *color, unsigned param, bool is_last) + LLVMValueRef *color, unsigned param, bool is_last, + struct ac_export_args *args) { - - struct ac_export_args args; - /* Export */ si_llvm_init_export_args(ctx, color, param, - &args); + args); if (is_last) { - args.valid_mask = 1; /* whether the EXEC mask is valid */ - args.done = 1; /* DONE bit */ - } else if (!args.enabled_channels) - return; /* unnecessary NULL export */ + args->valid_mask = 1; /* whether the EXEC mask is valid */ + args->done = 1; /* DONE bit */ + } else if (!args->enabled_channels) + return false; /* unnecessary NULL export */ - ac_build_export(&ctx->ac, &args); + return true; } static void @@ -5639,6 +5637,7 @@ handle_fs_outputs_post(struct nir_to_llvm_context *ctx) { unsigned index = 0; LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + struct ac_export_args color_args[8]; for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) { LLVMValueRef values[4]; @@ -5667,15 +5666,20 @@ handle_fs_outputs_post(struct nir_to_llvm_context *ctx) if (!ctx->shader_info->fs.writes_z && !ctx->shader_info->fs.writes_stencil && !ctx->shader_info->fs.writes_sample_mask) last = ctx->output_mask <= ((1ull << (i + 1)) - 1); - si_export_mrt_color(ctx, values, V_008DFC_SQ_EXP_MRT + index, last); - index++; + bool ret = si_export_mrt_color(ctx, values, V_008DFC_SQ_EXP_MRT + (i - FRAG_RESULT_DATA0), last, &color_args[index]); + if (ret) + index++; } 
} + for (unsigned i = 0; i < index; i++) + ac_build_export(&ctx->ac, &color_args[i]); if (depth || stencil || samplemask) si_export_mrt_z(ctx, depth, stencil, samplemask); - else if (!index) - si_export_mrt_color(ctx, NULL, V_008DFC_SQ_EXP_NULL, true); + else if (!index) { + si_export_mrt_color(ctx, NULL, V_008DFC_SQ_EXP_NULL, true, &color_args[0]); + ac_build_export(&ctx->ac, &color_args[0]); + } ctx->shader_info->fs.output_mask = index ? ((1ull << index) - 1) : 0; } -- 2.30.2