radeonsi: optimize TCS epilog when invocation 0 writes tess factors
author: Marek Olšák <marek.olsak@amd.com>
Tue, 5 Sep 2017 11:40:59 +0000 (13:40 +0200)
committer: Marek Olšák <marek.olsak@amd.com>
Mon, 11 Sep 2017 17:02:02 +0000 (19:02 +0200)
This removes the barrier and the LDS stores and loads for tess factors
whenever possible. The removal of the barrier seems more important
to me, though.

In one shader, it removes 17 * 4 bytes from the shader binary.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/auxiliary/tgsi/tgsi_scan.c
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_shader_internal.h
src/gallium/drivers/radeonsi/si_state_shaders.c

index b8932891e4cc768df2c64317ded43a113a915927..212d1bb95a82aa48917d4b8e6ab67f97d9ca0d2d 100644 (file)
@@ -1139,7 +1139,6 @@ tgsi_scan_tess_ctrl(const struct tgsi_token *tokens,
          if (main_block_tf_writemask || cond_block_tf_writemask) {
             /* Accumulate the result: */
             out->tessfactors_are_def_in_all_invocs &=
-               main_block_tf_writemask &&
                !(cond_block_tf_writemask & ~main_block_tf_writemask);
 
             /* Analyze the next code segment from scratch. */
@@ -1155,7 +1154,6 @@ tgsi_scan_tess_ctrl(const struct tgsi_token *tokens,
    /* Accumulate the result for the last code segment separated by a barrier. */
    if (main_block_tf_writemask || cond_block_tf_writemask) {
       out->tessfactors_are_def_in_all_invocs &=
-         main_block_tf_writemask &&
          !(cond_block_tf_writemask & ~main_block_tf_writemask);
    }
 
index e7888e6012b6d6f0a2ed3724cbc14d562dd948d9..43619dd329dfd634e254b97d8002bdbac91649ed 100644 (file)
@@ -1150,7 +1150,7 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
        LLVMValueRef buffer, base, buf_addr;
        LLVMValueRef values[4];
        bool skip_lds_store;
-       bool is_tess_factor = false;
+       bool is_tess_factor = false, is_tess_inner = false;
 
        /* Only handle per-patch and per-vertex outputs here.
         * Vectors will be lowered to scalars and this function will be called again.
@@ -1177,8 +1177,11 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
                        /* Always write tess factors into LDS for the TCS epilog. */
                        if (name == TGSI_SEMANTIC_TESSINNER ||
                            name == TGSI_SEMANTIC_TESSOUTER) {
-                               skip_lds_store = false;
+                               /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
+                               skip_lds_store = !sh_info->reads_tessfactor_outputs &&
+                                                ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
                                is_tess_factor = true;
+                               is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
                        }
                }
        }
@@ -1207,6 +1210,18 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
                                                    buf_addr, base,
                                                    4 * chan_index, 1, 0, true, false);
                }
+
+               /* Write tess factors into VGPRs for the epilog. */
+               if (is_tess_factor &&
+                   ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
+                       if (!is_tess_inner) {
+                               LLVMBuildStore(gallivm->builder, value, /* outer */
+                                              ctx->invoc0_tess_factors[chan_index]);
+                       } else if (chan_index < 2) {
+                               LLVMBuildStore(gallivm->builder, value, /* inner */
+                                              ctx->invoc0_tess_factors[4 + chan_index]);
+                       }
+               }
        }
 
        if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
@@ -2671,7 +2686,9 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                                  LLVMValueRef rel_patch_id,
                                  LLVMValueRef invocation_id,
-                                 LLVMValueRef tcs_out_current_patch_data_offset)
+                                 LLVMValueRef tcs_out_current_patch_data_offset,
+                                 LLVMValueRef invoc0_tf_outer[4],
+                                 LLVMValueRef invoc0_tf_inner[2])
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = &ctx->gallivm;
@@ -2682,7 +2699,9 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
        unsigned stride, outer_comps, inner_comps, i, offset;
        struct lp_build_if_state if_ctx, inner_if_ctx;
 
-       si_llvm_emit_barrier(NULL, bld_base, NULL);
+       /* Add a barrier before loading tess factors from LDS. */
+       if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
+               si_llvm_emit_barrier(NULL, bld_base, NULL);
 
        /* Do this only for invocation 0, because the tess levels are per-patch,
         * not per-vertex.
@@ -2716,32 +2735,32 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                return;
        }
 
-       /* Load tess_inner and tess_outer from LDS.
-        * Any invocation can write them, so we can't get them from a temporary.
-        */
-       tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
-       tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
-
-       lds_base = tcs_out_current_patch_data_offset;
-       lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
-                                LLVMConstInt(ctx->i32,
-                                             tess_inner_index * 4, 0), "");
-       lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
-                                LLVMConstInt(ctx->i32,
-                                             tess_outer_index * 4, 0), "");
-
        for (i = 0; i < 4; i++) {
                inner[i] = LLVMGetUndef(ctx->i32);
                outer[i] = LLVMGetUndef(ctx->i32);
        }
 
-       if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
-               /* For isolines, the hardware expects tess factors in the
-                * reverse order from what GLSL / TGSI specify.
-                */
-               outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
-               outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
+       if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
+               /* Tess factors are in VGPRs. */
+               for (i = 0; i < outer_comps; i++)
+                       outer[i] = out[i] = invoc0_tf_outer[i];
+               for (i = 0; i < inner_comps; i++)
+                       inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
        } else {
+               /* Load tess_inner and tess_outer from LDS.
+                * Any invocation can write them, so we can't get them from a temporary.
+                */
+               tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
+               tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
+
+               lds_base = tcs_out_current_patch_data_offset;
+               lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
+                                        LLVMConstInt(ctx->i32,
+                                                     tess_inner_index * 4, 0), "");
+               lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
+                                        LLVMConstInt(ctx->i32,
+                                                     tess_outer_index * 4, 0), "");
+
                for (i = 0; i < outer_comps; i++) {
                        outer[i] = out[i] =
                                lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
@@ -2752,6 +2771,15 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                }
        }
 
+       if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
+               /* For isolines, the hardware expects tess factors in the
+                * reverse order from what GLSL / TGSI specify.
+                */
+               LLVMValueRef tmp = out[0];
+               out[0] = out[1];
+               out[1] = tmp;
+       }
+
        /* Convert the outputs to vectors for stores. */
        vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
        vec1 = NULL;
@@ -2946,7 +2974,18 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 
        ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
        ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
-       ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+
+       if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
+               vgpr++; /* skip the tess factor LDS offset */
+               for (unsigned i = 0; i < 6; i++) {
+                       LLVMValueRef value =
+                               LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
+                       value = bitcast(bld_base, TGSI_TYPE_FLOAT, value);
+                       ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
+               }
+       } else {
+               ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+       }
        ctx->return_value = ret;
 }
 
@@ -4330,7 +4369,7 @@ static void create_function(struct si_shader_context *ctx)
                 */
                for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
                        returns[num_returns++] = ctx->i32; /* SGPRs */
-               for (i = 0; i < 5; i++)
+               for (i = 0; i < 11; i++)
                        returns[num_returns++] = ctx->f32; /* VGPRs */
                break;
 
@@ -4387,7 +4426,7 @@ static void create_function(struct si_shader_context *ctx)
                         */
                        for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
                                returns[num_returns++] = ctx->i32; /* SGPRs */
-                       for (i = 0; i < 5; i++)
+                       for (i = 0; i < 11; i++)
                                returns[num_returns++] = ctx->f32; /* VGPRs */
                }
                break;
@@ -5692,6 +5731,14 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                }
        }
 
+       if (ctx->type == PIPE_SHADER_TESS_CTRL &&
+           sel->tcs_info.tessfactors_are_def_in_all_invocs) {
+               for (unsigned i = 0; i < 6; i++) {
+                       ctx->invoc0_tess_factors[i] =
+                               lp_build_alloca_undef(&ctx->gallivm, ctx->i32, "");
+               }
+       }
+
        if (ctx->type == PIPE_SHADER_GEOMETRY) {
                int i;
                for (i = 0; i < 4; i++) {
@@ -6926,16 +6973,24 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
        add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
        add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
 
+       for (unsigned i = 0; i < 6; i++)
+               add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */
+
        /* Create the function. */
        si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
                           ctx->screen->b.chip_class >= CIK ? 128 : 64);
        declare_lds_as_pointer(ctx);
        func = ctx->main_fn;
 
+       LLVMValueRef invoc0_tess_factors[6];
+       for (unsigned i = 0; i < 6; i++)
+               invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i);
+
        si_write_tess_factors(bld_base,
                              LLVMGetParam(func, tess_factors_idx),
                              LLVMGetParam(func, tess_factors_idx + 1),
-                             LLVMGetParam(func, tess_factors_idx + 2));
+                             LLVMGetParam(func, tess_factors_idx + 2),
+                             invoc0_tess_factors, invoc0_tess_factors + 4);
 
        LLVMBuildRetVoid(gallivm->builder);
 }
index ee6b0c167f9e14e5709e034181e309a11ae7730c..4592ac551cedc8260a60e811e31e660da681cd16 100644 (file)
@@ -327,6 +327,7 @@ struct si_shader_selector {
        struct nir_shader       *nir;
        struct pipe_stream_output_info  so;
        struct tgsi_shader_info         info;
+       struct tgsi_tessctrl_info       tcs_info;
 
        /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
        unsigned        type;
@@ -404,6 +405,7 @@ struct si_vs_prolog_bits {
 /* Common TCS bits between the shader key and the epilog key. */
 struct si_tcs_epilog_bits {
        unsigned        prim_mode:3;
+       unsigned        invoc0_tess_factors_are_def:1;
        unsigned        tes_reads_tess_factors:1;
 };
 
index ad29ab7e8459304862add51262cba0ec5fb43b38..023f9a6a09339e0c39983ed53e9c789cea0c9f80 100644 (file)
@@ -209,6 +209,7 @@ struct si_shader_context {
        LLVMValueRef gsvs_ring[4];
 
        LLVMValueRef lds;
+       LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
        LLVMValueRef gs_next_vertex[4];
        LLVMValueRef postponed_kill;
        LLVMValueRef return_value;
index 9f76551cfbbbb40327534edb85695d4c66b89da5..6398111e5a673ea3c94f9ecd966a6335578f6a28 100644 (file)
@@ -1301,6 +1301,8 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 
                key->part.tcs.epilog.prim_mode =
                        sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+               key->part.tcs.epilog.invoc0_tess_factors_are_def =
+                       sel->tcs_info.tessfactors_are_def_in_all_invocs;
                key->part.tcs.epilog.tes_reads_tess_factors =
                        sctx->tes_shader.cso->info.reads_tess_factors;
 
@@ -2004,6 +2006,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
                }
 
                tgsi_scan_shader(state->tokens, &sel->info);
+               tgsi_scan_tess_ctrl(state->tokens, &sel->info, &sel->tcs_info);
        } else {
                assert(state->type == PIPE_SHADER_IR_NIR);