radeonsi: enable the barycentric optimization in all cases
author Marek Olšák <marek.olsak@amd.com>
Thu, 30 Jun 2016 00:16:16 +0000 (02:16 +0200)
committer Marek Olšák <marek.olsak@amd.com>
Mon, 4 Jul 2016 22:47:12 +0000 (00:47 +0200)
Handle the bc_optimize SGPR bit if both CENTER and CENTROID are enabled.
This should increase the PS launch rate for big primitives with MSAA.
Based on discussion with SPI guys.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_state_shaders.c

index a59c28e75bf5460894372fea70d1fc405f202b1a..abd58855da97a62e56cba8d9c06b4eaefb6bfa8a 100644 (file)
@@ -1435,6 +1435,56 @@ static void interp_fs_input(struct si_shader_context *ctx,
        }
 }
 
+/* Like LLVMGetParam(main_fn, idx), but with bc_optimize resolved for CENTROID. */
+static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
+                                    int interp_param_idx)
+{
+       LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
+       LLVMValueRef main_fn = ctx->radeon_bld.main_fn;
+       LLVMValueRef param = NULL;
+
+       /* Handle PRIM_MASK[31] (bc_optimize); only done here for monolithic shaders. */
+       if (ctx->is_monolithic &&
+           ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
+             interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
+            (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
+             interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
+               /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
+                * The hw doesn't compute CENTROID if the whole wave only
+                * contains fully-covered quads.
+                */
+               LLVMValueRef bc_optimize =
+                       LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
+               bc_optimize = LLVMBuildLShr(builder,
+                                           bc_optimize,
+                                           LLVMConstInt(ctx->i32, 31, 0), "");
+               bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");
+
+               if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
+                   interp_param_idx == SI_PARAM_PERSP_CENTROID) {
+                       param = LLVMBuildSelect(builder, bc_optimize,
+                                               LLVMGetParam(main_fn,
+                                                            SI_PARAM_PERSP_CENTER),
+                                               LLVMGetParam(main_fn,
+                                                            SI_PARAM_PERSP_CENTROID),
+                                               "");
+               }
+               if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
+                   interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
+                       param = LLVMBuildSelect(builder, bc_optimize,
+                                               LLVMGetParam(main_fn,
+                                                            SI_PARAM_LINEAR_CENTER),
+                                               LLVMGetParam(main_fn,
+                                                            SI_PARAM_LINEAR_CENTROID),
+                                               "");
+               }
+       }
+
+       if (!param)
+               param = LLVMGetParam(main_fn, interp_param_idx);
+       return param;
+}
+
 static void declare_input_fs(
        struct radeon_llvm_context *radeon_bld,
        unsigned input_index,
@@ -1475,7 +1525,7 @@ static void declare_input_fs(
        else if (interp_param_idx) {
                interp_param_idx = select_interp_param(ctx,
                                                       interp_param_idx);
-               interp_param = LLVMGetParam(main_fn, interp_param_idx);
+               interp_param = get_interp_param(ctx, interp_param_idx);
        }
 
        interp_fs_input(ctx, input_index, decl->Semantic.Name,
@@ -5041,7 +5091,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
        if (interp_param_idx == -1)
                return;
        else if (interp_param_idx)
-               interp_param = LLVMGetParam(ctx->radeon_bld.main_fn, interp_param_idx);
+               interp_param = get_interp_param(ctx, interp_param_idx);
        else
                interp_param = NULL;
 
@@ -6398,6 +6448,8 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
                fprintf(f, "  prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
                fprintf(f, "  prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp);
                fprintf(f, "  prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp);
+               fprintf(f, "  prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp);
+               fprintf(f, "  prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear);
                fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
                fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
                fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
@@ -7192,6 +7244,55 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen,
                si_llvm_emit_polygon_stipple(&ctx, list, pos);
        }
 
+       if (key->ps_prolog.states.bc_optimize_for_persp ||
+           key->ps_prolog.states.bc_optimize_for_linear) {
+               unsigned i, base = key->ps_prolog.num_input_sgprs;
+               LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
+
+               /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
+                * The hw doesn't compute CENTROID if the whole wave only
+                * contains fully-covered quads.
+                *
+                * PRIM_MASK is after user SGPRs.
+                */
+               bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
+               bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
+                                           LLVMConstInt(ctx.i32, 31, 0), "");
+               bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
+                                            ctx.i1, "");
+
+               if (key->ps_prolog.states.bc_optimize_for_persp) {
+                       /* Read PERSP_CENTER. */
+                       for (i = 0; i < 2; i++)
+                               center[i] = LLVMGetParam(func, base + 2 + i);
+                       /* Read PERSP_CENTROID. */
+                       for (i = 0; i < 2; i++)
+                               centroid[i] = LLVMGetParam(func, base + 4 + i);
+                       /* Select PERSP_CENTROID. */
+                       for (i = 0; i < 2; i++) {
+                               tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
+                                                     center[i], centroid[i], "");
+                               ret = LLVMBuildInsertValue(gallivm->builder, ret,
+                                                          tmp, base + 4 + i, "");
+                       }
+               }
+               if (key->ps_prolog.states.bc_optimize_for_linear) {
+                       /* Read LINEAR_CENTER. */
+                       for (i = 0; i < 2; i++)
+                               center[i] = LLVMGetParam(func, base + 8 + i);
+                       /* Read LINEAR_CENTROID. */
+                       for (i = 0; i < 2; i++)
+                               centroid[i] = LLVMGetParam(func, base + 10 + i);
+                       /* Select LINEAR_CENTROID. */
+                       for (i = 0; i < 2; i++) {
+                               tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
+                                                     center[i], centroid[i], "");
+                               ret = LLVMBuildInsertValue(gallivm->builder, ret,
+                                                          tmp, base + 10 + i, "");
+                       }
+               }
+       }
+
        /* Interpolate colors. */
        for (i = 0; i < 2; i++) {
                unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
@@ -7208,8 +7309,11 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen,
                        unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
                                               key->ps_prolog.color_interp_vgpr_index[i];
 
-                       interp[0] = LLVMGetParam(func, interp_vgpr);
-                       interp[1] = LLVMGetParam(func, interp_vgpr + 1);
+                       /* Get the (i,j) updated by bc_optimize handling. */
+                       interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
+                                                         interp_vgpr, "");
+                       interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
+                                                         interp_vgpr + 1, "");
                        interp_ij = lp_build_gather_values(gallivm, interp, 2);
                        interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
                                                     ctx.v2i32, "");
@@ -7466,7 +7570,9 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen,
                 prolog_key.ps_prolog.states.force_persp_sample_interp ||
                 prolog_key.ps_prolog.states.force_linear_sample_interp ||
                 prolog_key.ps_prolog.states.force_persp_center_interp ||
-                prolog_key.ps_prolog.states.force_linear_center_interp);
+                prolog_key.ps_prolog.states.force_linear_center_interp ||
+                prolog_key.ps_prolog.states.bc_optimize_for_persp ||
+                prolog_key.ps_prolog.states.bc_optimize_for_linear);
 
        if (info->colors_read) {
                unsigned *color = shader->selector->color_attr_index;
@@ -7557,6 +7663,8 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen,
            prolog_key.ps_prolog.states.force_linear_sample_interp ||
            prolog_key.ps_prolog.states.force_persp_center_interp ||
            prolog_key.ps_prolog.states.force_linear_center_interp ||
+           prolog_key.ps_prolog.states.bc_optimize_for_persp ||
+           prolog_key.ps_prolog.states.bc_optimize_for_linear ||
            prolog_key.ps_prolog.states.poly_stipple) {
                shader->prolog =
                        si_get_shader_part(sscreen, &sscreen->ps_prologs,
index 064773605fb1c09bd61d6e168e3510a961b23501..3b7b3e155b37bd466a3aab6a9861f66fe099d879 100644 (file)
@@ -317,11 +317,8 @@ struct si_ps_prolog_bits {
        unsigned        force_linear_sample_interp:1;
        unsigned        force_persp_center_interp:1;
        unsigned        force_linear_center_interp:1;
-       /* TODO:
-        * - add force_center_interp_bc_optimize to force center interpolation
-        *   based on the bc_optimize SGPR bit if MSAA is enabled, centroid is
-        *   present and sample isn't present.
-        */
+       unsigned        bc_optimize_for_persp:1;
+       unsigned        bc_optimize_for_linear:1;
 };
 
 /* Common PS bits between the shader key and the epilog key. */
index d679825914d275208a6bdc6ee78299e9c36374ec..dc4f187bc65e4b3cf51a8d87e5599fe7903fc625 100644 (file)
@@ -664,7 +664,6 @@ static void si_shader_ps(struct si_shader *shader)
        unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
        unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
        uint64_t va;
-       bool has_centroid;
        unsigned input_ena = shader->config.spi_ps_input_ena;
 
        /* we need to enable at least one of them, otherwise we hang the GPU */
@@ -729,11 +728,7 @@ static void si_shader_ps(struct si_shader *shader)
                       shader->config.spi_ps_input_addr);
 
        /* Set interpolation controls. */
-       has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) ||
-                      G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena);
-
-       spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) |
-                           S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
+       spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader));
 
        /* Set registers. */
        si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
@@ -946,8 +941,15 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
                                key->ps.prolog.force_linear_sample_interp =
                                        sel->info.uses_linear_center ||
                                        sel->info.uses_linear_centroid;
-                       } else if (!rs->multisample_enable ||
-                                  sctx->framebuffer.nr_samples <= 1) {
+                       } else if (rs->multisample_enable &&
+                                  sctx->framebuffer.nr_samples > 1) {
+                               key->ps.prolog.bc_optimize_for_persp =
+                                       sel->info.uses_persp_center &&
+                                       sel->info.uses_persp_centroid;
+                               key->ps.prolog.bc_optimize_for_linear =
+                                       sel->info.uses_linear_center &&
+                                       sel->info.uses_linear_centroid;
+                       } else {
                                /* Make sure SPI doesn't compute more than 1 pair
                                 * of (i,j), which is the optimization here. */
                                key->ps.prolog.force_persp_center_interp =