radeonsi: add PS epilog
authorMarek Olšák <marek.olsak@amd.com>
Tue, 26 Jan 2016 23:50:29 +0000 (00:50 +0100)
committerMarek Olšák <marek.olsak@amd.com>
Sun, 21 Feb 2016 20:08:57 +0000 (21:08 +0100)
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h

index 4415f198c9061ac5a02f3d548ad27151ebdd0002..3c87dd690f7d469e40141b0f3e58938fec8e3512 100644 (file)
@@ -542,6 +542,7 @@ static void si_destroy_screen(struct pipe_screen* pscreen)
                sscreen->vs_prologs,
                sscreen->vs_epilogs,
                sscreen->tcs_epilogs,
+               sscreen->ps_epilogs
        };
        unsigned i;
 
index d9175b932abd287d552033bc4a54b5d046315d8d..5d204ec6462eba6865552dbd20a2bcd506fa140b 100644 (file)
@@ -92,6 +92,7 @@ struct si_screen {
        struct si_shader_part           *vs_prologs;
        struct si_shader_part           *vs_epilogs;
        struct si_shader_part           *tcs_epilogs;
+       struct si_shader_part           *ps_epilogs;
 };
 
 struct si_blend_color {
index bcb5c17d39e19b39d2ab85df753095ad40e1293d..8e50d06b30fb48e82218887955d2829634b61737 100644 (file)
@@ -131,6 +131,11 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
                               LLVMTargetMachineRef tm,
                               struct tgsi_shader_info *info);
 
+/* Ideally pass the sample mask input to the PS epilog as v13, which
+ * is its usual location, so that the shader doesn't have to add v_mov.
+ */
+#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
+
 /* The VS location of the PrimitiveID input is the same in the epilog,
  * so that the main shader part doesn't have to move it.
  */
@@ -2530,6 +2535,100 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
                si_export_mrt_z(bld_base, depth, stencil, samplemask);
 }
 
+/**
+ * Return PS outputs in this order:
+ *
+ * v[0:3] = color0.xyzw
+ * v[4:7] = color1.xyzw
+ * ...
+ * vN+0 = Depth
+ * vN+1 = Stencil
+ * vN+2 = SampleMask
+ * vN+3 = SampleMaskIn (used for OpenGL smoothing)
+ *
+ * The alpha-ref SGPR is returned via its original location.
+ */
+static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
+{
+       struct si_shader_context *ctx = si_shader_context(bld_base);
+       struct si_shader *shader = ctx->shader;
+       struct lp_build_context *base = &bld_base->base;
+       struct tgsi_shader_info *info = &shader->selector->info;
+       LLVMBuilderRef builder = base->gallivm->builder;
+       unsigned i, j, first_vgpr, vgpr;
+
+       LLVMValueRef color[8][4] = {};
+       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+       LLVMValueRef ret;
+
+       /* Read the output values. */
+       for (i = 0; i < info->num_outputs; i++) {
+               unsigned semantic_name = info->output_semantic_name[i];
+               unsigned semantic_index = info->output_semantic_index[i];
+
+               switch (semantic_name) {
+               case TGSI_SEMANTIC_COLOR:
+                       assert(semantic_index < 8);
+                       for (j = 0; j < 4; j++) {
+                               LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
+                               LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+                               color[semantic_index][j] = result;
+                       }
+                       break;
+               case TGSI_SEMANTIC_POSITION:
+                       depth = LLVMBuildLoad(builder,
+                                             ctx->radeon_bld.soa.outputs[i][2], "");
+                       break;
+               case TGSI_SEMANTIC_STENCIL:
+                       stencil = LLVMBuildLoad(builder,
+                                               ctx->radeon_bld.soa.outputs[i][1], "");
+                       break;
+               case TGSI_SEMANTIC_SAMPLEMASK:
+                       samplemask = LLVMBuildLoad(builder,
+                                                  ctx->radeon_bld.soa.outputs[i][0], "");
+                       break;
+               default:
+                       fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
+                               semantic_name);
+               }
+       }
+
+       /* Fill the return structure. */
+       ret = ctx->return_value;
+
+       /* Set SGPRs. */
+       ret = LLVMBuildInsertValue(builder, ret,
+                                  bitcast(bld_base, TGSI_TYPE_SIGNED,
+                                          LLVMGetParam(ctx->radeon_bld.main_fn,
+                                                       SI_PARAM_ALPHA_REF)),
+                                  SI_SGPR_ALPHA_REF, "");
+
+       /* Set VGPRs */
+       first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
+       for (i = 0; i < ARRAY_SIZE(color); i++) {
+               if (!color[i][0])
+                       continue;
+
+               for (j = 0; j < 4; j++)
+                       ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
+       }
+       if (depth)
+               ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
+       if (stencil)
+               ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
+       if (samplemask)
+               ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
+
+       /* Add the input sample mask for smoothing at the end. */
+       if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
+               vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
+       ret = LLVMBuildInsertValue(builder, ret,
+                                  LLVMGetParam(ctx->radeon_bld.main_fn,
+                                               SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
+
+       ctx->return_value = ret;
+}
+
 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
                                struct lp_build_tgsi_context *bld_base,
                                struct lp_build_emit_data *emit_data);
@@ -3723,7 +3822,7 @@ static void create_function(struct si_shader_context *ctx)
        struct si_shader *shader = ctx->shader;
        LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
        LLVMTypeRef returns[16+32*4];
-       unsigned i, last_array_pointer, last_sgpr, num_params;
+       unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
        unsigned num_returns = 0;
 
        v3i32 = LLVMVectorType(ctx->i32, 3);
@@ -3869,6 +3968,27 @@ static void create_function(struct si_shader_context *ctx)
                params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
                params[SI_PARAM_POS_FIXED_PT] = ctx->f32;
                num_params = SI_PARAM_POS_FIXED_PT+1;
+
+               if (!ctx->is_monolithic) {
+                       /* Outputs for the epilog. */
+                       num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
+                       num_returns =
+                               num_return_sgprs +
+                               util_bitcount(shader->selector->info.colors_written) * 4 +
+                               shader->selector->info.writes_z +
+                               shader->selector->info.writes_stencil +
+                               shader->selector->info.writes_samplemask +
+                               1 /* SampleMaskIn */;
+
+                       num_returns = MAX2(num_returns,
+                                          num_return_sgprs +
+                                          PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+                       for (i = 0; i < num_return_sgprs; i++)
+                               returns[i] = ctx->i32;
+                       for (; i < num_returns; i++)
+                               returns[i] = ctx->f32;
+               }
                break;
 
        default:
@@ -4664,7 +4784,10 @@ static int si_compile_tgsi_shader(struct si_screen *sscreen,
                break;
        case TGSI_PROCESSOR_FRAGMENT:
                ctx.radeon_bld.load_input = declare_input_fs;
-               bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
+               if (is_monolithic)
+                       bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
+               else
+                       bld_base->emit_epilogue = si_llvm_return_fs_outputs;
                break;
        default:
                assert(!"Unsupported shader type");
@@ -5181,6 +5304,159 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
        return shader->epilog != NULL;
 }
 
+/**
+ * Compile the pixel shader epilog. This handles everything that must be
+ * emulated for pixel shader exports. (alpha-test, format conversions, etc)
+ */
+static bool si_compile_ps_epilog(struct si_screen *sscreen,
+                                LLVMTargetMachineRef tm,
+                                struct pipe_debug_callback *debug,
+                                struct si_shader_part *out)
+{
+       union si_shader_part_key *key = &out->key;
+       struct si_shader shader = {};
+       struct si_shader_context ctx;
+       struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+       struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
+       LLVMTypeRef params[16+8*4+3];
+       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+       int last_array_pointer, last_sgpr, num_params, i;
+       bool status = true;
+
+       si_init_shader_ctx(&ctx, sscreen, &shader, tm, NULL);
+       ctx.type = TGSI_PROCESSOR_FRAGMENT;
+       shader.key.ps.epilog = key->ps_epilog.states;
+
+       /* Declare input SGPRs. */
+       params[SI_PARAM_RW_BUFFERS] = ctx.i64;
+       params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
+       params[SI_PARAM_SAMPLERS] = ctx.i64;
+       params[SI_PARAM_UNUSED] = ctx.i64;
+       params[SI_PARAM_ALPHA_REF] = ctx.f32;
+       last_array_pointer = -1;
+       last_sgpr = SI_PARAM_ALPHA_REF;
+
+       /* Declare input VGPRs. */
+       num_params = (last_sgpr + 1) +
+                    util_bitcount(key->ps_epilog.colors_written) * 4 +
+                    key->ps_epilog.writes_z +
+                    key->ps_epilog.writes_stencil +
+                    key->ps_epilog.writes_samplemask;
+
+       num_params = MAX2(num_params,
+                         last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+       assert(num_params <= ARRAY_SIZE(params));
+
+       for (i = last_sgpr + 1; i < num_params; i++)
+               params[i] = ctx.f32;
+
+       /* Create the function. */
+       si_create_function(&ctx, NULL, 0, params, num_params,
+                          last_array_pointer, last_sgpr);
+       /* Disable elimination of unused inputs. */
+       radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
+                                 "InitialPSInputAddr", 0xffffff);
+
+       /* Process colors. */
+       unsigned vgpr = last_sgpr + 1;
+       unsigned colors_written = key->ps_epilog.colors_written;
+       int last_color_export = -1;
+
+       /* Find the last color export. */
+       if (!key->ps_epilog.writes_z &&
+           !key->ps_epilog.writes_stencil &&
+           !key->ps_epilog.writes_samplemask) {
+               unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
+
+               /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+               if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
+                       /* Just set this if any of the colorbuffers are enabled. */
+                       if (spi_format &
+                           ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
+                               last_color_export = 0;
+               } else {
+                       for (i = 0; i < 8; i++)
+                               if (colors_written & (1 << i) &&
+                                   (spi_format >> (i * 4)) & 0xf)
+                                       last_color_export = i;
+               }
+       }
+
+       while (colors_written) {
+               LLVMValueRef color[4];
+               int mrt = u_bit_scan(&colors_written);
+
+               for (i = 0; i < 4; i++)
+                       color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+
+               si_export_mrt_color(bld_base, color, mrt,
+                                   num_params - 1,
+                                   mrt == last_color_export);
+       }
+
+       /* Process depth, stencil, samplemask. */
+       if (key->ps_epilog.writes_z)
+               depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+       if (key->ps_epilog.writes_stencil)
+               stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+       if (key->ps_epilog.writes_samplemask)
+               samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
+
+       if (depth || stencil || samplemask)
+               si_export_mrt_z(bld_base, depth, stencil, samplemask);
+       else if (last_color_export == -1)
+               si_export_null(bld_base);
+
+       /* Compile. */
+       LLVMBuildRetVoid(gallivm->builder);
+       radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+                           gallivm->module, debug, ctx.type,
+                           "Fragment Shader Epilog"))
+               status = false;
+
+       radeon_llvm_dispose(&ctx.radeon_bld);
+       return status;
+}
+
+/**
+ * Select and compile (or reuse) pixel shader parts (prolog & epilog).
+ */
+static bool si_shader_select_ps_parts(struct si_screen *sscreen,
+                                     LLVMTargetMachineRef tm,
+                                     struct si_shader *shader,
+                                     struct pipe_debug_callback *debug)
+{
+       struct tgsi_shader_info *info = &shader->selector->info;
+       union si_shader_part_key epilog_key;
+
+       /* Get the epilog. */
+       memset(&epilog_key, 0, sizeof(epilog_key));
+       epilog_key.ps_epilog.colors_written = info->colors_written;
+       epilog_key.ps_epilog.writes_z = info->writes_z;
+       epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
+       epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
+       epilog_key.ps_epilog.states = shader->key.ps.epilog;
+
+       shader->epilog =
+               si_get_shader_part(sscreen, &sscreen->ps_epilogs,
+                                  &epilog_key, tm, debug,
+                                  si_compile_ps_epilog);
+       if (!shader->epilog)
+               return false;
+
+       /* The sample mask input is always enabled, because the API shader always
+        * passes it through to the epilog. Disable it here if it's unused.
+        */
+       if (!shader->key.ps.epilog.poly_line_smoothing &&
+           !shader->selector->info.reads_samplemask)
+               shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
+
+       return true;
+}
+
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                     struct si_shader *shader,
                     struct pipe_debug_callback *debug)
@@ -5207,6 +5483,16 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                        if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
                                return -1;
                        break;
+               case PIPE_SHADER_FRAGMENT:
+                       if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
+                               return -1;
+
+                       /* Make sure we have at least as many VGPRs as there
+                        * are allocated inputs.
+                        */
+                       shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+                                                       shader->num_input_vgprs);
+                       break;
                }
 
                /* Update SGPR and VGPR counts. */
index e2b9534683e9f35513aab2683fdabbcfe9508b0d..928cb2e18e981c7a28ed5102b6a1544799b061e2 100644 (file)
@@ -281,6 +281,13 @@ union si_shader_part_key {
        struct {
                struct si_tcs_epilog_bits states;
        } tcs_epilog;
+       struct {
+               struct si_ps_epilog_bits states;
+               unsigned        colors_written:8;
+               unsigned        writes_z:1;
+               unsigned        writes_stencil:1;
+               unsigned        writes_samplemask:1;
+       } ps_epilog;
 };
 
 union si_shader_key {