radeonsi/gfx9: add workarounds to avoid VGPR indexing completely
author: Marek Olšák <marek.olsak@amd.com>
Wed, 5 Jul 2017 21:33:13 +0000 (23:33 +0200)
committer: Marek Olšák <marek.olsak@amd.com>
Mon, 17 Jul 2017 14:50:39 +0000 (10:50 -0400)
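For inputs and outputs, the driver no longer advertises indirect addressing
when VGPR indexing is broken (except for TCS/TES inputs and TCS outputs,
which are accessed directly in LDS or memory), so indirect indexing is
lowered by the GLSL compiler.
For temporaries, use alloca and disable the "promote-alloca" pass.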

In the future, we could switch all codepaths to alloca permanently and
just rely on the "promote-alloca" pass.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c

index afb2bcbf078620b04599adfec8982eceaff66ed6..8a4bc41a4edb369a5ed629a79b165289054be597 100644 (file)
@@ -141,8 +141,9 @@ si_create_llvm_target_machine(struct si_screen *sscreen)
        char features[256];
 
        snprintf(features, sizeof(features),
-                "+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s",
+                "+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s%s",
                 sscreen->b.chip_class >= GFX9 ? ",+xnack" : ",-xnack",
+                sscreen->llvm_has_working_vgpr_indexing ? "" : ",-promote-alloca",
                 sscreen->b.debug_flags & DBG_SI_SCHED ? ",+si-scheduler" : "");
 
        return LLVMCreateTargetMachine(ac_get_llvm_target(triple), triple,
@@ -757,7 +758,6 @@ static int si_get_shader_param(struct pipe_screen* pscreen,
        /* Supported boolean features. */
        case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
        case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
-       case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
        case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
        case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
        case PIPE_SHADER_CAP_INTEGERS:
@@ -767,10 +767,18 @@ static int si_get_shader_param(struct pipe_screen* pscreen,
                return 1;
 
        case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
-               /* TODO: Indirection of geometry shader input dimension is not
-                * handled yet
-                */
-               return shader != PIPE_SHADER_GEOMETRY;
+               /* TODO: Indirect indexing of GS inputs is unimplemented. */
+               return shader != PIPE_SHADER_GEOMETRY &&
+                      (sscreen->llvm_has_working_vgpr_indexing ||
+                       /* TCS and TES load inputs directly from LDS or
+                        * offchip memory, so indirect indexing is trivial. */
+                       shader == PIPE_SHADER_TESS_CTRL ||
+                       shader == PIPE_SHADER_TESS_EVAL);
+
+       case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+               return sscreen->llvm_has_working_vgpr_indexing ||
+                      /* TCS stores outputs directly to memory. */
+                      shader == PIPE_SHADER_TESS_CTRL;
 
        /* Unsupported boolean features. */
        case PIPE_SHADER_CAP_SUBROUTINES:
@@ -1006,6 +1014,11 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
                                            sscreen->b.family <= CHIP_POLARIS12) ||
                                           sscreen->b.family == CHIP_VEGA10 ||
                                           sscreen->b.family == CHIP_RAVEN;
+       /* While it would be nice not to have this flag, we are constrained
+        * by the reality that LLVM 5.0 doesn't have working VGPR indexing
+        * on GFX9.
+        */
+       sscreen->llvm_has_working_vgpr_indexing = sscreen->b.chip_class <= VI;
 
        sscreen->b.has_cp_dma = true;
        sscreen->b.has_streamout = true;
index bd724e80a06608d84b4ff9046a1c0257bf592000..c028aba308153039de8c328d67675aeb3ea3ad44 100644 (file)
@@ -83,6 +83,7 @@ struct si_screen {
        bool                            has_draw_indirect_multi;
        bool                            has_ds_bpermute;
        bool                            has_msaa_sample_loc_bug;
+       bool                            llvm_has_working_vgpr_indexing;
 
        /* Whether shaders are monolithic (1-part) or separate (3-part). */
        bool                            use_monolithic_shaders;
index b37d4b232b15ffef73ce6a30bba034d5780a2b2d..9c4a2343f544b1c388335d807afbc2f8f820a448 100644 (file)
@@ -755,8 +755,7 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
                         * promote allocas into registers when profitable.
                         */
                        if (array_size > 16 ||
-                           /* TODO: VGPR indexing is buggy on GFX9. */
-                           ctx->screen->b.chip_class == GFX9) {
+                           !ctx->screen->llvm_has_working_vgpr_indexing) {
                                array_alloca = LLVMBuildAlloca(builder,
                                        LLVMArrayType(ctx->f32,
                                                      array_size), "array");