radeonsi: silence a Coverity warning
[mesa.git] / src / gallium / drivers / radeonsi / si_state_shaders.c
index 602bbfb705670e1d753184201e45b8f26490d096..313af85a1cb8bd81933695b5992f732d662daad9 100644 (file)
@@ -458,8 +458,10 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
        si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 
        /* We need at least 2 components for LS.
-        * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
-       vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1;
+        * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
+        * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded.
+        */
+       vgpr_comp_cnt = shader->info.uses_instanceid ? 2 : 1;
 
        si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
        si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
@@ -491,13 +493,10 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
                si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, va >> 40);
 
                /* We need at least 2 components for LS.
-                * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
-               ls_vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1;
-
-               if (shader->config.scratch_bytes_per_wave) {
-                       fprintf(stderr, "HS: scratch buffer unsupported");
-                       abort();
-               }
+                * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
+                * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded.
+                */
+               ls_vgpr_comp_cnt = shader->info.uses_instanceid ? 2 : 1;
 
                shader->config.rsrc2 =
                        S_00B42C_USER_SGPR(GFX9_TCS_NUM_USER_SGPR) |
@@ -544,10 +543,11 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
        si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 
        if (shader->selector->type == PIPE_SHADER_VERTEX) {
-               vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0;
+               /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
+               vgpr_comp_cnt = shader->info.uses_instanceid ? 1 : 0;
                num_user_sgprs = SI_VS_NUM_USER_SGPR;
        } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
-               vgpr_comp_cnt = 3; /* all components are needed for TES */
+               vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
                num_user_sgprs = SI_TES_NUM_USER_SGPR;
        } else
                unreachable("invalid shader selector type");
@@ -759,9 +759,10 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
                struct gfx9_gs_info gs_info;
 
                if (es_type == PIPE_SHADER_VERTEX)
-                       es_vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0;
+                       /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
+                       es_vgpr_comp_cnt = shader->info.uses_instanceid ? 1 : 0;
                else if (es_type == PIPE_SHADER_TESS_EVAL)
-                       es_vgpr_comp_cnt = 3; /* all components are needed for TES */
+                       es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2;
                else
                        unreachable("invalid shader selector type");
 
@@ -810,11 +811,6 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 
                polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es,
                                             NULL, pm4);
-
-               if (shader->config.scratch_bytes_per_wave) {
-                       fprintf(stderr, "GS: scratch buffer unsupported");
-                       abort();
-               }
        } else {
                si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
                si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
@@ -847,7 +843,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
        unsigned oc_lds_en;
        unsigned window_space =
           shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
-       bool enable_prim_id = si_vs_exports_prim_id(shader);
+       bool enable_prim_id = shader->key.mono.vs_export_prim_id || shader->selector->info.uses_primid;
 
        pm4 = si_get_shader_pm4_state(shader);
        if (!pm4)
@@ -861,8 +857,17 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
         * not sent again.
         */
        if (!gs) {
-               si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE,
-                              S_028A40_MODE(enable_prim_id ? V_028A40_GS_SCENARIO_A : 0));
+               unsigned mode = 0;
+
+               /* PrimID needs GS scenario A.
+                * GFX9 also needs it when ViewportIndex is enabled.
+                */
+               if (enable_prim_id ||
+                   (sscreen->b.chip_class >= GFX9 &&
+                    shader->selector->info.writes_viewport_index))
+                       mode = V_028A40_GS_SCENARIO_A;
+
+               si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, S_028A40_MODE(mode));
                si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, enable_prim_id);
        } else {
                si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(gs));
@@ -876,10 +881,14 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
                vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
                num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
        } else if (shader->selector->type == PIPE_SHADER_VERTEX) {
-               vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : (enable_prim_id ? 2 : 0);
+               /* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID)
+                * If PrimID is disabled. InstanceID / StepRate1 is loaded instead.
+                * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded.
+                */
+               vgpr_comp_cnt = enable_prim_id ? 2 : (shader->info.uses_instanceid ? 1 : 0);
                num_user_sgprs = SI_VS_NUM_USER_SGPR;
        } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
-               vgpr_comp_cnt = 3; /* all components are needed for TES */
+               vgpr_comp_cnt = enable_prim_id ? 3 : 2;
                num_user_sgprs = SI_TES_NUM_USER_SGPR;
        } else
                unreachable("invalid shader selector type");
@@ -1222,22 +1231,17 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx,
 
        /* Find out which VS outputs aren't used by the PS. */
        uint64_t outputs_written = vs->outputs_written;
-       uint32_t outputs_written2 = vs->outputs_written2;
        uint64_t inputs_read = 0;
-       uint32_t inputs_read2 = 0;
 
        outputs_written &= ~0x3; /* ignore POSITION, PSIZE */
 
        if (!ps_disabled) {
                inputs_read = ps->inputs_read;
-               inputs_read2 = ps->inputs_read2;
        }
 
        uint64_t linked = outputs_written & inputs_read;
-       uint32_t linked2 = outputs_written2 & inputs_read2;
 
        key->opt.hw_vs.kill_outputs = ~linked & outputs_written;
-       key->opt.hw_vs.kill_outputs2 = ~linked2 & outputs_written2;
 }
 
 /* Compute the key for the hw shader variant */
@@ -1261,7 +1265,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
                        si_shader_selector_key_hw_vs(sctx, sel, key);
 
                        if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
-                               key->part.vs.epilog.export_prim_id = 1;
+                               key->mono.vs_export_prim_id = 1;
                }
                break;
        case PIPE_SHADER_TESS_CTRL:
@@ -1286,7 +1290,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
                        si_shader_selector_key_hw_vs(sctx, sel, key);
 
                        if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
-                               key->part.tes.epilog.export_prim_id = 1;
+                               key->mono.vs_export_prim_id = 1;
                }
                break;
        case PIPE_SHADER_GEOMETRY:
@@ -1504,6 +1508,19 @@ static bool si_check_missing_main_part(struct si_screen *sscreen,
        return true;
 }
 
+static void si_destroy_shader_selector(struct si_context *sctx,
+                                      struct si_shader_selector *sel);
+
+static void si_shader_selector_reference(struct si_context *sctx,
+                                        struct si_shader_selector **dst,
+                                        struct si_shader_selector *src)
+{
+       if (pipe_reference(&(*dst)->reference, &src->reference))
+               si_destroy_shader_selector(sctx, *dst);
+
+       *dst = src;
+}
+
 /* Select the hw shader variant depending on the current state. */
 static int si_shader_select_with_key(struct si_screen *sscreen,
                                     struct si_shader_ctx_state *state,
@@ -1512,6 +1529,7 @@ static int si_shader_select_with_key(struct si_screen *sscreen,
                                     int thread_index)
 {
        struct si_shader_selector *sel = state->cso;
+       struct si_shader_selector *previous_stage_sel = NULL;
        struct si_shader *current = state->current;
        struct si_shader *iter, *shader = NULL;
 
@@ -1579,6 +1597,14 @@ again:
        shader->key = *key;
        shader->compiler_ctx_state = *compiler_state;
 
+       /* If this is a merged shader, get the first shader's selector. */
+       if (sscreen->b.chip_class >= GFX9) {
+               if (sel->type == PIPE_SHADER_TESS_CTRL)
+                       previous_stage_sel = key->part.tcs.ls;
+               else if (sel->type == PIPE_SHADER_GEOMETRY)
+                       previous_stage_sel = key->part.gs.es;
+       }
+
        /* Compile the main shader part if it doesn't exist. This can happen
         * if the initial guess was wrong. */
        bool is_pure_monolithic =
@@ -1595,22 +1621,18 @@ again:
                 * For merged shaders, check that the starting shader's main
                 * part is present.
                 */
-               if (sscreen->b.chip_class >= GFX9 &&
-                   (sel->type == PIPE_SHADER_TESS_CTRL ||
-                    sel->type == PIPE_SHADER_GEOMETRY)) {
-                       struct si_shader_selector *shader1 = NULL;
+               if (previous_stage_sel) {
                        struct si_shader_key shader1_key = zeroed;
 
-                       if (sel->type == PIPE_SHADER_TESS_CTRL) {
-                               shader1 = key->part.tcs.ls;
+                       if (sel->type == PIPE_SHADER_TESS_CTRL)
                                shader1_key.as_ls = 1;
-                       } else if (sel->type == PIPE_SHADER_GEOMETRY) {
-                               shader1 = key->part.gs.es;
+                       else if (sel->type == PIPE_SHADER_GEOMETRY)
                                shader1_key.as_es = 1;
-                       else
+                       else
                                assert(0);
 
-                       ok = si_check_missing_main_part(sscreen, shader1,
+                       ok = si_check_missing_main_part(sscreen,
+                                                       previous_stage_sel,
                                                        compiler_state, &shader1_key);
                } else {
                        ok = si_check_missing_main_part(sscreen, sel,
@@ -1623,6 +1645,15 @@ again:
                }
        }
 
+       /* Keep the reference to the 1st shader of merged shaders, so that
+        * Gallium can't destroy it before we destroy the 2nd shader.
+        *
+        * Set sctx = NULL, because it's unused if we're not releasing
+        * the shader, and we don't have any sctx here.
+        */
+       si_shader_selector_reference(NULL, &shader->previous_stage_sel,
+                                    previous_stage_sel);
+
        /* Monolithic-only shaders don't make a distinction between optimized
         * and unoptimized. */
        shader->is_monolithic =
@@ -1807,10 +1838,10 @@ void si_init_shader_selector_async(void *job, int thread_index)
                                switch (name) {
                                case TGSI_SEMANTIC_GENERIC:
                                        /* don't process indices the function can't handle */
-                                       if (index >= 60)
+                                       if (index >= SI_MAX_IO_GENERIC)
                                                break;
                                        /* fall through */
-                               case TGSI_SEMANTIC_CLIPDIST:
+                               default:
                                        id = si_shader_io_get_unique_index(name, index);
                                        sel->outputs_written &= ~(1ull << id);
                                        break;
@@ -1819,9 +1850,6 @@ void si_init_shader_selector_async(void *job, int thread_index)
                                case TGSI_SEMANTIC_CLIPVERTEX:
                                case TGSI_SEMANTIC_EDGEFLAG:
                                        break;
-                               default:
-                                       id = si_shader_io_get_unique_index2(name, index);
-                                       sel->outputs_written2 &= ~(1u << id);
                                }
                        }
                }
@@ -1886,6 +1914,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
        if (!sel)
                return NULL;
 
+       pipe_reference_init(&sel->reference, 1);
        sel->screen = sscreen;
        sel->compiler_ctx_state.tm = sctx->tm;
        sel->compiler_ctx_state.debug = sctx->b.debug;
@@ -1944,8 +1973,8 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
        case PIPE_SHADER_TESS_CTRL:
                /* Always reserve space for these. */
                sel->patch_outputs_written |=
-                       (1llu << si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0)) |
-                       (1llu << si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0));
+                       (1llu << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0)) |
+                       (1llu << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0));
                /* fall through */
        case PIPE_SHADER_VERTEX:
        case PIPE_SHADER_TESS_EVAL:
@@ -1958,26 +1987,21 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
                        case TGSI_SEMANTIC_TESSOUTER:
                        case TGSI_SEMANTIC_PATCH:
                                sel->patch_outputs_written |=
-                                       1llu << si_shader_io_get_unique_index(name, index);
+                                       1llu << si_shader_io_get_unique_index_patch(name, index);
                                break;
 
                        case TGSI_SEMANTIC_GENERIC:
                                /* don't process indices the function can't handle */
-                               if (index >= 60)
+                               if (index >= SI_MAX_IO_GENERIC)
                                        break;
                                /* fall through */
-                       case TGSI_SEMANTIC_POSITION:
-                       case TGSI_SEMANTIC_PSIZE:
-                       case TGSI_SEMANTIC_CLIPDIST:
+                       default:
                                sel->outputs_written |=
                                        1llu << si_shader_io_get_unique_index(name, index);
                                break;
                        case TGSI_SEMANTIC_CLIPVERTEX: /* ignore these */
                        case TGSI_SEMANTIC_EDGEFLAG:
                                break;
-                       default:
-                               sel->outputs_written2 |=
-                                       1u << si_shader_io_get_unique_index2(name, index);
                        }
                }
                sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
@@ -1995,16 +2019,17 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
                        unsigned index = sel->info.input_semantic_index[i];
 
                        switch (name) {
-                       case TGSI_SEMANTIC_CLIPDIST:
                        case TGSI_SEMANTIC_GENERIC:
+                               /* don't process indices the function can't handle */
+                               if (index >= SI_MAX_IO_GENERIC)
+                                       break;
+                               /* fall through */
+                       default:
                                sel->inputs_read |=
                                        1llu << si_shader_io_get_unique_index(name, index);
                                break;
                        case TGSI_SEMANTIC_PCOORD: /* ignore this */
                                break;
-                       default:
-                               sel->inputs_read2 |=
-                                       1u << si_shader_io_get_unique_index2(name, index);
                        }
                }
 
@@ -2101,6 +2126,19 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
        r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
 }
 
+static void si_update_tess_uses_prim_id(struct si_context *sctx)
+{
+       sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
+               (sctx->tes_shader.cso &&
+                sctx->tes_shader.cso->info.uses_primid) ||
+               (sctx->tcs_shader.cso &&
+                sctx->tcs_shader.cso->info.uses_primid) ||
+               (sctx->gs_shader.cso &&
+                sctx->gs_shader.cso->info.uses_primid) ||
+               (sctx->ps_shader.cso && !sctx->gs_shader.cso &&
+                sctx->ps_shader.cso->info.uses_primid);
+}
+
 static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
@@ -2117,20 +2155,14 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
        si_mark_atom_dirty(sctx, &sctx->clip_regs);
        sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 
-       if (enable_changed)
+       if (enable_changed) {
                si_shader_change_notify(sctx);
+               if (sctx->ia_multi_vgt_param_key.u.uses_tess)
+                       si_update_tess_uses_prim_id(sctx);
+       }
        r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
 }
 
-static void si_update_tcs_tes_uses_prim_id(struct si_context *sctx)
-{
-       sctx->ia_multi_vgt_param_key.u.tcs_tes_uses_prim_id =
-               (sctx->tes_shader.cso &&
-                sctx->tes_shader.cso->info.uses_primid) ||
-               (sctx->tcs_shader.cso &&
-                sctx->tcs_shader.cso->info.uses_primid);
-}
-
 static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
@@ -2142,7 +2174,7 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
 
        sctx->tcs_shader.cso = sel;
        sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
-       si_update_tcs_tes_uses_prim_id(sctx);
+       si_update_tess_uses_prim_id(sctx);
        sctx->do_update_shaders = true;
 
        if (enable_changed)
@@ -2161,7 +2193,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
        sctx->tes_shader.cso = sel;
        sctx->tes_shader.current = sel ? sel->first_variant : NULL;
        sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL;
-       si_update_tcs_tes_uses_prim_id(sctx);
+       si_update_tess_uses_prim_id(sctx);
        sctx->do_update_shaders = true;
        si_mark_atom_dirty(sctx, &sctx->clip_regs);
        sctx->last_rast_prim = -1; /* reset this so that it gets updated */
@@ -2185,6 +2217,8 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
        sctx->ps_shader.cso = sel;
        sctx->ps_shader.current = sel ? sel->first_variant : NULL;
        sctx->do_update_shaders = true;
+       if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess)
+               si_update_tess_uses_prim_id(sctx);
        si_mark_atom_dirty(sctx, &sctx->cb_render_state);
 }
 
@@ -2231,14 +2265,14 @@ static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
                }
        }
 
+       si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL);
        si_shader_destroy(shader);
        free(shader);
 }
 
-static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
+static void si_destroy_shader_selector(struct si_context *sctx,
+                                      struct si_shader_selector *sel)
 {
-       struct si_context *sctx = (struct si_context *)ctx;
-       struct si_shader_selector *sel = (struct si_shader_selector *)state;
        struct si_shader *p = sel->first_variant, *c;
        struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = {
                [PIPE_SHADER_VERTEX] = &sctx->vs_shader,
@@ -2276,6 +2310,14 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
        free(sel);
 }
 
+static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
+{
+       struct si_context *sctx = (struct si_context *)ctx;
+       struct si_shader_selector *sel = (struct si_shader_selector *)state;
+
+       si_shader_selector_reference(sctx, &sel, NULL);
+}
+
 static unsigned si_get_ps_input_cntl(struct si_context *sctx,
                                     struct si_shader *vs, unsigned name,
                                     unsigned index, unsigned interpolate)
@@ -2531,6 +2573,22 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx)
        return true;
 }
 
+static void si_shader_lock(struct si_shader *shader)
+{
+       mtx_lock(&shader->selector->mutex);
+       if (shader->previous_stage_sel) {
+               assert(shader->previous_stage_sel != shader->selector);
+               mtx_lock(&shader->previous_stage_sel->mutex);
+       }
+}
+
+static void si_shader_unlock(struct si_shader *shader)
+{
+       if (shader->previous_stage_sel)
+               mtx_unlock(&shader->previous_stage_sel->mutex);
+       mtx_unlock(&shader->selector->mutex);
+}
+
 /**
  * @returns 1 if \p sel has been updated to use a new scratch buffer
  *          0 if not
@@ -2549,25 +2607,40 @@ static int si_update_scratch_buffer(struct si_context *sctx,
        if (shader->config.scratch_bytes_per_wave == 0)
                return 0;
 
+       /* Prevent race conditions when updating:
+        * - si_shader::scratch_bo
+        * - si_shader::binary::code
+        * - si_shader::previous_stage::binary::code.
+        */
+       si_shader_lock(shader);
+
        /* This shader is already configured to use the current
         * scratch buffer. */
-       if (shader->scratch_bo == sctx->scratch_buffer)
+       if (shader->scratch_bo == sctx->scratch_buffer) {
+               si_shader_unlock(shader);
                return 0;
+       }
 
        assert(sctx->scratch_buffer);
 
-       si_shader_apply_scratch_relocs(sctx, shader, &shader->config, scratch_va);
+       if (shader->previous_stage)
+               si_shader_apply_scratch_relocs(shader->previous_stage, scratch_va);
+
+       si_shader_apply_scratch_relocs(shader, scratch_va);
 
        /* Replace the shader bo with a new bo that has the relocs applied. */
        r = si_shader_binary_upload(sctx->screen, shader);
-       if (r)
+       if (r) {
+               si_shader_unlock(shader);
                return r;
+       }
 
        /* Update the shader state to use the new shader bo. */
        si_shader_init_pm4_state(sctx->screen, shader);
 
        r600_resource_reference(&shader->scratch_bo, sctx->scratch_buffer);
 
+       si_shader_unlock(shader);
        return 1;
 }
 
@@ -2593,6 +2666,60 @@ static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
        return bytes;
 }
 
+static bool si_update_scratch_relocs(struct si_context *sctx)
+{
+       int r;
+
+       /* Update the shaders, so that they are using the latest scratch.
+        * The scratch buffer may have been changed since these shaders were
+        * last used, so we still need to try to update them, even if they
+        * require scratch buffers smaller than the current size.
+        */
+       r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
+       if (r < 0)
+               return false;
+       if (r == 1)
+               si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
+
+       r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
+       if (r < 0)
+               return false;
+       if (r == 1)
+               si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
+
+       r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
+       if (r < 0)
+               return false;
+       if (r == 1)
+               si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
+
+       /* VS can be bound as LS, ES, or VS. */
+       r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
+       if (r < 0)
+               return false;
+       if (r == 1) {
+               if (sctx->tes_shader.current)
+                       si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
+               else if (sctx->gs_shader.current)
+                       si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
+               else
+                       si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
+       }
+
+       /* TES can be bound as ES or VS. */
+       r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
+       if (r < 0)
+               return false;
+       if (r == 1) {
+               if (sctx->gs_shader.current)
+                       si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
+               else
+                       si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
+       }
+
+       return true;
+}
+
 static bool si_update_spi_tmpring_size(struct si_context *sctx)
 {
        unsigned current_scratch_buffer_size =
@@ -2602,7 +2729,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
        unsigned scratch_needed_size = scratch_bytes_per_wave *
                sctx->scratch_waves;
        unsigned spi_tmpring_size;
-       int r;
 
        if (scratch_needed_size > 0) {
                if (scratch_needed_size > current_scratch_buffer_size) {
@@ -2622,52 +2748,8 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
                                                       &sctx->scratch_buffer->b.b);
                }
 
-               /* Update the shaders, so they are using the latest scratch.  The
-                * scratch buffer may have been changed since these shaders were
-                * last used, so we still need to try to update them, even if
-                * they require scratch buffers smaller than the current size.
-                */
-               r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
-               if (r < 0)
+               if (!si_update_scratch_relocs(sctx))
                        return false;
-               if (r == 1)
-                       si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
-
-               r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
-               if (r < 0)
-                       return false;
-               if (r == 1)
-                       si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
-
-               r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
-               if (r < 0)
-                       return false;
-               if (r == 1)
-                       si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
-
-               /* VS can be bound as LS, ES, or VS. */
-               r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
-               if (r < 0)
-                       return false;
-               if (r == 1) {
-                       if (sctx->tes_shader.current)
-                               si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
-                       else if (sctx->gs_shader.current)
-                               si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
-                       else
-                               si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
-               }
-
-               /* TES can be bound as ES or VS. */
-               r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
-               if (r < 0)
-                       return false;
-               if (r == 1) {
-                       if (sctx->gs_shader.current)
-                               si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
-                       else
-                               si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
-               }
        }
 
        /* The LLVM shader backend should be reporting aligned scratch_sizes. */
@@ -2720,11 +2802,14 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
        }
 
        assert(!sctx->tf_ring);
+       /* Use 64K alignment for both rings, so that we can pass the address
+        * to shaders as one SGPR containing bits [16:47].
+        */
        sctx->tf_ring = r600_aligned_buffer_create(sctx->b.b.screen,
                                                   R600_RESOURCE_FLAG_UNMAPPABLE,
                                                   PIPE_USAGE_DEFAULT,
                                                   32768 * sctx->screen->b.info.max_se,
-                                                  256);
+                                                  64 * 1024);
        if (!sctx->tf_ring)
                return;
 
@@ -2736,12 +2821,22 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
                                           PIPE_USAGE_DEFAULT,
                                           max_offchip_buffers *
                                           sctx->screen->tess_offchip_block_dw_size * 4,
-                                          256);
+                                          64 * 1024);
        if (!sctx->tess_offchip_ring)
                return;
 
        si_init_config_add_vgt_flush(sctx);
 
+       uint64_t offchip_va = r600_resource(sctx->tess_offchip_ring)->gpu_address;
+       uint64_t factor_va = r600_resource(sctx->tf_ring)->gpu_address;
+       assert((offchip_va & 0xffff) == 0);
+       assert((factor_va & 0xffff) == 0);
+
+       si_pm4_add_bo(sctx->init_config, r600_resource(sctx->tess_offchip_ring),
+                     RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS);
+       si_pm4_add_bo(sctx->init_config, r600_resource(sctx->tf_ring),
+                     RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS);
+
        /* Append these registers to the init config state. */
        if (sctx->b.chip_class >= CIK) {
                if (sctx->b.chip_class >= VI)
@@ -2750,10 +2845,10 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
                si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE,
                               S_030938_SIZE(sctx->tf_ring->width0 / 4));
                si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE,
-                              r600_resource(sctx->tf_ring)->gpu_address >> 8);
+                              factor_va >> 8);
                if (sctx->b.chip_class >= GFX9)
                        si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI,
-                                      r600_resource(sctx->tf_ring)->gpu_address >> 40);
+                                      factor_va >> 40);
                si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM,
                             S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
                             S_03093C_OFFCHIP_GRANULARITY(offchip_granularity));
@@ -2762,24 +2857,37 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
                si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE,
                               S_008988_SIZE(sctx->tf_ring->width0 / 4));
                si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE,
-                              r600_resource(sctx->tf_ring)->gpu_address >> 8);
+                              factor_va >> 8);
                si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
                               S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers));
        }
 
+       if (sctx->b.chip_class >= GFX9) {
+               si_pm4_set_reg(sctx->init_config,
+                              R_00B430_SPI_SHADER_USER_DATA_LS_0 +
+                              GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K * 4,
+                              offchip_va >> 16);
+               si_pm4_set_reg(sctx->init_config,
+                              R_00B430_SPI_SHADER_USER_DATA_LS_0 +
+                              GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K * 4,
+                              factor_va >> 16);
+       } else {
+               si_pm4_set_reg(sctx->init_config,
+                              R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+                              GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K * 4,
+                              offchip_va >> 16);
+               si_pm4_set_reg(sctx->init_config,
+                              R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+                              GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K * 4,
+                              factor_va >> 16);
+       }
+
        /* Flush the context to re-emit the init_config state.
         * This is done only once in a lifetime of a context.
         */
        si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
        sctx->b.initial_gfx_cs_size = 0; /* force flush */
        si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL);
-
-       si_set_ring_buffer(&sctx->b.b, SI_HS_RING_TESS_FACTOR, sctx->tf_ring,
-                          0, sctx->tf_ring->width0, false, false, 0, 0, 0);
-
-       si_set_ring_buffer(&sctx->b.b, SI_HS_RING_TESS_OFFCHIP,
-                          sctx->tess_offchip_ring, 0,
-                          sctx->tess_offchip_ring->width0, false, false, 0, 0, 0);
 }
 
 /**