gallium: remove pipe_index_buffer and set_index_buffer

[mesa.git] / src / gallium / drivers / radeonsi / si_state_draw.c
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c

index 7bf4f4ddb87d31e17a052db7ff669907e496efc4..1de523c393f2bece3bbd1fb61697b9ee403e136e 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -96,7 +96,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
                                        unsigned *num_patches)
  {
         struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-       struct si_shader_ctx_state *ls = &sctx->vs_shader;
+       struct si_shader *ls_current;
+       struct si_shader_selector *ls;
         /* The TES pointer will only be used for sctx->last_tcs.
          * It would be wrong to think that TCS = TES. */
         struct si_shader_selector *tcs =
@@ -107,11 +108,24 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
         unsigned num_tcs_patch_outputs;
         unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
         unsigned input_patch_size, output_patch_size, output_patch0_offset;
-       unsigned perpatch_output_offset, lds_size, ls_rsrc2;
+       unsigned perpatch_output_offset, lds_size;
         unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
         unsigned offchip_layout, hardware_lds_size, ls_hs_config;
  
-       if (sctx->last_ls == ls->current &&
+       /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */
+       if (sctx->b.chip_class >= GFX9) {
+               if (sctx->tcs_shader.cso)
+                       ls_current = sctx->tcs_shader.current;
+               else
+                       ls_current = sctx->fixed_func_tcs_shader.current;
+
+               ls = ls_current->key.part.tcs.ls;
+       } else {
+               ls_current = sctx->vs_shader.current;
+               ls = sctx->vs_shader.cso;
+       }
+
+       if (sctx->last_ls == ls_current &&
             sctx->last_tcs == tcs &&
             sctx->last_tes_sh_base == tes_sh_base &&
             sctx->last_num_tcs_input_cp == num_tcs_input_cp) {
@@ -119,14 +133,14 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
                 return;
         }
  
-       sctx->last_ls = ls->current;
+       sctx->last_ls = ls_current;
         sctx->last_tcs = tcs;
         sctx->last_tes_sh_base = tes_sh_base;
         sctx->last_num_tcs_input_cp = num_tcs_input_cp;
  
         /* This calculates how shader inputs and outputs among VS, TCS, and TES
          * are laid out in LDS. */
-       num_tcs_inputs = util_last_bit64(ls->cso->outputs_written);
+       num_tcs_inputs = util_last_bit64(ls->outputs_written);
  
         if (sctx->tcs_shader.cso) {
                 num_tcs_outputs = util_last_bit64(tcs->outputs_written);
@@ -170,10 +184,26 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
          */
         *num_patches = MIN2(*num_patches, 40);
  
-       /* SI bug workaround - limit LS-HS threadgroups to only one wave. */
         if (sctx->b.chip_class == SI) {
+               /* SI bug workaround, related to power management. Limit LS-HS
+                * threadgroups to only one wave.
+                */
                 unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
                 *num_patches = MIN2(*num_patches, one_wave);
+
+               if (sctx->screen->b.info.max_se == 1) {
+                       /* The VGT HS block increments the patch ID unconditionally
+                        * within a single threadgroup. This results in incorrect
+                        * patch IDs when instanced draws are used.
+                        *
+                        * The intended solution is to restrict threadgroups to
+                        * a single instance by setting SWITCH_ON_EOI, which
+                        * should cause IA to split instances up. However, this
+                        * doesn't work correctly on SI when there is no other
+                        * SE to switch to.
+                        */
+                       *num_patches = 1;
+               }
         }
  
         sctx->last_num_patches = *num_patches;
@@ -181,27 +211,6 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
         output_patch0_offset = input_patch_size * *num_patches;
         perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
  
-       lds_size = output_patch0_offset + output_patch_size * *num_patches;
-       ls_rsrc2 = ls->current->config.rsrc2;
-
-       if (sctx->b.chip_class >= CIK) {
-               assert(lds_size <= 65536);
-               lds_size = align(lds_size, 512) / 512;
-       } else {
-               assert(lds_size <= 32768);
-               lds_size = align(lds_size, 256) / 256;
-       }
-       si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
-       ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
-
-       /* Due to a hw bug, RSRC2_LS must be written twice with another
-        * LS register written in between. */
-       if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
-               radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
-       radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-       radeon_emit(cs, ls->current->config.rsrc1);
-       radeon_emit(cs, ls_rsrc2);
-
         /* Compute userdata SGPRs. */
         assert(((input_vertex_size / 4) & ~0xff) == 0);
         assert(((output_vertex_size / 4) & ~0xff) == 0);
@@ -218,25 +227,66 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
                          ((output_vertex_size / 4) << 13);
         tcs_out_offsets = (output_patch0_offset / 16) |
                           ((perpatch_output_offset / 16) << 16);
-       offchip_layout = (pervertex_output_patch_size * *num_patches << 16) |
-                        (num_tcs_output_cp << 9) | *num_patches;
+       offchip_layout = *num_patches |
+                        (num_tcs_output_cp << 6) |
+                        (pervertex_output_patch_size * *num_patches << 12);
  
-       /* Set them for LS. */
-       radeon_set_sh_reg(cs,
-               R_00B530_SPI_SHADER_USER_DATA_LS_0 + SI_SGPR_LS_OUT_LAYOUT * 4,
-               tcs_in_layout);
+       /* Compute the LDS size. */
+       lds_size = output_patch0_offset + output_patch_size * *num_patches;
  
-       /* Set them for TCS. */
-       radeon_set_sh_reg_seq(cs,
-               R_00B430_SPI_SHADER_USER_DATA_HS_0 + SI_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
-       radeon_emit(cs, offchip_layout);
-       radeon_emit(cs, tcs_out_offsets);
-       radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26));
-       radeon_emit(cs, tcs_in_layout);
+       if (sctx->b.chip_class >= CIK) {
+               assert(lds_size <= 65536);
+               lds_size = align(lds_size, 512) / 512;
+       } else {
+               assert(lds_size <= 32768);
+               lds_size = align(lds_size, 256) / 256;
+       }
+
+       /* Set SI_SGPR_VS_STATE_BITS. */
+       sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE &
+                                 C_VS_STATE_LS_OUT_VERTEX_SIZE;
+       sctx->current_vs_state |= tcs_in_layout;
+
+       if (sctx->b.chip_class >= GFX9) {
+               unsigned hs_rsrc2 = ls_current->config.rsrc2 |
+                                   S_00B42C_LDS_SIZE(lds_size);
+
+               radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2);
+
+               /* Set userdata SGPRs for merged LS-HS. */
+               radeon_set_sh_reg_seq(cs,
+                                     R_00B430_SPI_SHADER_USER_DATA_LS_0 +
+                                     GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);
+               radeon_emit(cs, offchip_layout);
+               radeon_emit(cs, tcs_out_offsets);
+               radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26));
+       } else {
+               unsigned ls_rsrc2 = ls_current->config.rsrc2;
+
+               si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
+               ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
+
+               /* Due to a hw bug, RSRC2_LS must be written twice with another
+                * LS register written in between. */
+               if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
+                       radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
+               radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
+               radeon_emit(cs, ls_current->config.rsrc1);
+               radeon_emit(cs, ls_rsrc2);
+
+               /* Set userdata SGPRs for TCS. */
+               radeon_set_sh_reg_seq(cs,
+                       R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
+               radeon_emit(cs, offchip_layout);
+               radeon_emit(cs, tcs_out_offsets);
+               radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26));
+               radeon_emit(cs, tcs_in_layout);
+       }
  
-       /* Set them for TES. */
-       radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TCS_OFFCHIP_LAYOUT * 4, 1);
+       /* Set userdata SGPRs for TES. */
+       radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
         radeon_emit(cs, offchip_layout);
+       radeon_emit(cs, r600_resource(sctx->tess_offchip_ring)->gpu_address >> 16);
  
         ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) |
                        S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
@@ -291,13 +341,15 @@ si_get_init_multi_vgt_param(struct si_screen *sscreen,
                 /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
                 if (sscreen->has_distributed_tess) {
                         if (key->u.uses_gs) {
-                               partial_es_wave = true;
+                               if (sscreen->b.chip_class <= VI)
+                                       partial_es_wave = true;
  
                                 /* GPU hang workaround. */
                                 if (sscreen->b.family == CHIP_TONGA ||
                                     sscreen->b.family == CHIP_FIJI ||
                                     sscreen->b.family == CHIP_POLARIS10 ||
-                                   sscreen->b.family == CHIP_POLARIS11)
+                                   sscreen->b.family == CHIP_POLARIS11 ||
+                                   sscreen->b.family == CHIP_POLARIS12)
                                         partial_vs_wave = true;
                         } else {
                                 partial_vs_wave = true;
@@ -371,7 +423,7 @@ si_get_init_multi_vgt_param(struct si_screen *sscreen,
         }
  
         /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
-       if (ia_switch_on_eoi)
+       if (sscreen->b.chip_class <= VI && ia_switch_on_eoi)
                 partial_es_wave = true;
  
         return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
@@ -379,7 +431,8 @@ si_get_init_multi_vgt_param(struct si_screen *sscreen,
                 S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
                 S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
                 S_028AA8_WD_SWITCH_ON_EOP(sscreen->b.chip_class >= CIK ? wd_switch_on_eop : 0) |
-               S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->b.chip_class >= VI ?
+               /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
+               S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->b.chip_class == VI ?
                                              max_primgroup_in_wave : 0) |
                 S_030960_EN_INST_OPT_BASIC(sscreen->b.chip_class >= GFX9) |
                 S_030960_EN_INST_OPT_ADV(sscreen->b.chip_class >= GFX9);
@@ -445,7 +498,8 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
  
         if (sctx->gs_shader.cso) {
                 /* GS requirement. */
-               if (SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
+               if (sctx->b.chip_class <= VI &&
+                   SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
                         ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);
  
                 /* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
@@ -494,13 +548,18 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
         sctx->last_sc_line_stipple = rs->pa_sc_line_stipple;
  }
  
-static void si_emit_vs_state(struct si_context *sctx)
+static void si_emit_vs_state(struct si_context *sctx,
+                            const struct pipe_draw_info *info)
  {
+       sctx->current_vs_state &= C_VS_STATE_INDEXED;
+       sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size);
+
         if (sctx->current_vs_state != sctx->last_vs_state) {
                 struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
  
                 radeon_set_sh_reg(cs,
-                       R_00B130_SPI_SHADER_USER_DATA_VS_0 + SI_SGPR_VS_STATE_BITS * 4,
+                       sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX] +
+                       SI_SGPR_VS_STATE_BITS * 4,
                         sctx->current_vs_state);
  
                 sctx->last_vs_state = sctx->current_vs_state;
@@ -566,8 +625,11 @@ static void si_emit_draw_registers(struct si_context *sctx,
  
  static void si_emit_draw_packets(struct si_context *sctx,
                                  const struct pipe_draw_info *info,
-                                const struct pipe_index_buffer *ib)
+                                struct pipe_resource *indexbuf,
+                                unsigned index_size,
+                                unsigned index_offset)
  {
+       struct pipe_draw_indirect_info *indirect = info->indirect;
         struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
         unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
         bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
@@ -598,12 +660,12 @@ static void si_emit_draw_packets(struct si_context *sctx,
         }
  
         /* draw packet */
-       if (info->indexed) {
-               if (ib->index_size != sctx->last_index_size) {
+       if (index_size) {
+               if (index_size != sctx->last_index_size) {
                         unsigned index_type;
  
                         /* index type */
-                       switch (ib->index_size) {
+                       switch (index_size) {
                         case 1:
                                 index_type = V_028A7C_VGT_INDEX_8;
                                 break;
@@ -630,15 +692,15 @@ static void si_emit_draw_packets(struct si_context *sctx,
                                 radeon_emit(cs, index_type);
                         }
  
-                       sctx->last_index_size = ib->index_size;
+                       sctx->last_index_size = index_size;
                 }
  
-               index_max_size = (ib->buffer->width0 - ib->offset) /
-                                 ib->index_size;
-               index_va = r600_resource(ib->buffer)->gpu_address + ib->offset;
+               index_max_size = (indexbuf->width0 - index_offset) /
+                                 index_size;
+               index_va = r600_resource(indexbuf)->gpu_address + index_offset;
  
                 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-                                     (struct r600_resource *)ib->buffer,
+                                     (struct r600_resource *)indexbuf,
                                       RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
         } else {
                 /* On CI and later, non-indexed draws overwrite VGT_INDEX_TYPE,
@@ -648,32 +710,8 @@ static void si_emit_draw_packets(struct si_context *sctx,
                         sctx->last_index_size = -1;
         }
  
-       if (!info->indirect) {
-               int base_vertex;
-
-               radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
-               radeon_emit(cs, info->instance_count);
-
-               /* Base vertex and start instance. */
-               base_vertex = info->indexed ? info->index_bias : info->start;
-
-               if (base_vertex != sctx->last_base_vertex ||
-                   sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
-                   info->start_instance != sctx->last_start_instance ||
-                   info->drawid != sctx->last_drawid ||
-                   sh_base_reg != sctx->last_sh_base_reg) {
-                       radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);
-                       radeon_emit(cs, base_vertex);
-                       radeon_emit(cs, info->start_instance);
-                       radeon_emit(cs, info->drawid);
-
-                       sctx->last_base_vertex = base_vertex;
-                       sctx->last_start_instance = info->start_instance;
-                       sctx->last_drawid = info->drawid;
-                       sctx->last_sh_base_reg = sh_base_reg;
-               }
-       } else {
-               uint64_t indirect_va = r600_resource(info->indirect)->gpu_address;
+       if (indirect) {
+               uint64_t indirect_va = r600_resource(indirect->buffer)->gpu_address;
  
                 assert(indirect_va % 8 == 0);
  
@@ -685,17 +723,15 @@ static void si_emit_draw_packets(struct si_context *sctx,
                 radeon_emit(cs, indirect_va >> 32);
  
                 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-                                     (struct r600_resource *)info->indirect,
+                                     (struct r600_resource *)indirect->buffer,
                                       RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
-       }
  
-       if (info->indirect) {
-               unsigned di_src_sel = info->indexed ? V_0287F0_DI_SRC_SEL_DMA
+               unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA
                                                     : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
  
-               assert(info->indirect_offset % 4 == 0);
+               assert(indirect->offset % 4 == 0);
  
-               if (info->indexed) {
+               if (index_size) {
                         radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
                         radeon_emit(cs, index_va);
                         radeon_emit(cs, index_va >> 32);
@@ -705,45 +741,69 @@ static void si_emit_draw_packets(struct si_context *sctx,
                 }
  
                 if (!sctx->screen->has_draw_indirect_multi) {
-                       radeon_emit(cs, PKT3(info->indexed ? PKT3_DRAW_INDEX_INDIRECT
+                       radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT
                                                            : PKT3_DRAW_INDIRECT,
                                              3, render_cond_bit));
-                       radeon_emit(cs, info->indirect_offset);
+                       radeon_emit(cs, indirect->offset);
                         radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
                         radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
                         radeon_emit(cs, di_src_sel);
                 } else {
                         uint64_t count_va = 0;
  
-                       if (info->indirect_params) {
+                       if (indirect->indirect_draw_count) {
                                 struct r600_resource *params_buf =
-                                       (struct r600_resource *)info->indirect_params;
+                                       (struct r600_resource *)indirect->indirect_draw_count;
  
                                 radeon_add_to_buffer_list(
                                         &sctx->b, &sctx->b.gfx, params_buf,
                                         RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
  
-                               count_va = params_buf->gpu_address + info->indirect_params_offset;
+                               count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset;
                         }
  
-                       radeon_emit(cs, PKT3(info->indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
+                       radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
                                                              PKT3_DRAW_INDIRECT_MULTI,
                                              8, render_cond_bit));
-                       radeon_emit(cs, info->indirect_offset);
+                       radeon_emit(cs, indirect->offset);
                         radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
                         radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
                         radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) |
                                         S_2C3_DRAW_INDEX_ENABLE(1) |
-                                       S_2C3_COUNT_INDIRECT_ENABLE(!!info->indirect_params));
-                       radeon_emit(cs, info->indirect_count);
+                                       S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count));
+                       radeon_emit(cs, indirect->draw_count);
                         radeon_emit(cs, count_va);
                         radeon_emit(cs, count_va >> 32);
-                       radeon_emit(cs, info->indirect_stride);
+                       radeon_emit(cs, indirect->stride);
                         radeon_emit(cs, di_src_sel);
                 }
         } else {
-               if (info->indexed) {
-                       index_va += info->start * ib->index_size;
+               int base_vertex;
+
+               radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
+               radeon_emit(cs, info->instance_count);
+
+               /* Base vertex and start instance. */
+               base_vertex = index_size ? info->index_bias : info->start;
+
+               if (base_vertex != sctx->last_base_vertex ||
+                   sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
+                   info->start_instance != sctx->last_start_instance ||
+                   info->drawid != sctx->last_drawid ||
+                   sh_base_reg != sctx->last_sh_base_reg) {
+                       radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);
+                       radeon_emit(cs, base_vertex);
+                       radeon_emit(cs, info->start_instance);
+                       radeon_emit(cs, info->drawid);
+
+                       sctx->last_base_vertex = base_vertex;
+                       sctx->last_start_instance = info->start_instance;
+                       sctx->last_drawid = info->drawid;
+                       sctx->last_sh_base_reg = sh_base_reg;
+               }
+
+               if (index_size) {
+                       index_va += info->start * index_size;
  
                         radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
                         radeon_emit(cs, index_max_size);
@@ -1015,17 +1075,19 @@ static void si_get_draw_start_count(struct si_context *sctx,
                                     const struct pipe_draw_info *info,
                                     unsigned *start, unsigned *count)
  {
-       if (info->indirect) {
+       struct pipe_draw_indirect_info *indirect = info->indirect;
+
+       if (indirect) {
                 unsigned indirect_count;
                 struct pipe_transfer *transfer;
                 unsigned begin, end;
                 unsigned map_size;
                 unsigned *data;
  
-               if (info->indirect_params) {
+               if (indirect->indirect_draw_count) {
                         data = pipe_buffer_map_range(&sctx->b.b,
-                                       info->indirect_params,
-                                       info->indirect_params_offset,
+                                       indirect->indirect_draw_count,
+                                       indirect->indirect_draw_count_offset,
                                         sizeof(unsigned),
                                         PIPE_TRANSFER_READ, &transfer);
  
@@ -1033,7 +1095,7 @@ static void si_get_draw_start_count(struct si_context *sctx,
  
                         pipe_buffer_unmap(&sctx->b.b, transfer);
                 } else {
-                       indirect_count = info->indirect_count;
+                       indirect_count = indirect->draw_count;
                 }
  
                 if (!indirect_count) {
@@ -1041,9 +1103,9 @@ static void si_get_draw_start_count(struct si_context *sctx,
                         return;
                 }
  
-               map_size = (indirect_count - 1) * info->indirect_stride + 3 * sizeof(unsigned);
-               data = pipe_buffer_map_range(&sctx->b.b, info->indirect,
-                                            info->indirect_offset, map_size,
+               map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned);
+               data = pipe_buffer_map_range(&sctx->b.b, indirect->buffer,
+                                            indirect->offset, map_size,
                                              PIPE_TRANSFER_READ, &transfer);
  
                 begin = UINT_MAX;
@@ -1058,7 +1120,7 @@ static void si_get_draw_start_count(struct si_context *sctx,
                                 end = MAX2(end, start + count);
                         }
  
-                       data += info->indirect_stride / sizeof(unsigned);
+                       data += indirect->stride / sizeof(unsigned);
                 }
  
                 pipe_buffer_unmap(&sctx->b.b, transfer);
@@ -1100,11 +1162,12 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
  {
         struct si_context *sctx = (struct si_context *)ctx;
         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-       const struct pipe_index_buffer *ib = &sctx->index_buffer;
-       struct pipe_index_buffer ib_tmp; /* for index buffer uploads only */
+       struct pipe_resource *indexbuf = info->index.resource;
         unsigned mask, dirty_tex_counter;
         enum pipe_prim_type rast_prim;
         unsigned num_patches = 0;
+       unsigned index_size = info->index_size;
+       unsigned index_offset = info->indirect ? info->start * index_size : 0;
  
         if (likely(!info->indirect)) {
                 /* SI-CI treat instance_count==0 as instance_count==1. There is
@@ -1116,7 +1179,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
  
                 /* Handle count == 0. */
                 if (unlikely(!info->count &&
-                            (info->indexed || !info->count_from_stream_output)))
+                            (index_size || !info->count_from_stream_output)))
                         return;
         }
  
@@ -1188,74 +1251,73 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
         if (!si_upload_graphics_shader_descriptors(sctx))
                 return;
  
-       ib_tmp.buffer = NULL;
-
-       if (info->indexed) {
+       if (index_size) {
                 /* Translate or upload, if needed. */
                 /* 8-bit indices are supported on VI. */
-               if (sctx->b.chip_class <= CIK && ib->index_size == 1) {
-                       unsigned start, count, start_offset, size;
+               if (sctx->b.chip_class <= CIK && index_size == 1) {
+                       unsigned start, count, start_offset, size, offset;
                         void *ptr;
  
                         si_get_draw_start_count(sctx, info, &start, &count);
                         start_offset = start * 2;
                         size = count * 2;
  
+                       indexbuf = NULL;
                         u_upload_alloc(ctx->stream_uploader, start_offset,
                                        size,
                                        si_optimal_tcc_alignment(sctx, size),
-                                      &ib_tmp.offset, &ib_tmp.buffer, &ptr);
-                       if (!ib_tmp.buffer)
+                                      &offset, &indexbuf, &ptr);
+                       if (!indexbuf)
                                 return;
  
-                       util_shorten_ubyte_elts_to_userptr(&sctx->b.b, ib, 0, 0,
-                                                          ib->offset + start,
+                       util_shorten_ubyte_elts_to_userptr(&sctx->b.b, info, 0, 0,
+                                                          index_offset + start,
                                                            count, ptr);
  
                         /* info->start will be added by the drawing code */
-                       ib_tmp.offset -= start_offset;
-                       ib_tmp.index_size = 2;
-                       ib = &ib_tmp;
-               } else if (ib->user_buffer && !ib->buffer) {
-                       unsigned start, count, start_offset;
+                       index_offset = offset - start_offset;
+                       index_size = 2;
+               } else if (info->has_user_indices) {
+                       unsigned start_offset;
  
-                       si_get_draw_start_count(sctx, info, &start, &count);
-                       start_offset = start * ib->index_size;
+                       assert(!info->indirect);
+                       start_offset = info->start * index_size;
  
+                       indexbuf = NULL;
                         u_upload_data(ctx->stream_uploader, start_offset,
-                                     count * ib->index_size,
+                                     info->count * index_size,
                                       sctx->screen->b.info.tcc_cache_line_size,
-                                     (char*)ib->user_buffer + start_offset,
-                                     &ib_tmp.offset, &ib_tmp.buffer);
-                       if (!ib_tmp.buffer)
+                                     (char*)info->index.user + start_offset,
+                                     &index_offset, &indexbuf);
+                       if (!indexbuf)
                                 return;
  
                         /* info->start will be added by the drawing code */
-                       ib_tmp.offset -= start_offset;
-                       ib_tmp.index_size = ib->index_size;
-                       ib = &ib_tmp;
+                       index_offset -= start_offset;
                 } else if (sctx->b.chip_class <= CIK &&
-                          r600_resource(ib->buffer)->TC_L2_dirty) {
+                          r600_resource(indexbuf)->TC_L2_dirty) {
                         /* VI reads index buffers through TC L2, so it doesn't
                          * need this. */
                         sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
-                       r600_resource(ib->buffer)->TC_L2_dirty = false;
+                       r600_resource(indexbuf)->TC_L2_dirty = false;
                 }
         }
  
         if (info->indirect) {
+               struct pipe_draw_indirect_info *indirect = info->indirect;
+
                 /* Add the buffer size for memory checking in need_cs_space. */
-               r600_context_add_resource_size(ctx, info->indirect);
+               r600_context_add_resource_size(ctx, indirect->buffer);
  
-               if (r600_resource(info->indirect)->TC_L2_dirty) {
+               if (r600_resource(indirect->buffer)->TC_L2_dirty) {
                         sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
-                       r600_resource(info->indirect)->TC_L2_dirty = false;
+                       r600_resource(indirect->buffer)->TC_L2_dirty = false;
                 }
  
-               if (info->indirect_params &&
-                   r600_resource(info->indirect_params)->TC_L2_dirty) {
+               if (indirect->indirect_draw_count &&
+                   r600_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
                         sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
-                       r600_resource(info->indirect_params)->TC_L2_dirty = false;
+                       r600_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
                 }
         }
  
@@ -1304,11 +1366,11 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
         si_emit_rasterizer_prim_state(sctx);
         if (sctx->tes_shader.cso)
                 si_emit_derived_tess_state(sctx, info, &num_patches);
-       si_emit_vs_state(sctx);
+       si_emit_vs_state(sctx, info);
         si_emit_draw_registers(sctx, info, num_patches);
  
         si_ce_pre_draw_synchronization(sctx);
-       si_emit_draw_packets(sctx, info, ib);
+       si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
         si_ce_post_draw_synchronization(sctx);
  
         if (sctx->trace_buf)
@@ -1354,10 +1416,13 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                 sctx->framebuffer.do_update_surf_dirtiness = false;
         }
  
-       pipe_resource_reference(&ib_tmp.buffer, NULL);
         sctx->b.num_draw_calls++;
+       if (info->primitive_restart)
+               sctx->b.num_prim_restart_calls++;
         if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
                 sctx->b.num_spill_draw_calls++;
+       if (index_size && indexbuf != info->index.resource)
+               pipe_resource_reference(&indexbuf, NULL);
  }
  
  void si_trace_emit(struct si_context *sctx)