+/*
+ * (Re)build the PM4 register state that binds "shader" to the ES stage.
+ *
+ * The previous state object held in shader->pm4 is released through
+ * si_pm4_delete_state() and replaced by a freshly allocated one.  If the
+ * allocation fails, shader->pm4 is left NULL and nothing else is emitted.
+ *
+ * ctx:    pipe context; it is cast to the driver's struct si_context.
+ * shader: shader whose buffer address and register counts are programmed.
+ */
+static void si_pipe_shader_es(struct pipe_context *ctx, struct si_pipe_shader *shader)
+{
+	struct si_context *si_ctx = (struct si_context *)ctx;
+	struct si_pm4_state *state;
+	unsigned user_sgpr_count, sgpr_count;
+	unsigned comp_cnt;
+	uint32_t rsrc1, rsrc2;
+	uint64_t shader_va;
+
+	si_pm4_delete_state(si_ctx, es, shader->pm4);
+	state = si_pm4_alloc_state(si_ctx);
+	shader->pm4 = state;
+	if (!state)
+		return;
+
+	/* The state references the shader buffer read-only. */
+	shader_va = r600_resource_va(ctx->screen, (void *)shader->bo);
+	si_pm4_add_bo(state, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
+
+	/* VGPR_COMP_CNT is 3 when the shader reads the instance ID, else 0. */
+	comp_cnt = 0;
+	if (shader->shader.uses_instanceid)
+		comp_cnt = 3;
+
+	user_sgpr_count = SI_VS_NUM_USER_SGPR;
+	sgpr_count = shader->num_sgprs;
+	/*
+	 * The SGPR right after the user SGPRs is pre-loaded with
+	 * es2gs_offset, so the shader needs at least that many; when it
+	 * reports fewer, also leave room for the last 2 reserved SGPRs,
+	 * which are used for VCC.
+	 */
+	if (sgpr_count < user_sgpr_count + 1)
+		sgpr_count = user_sgpr_count + 1 + 2;
+	assert(sgpr_count <= 104);
+
+	/* Program address: LO takes va >> 8, HI takes va >> 40. */
+	si_pm4_set_reg(state, R_00B320_SPI_SHADER_PGM_LO_ES, shader_va >> 8);
+	si_pm4_set_reg(state, R_00B324_SPI_SHADER_PGM_HI_ES, shader_va >> 40);
+
+	/* Register counts are encoded as (count - 1) / 4 and (count - 1) / 8. */
+	rsrc1 = S_00B328_VGPRS((shader->num_vgprs - 1) / 4) |
+		S_00B328_SGPRS((sgpr_count - 1) / 8) |
+		S_00B328_VGPR_COMP_CNT(comp_cnt);
+	si_pm4_set_reg(state, R_00B328_SPI_SHADER_PGM_RSRC1_ES, rsrc1);
+
+	rsrc2 = S_00B32C_USER_SGPR(user_sgpr_count);
+	si_pm4_set_reg(state, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, rsrc2);
+
+	/* Flag the context with R600_CONTEXT_INV_SHADER_CACHE. */
+	si_ctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE;
+}
+
+/*
+ * (Re)build the PM4 register state that binds "shader" to the GS stage.
+ *
+ * Besides the shader program address and resource registers, this also
+ * programs the VGT registers describing the GS mode, the ESGS/GSVS ring
+ * item sizes and the maximum number of output vertices.
+ *
+ * The previous state object held in shader->pm4 is released through
+ * si_pm4_delete_state() and replaced by a freshly allocated one.  If the
+ * allocation fails, shader->pm4 is left NULL and nothing else is emitted.
+ *
+ * ctx:    pipe context; it is cast to the driver's struct si_context.
+ * shader: geometry shader whose parameters are programmed.
+ */
+static void si_pipe_shader_gs(struct pipe_context *ctx, struct si_pipe_shader *shader)
+{
+	struct si_context *si_ctx = (struct si_context *)ctx;
+	/* Item sizes are computed as count * (16 >> 2). */
+	unsigned vert_itemsize = shader->shader.noutput * (16 >> 2);
+	unsigned max_verts = shader->shader.gs_max_out_vertices;
+	unsigned ring_itemsize = vert_itemsize * max_verts;
+	struct si_pm4_state *state;
+	unsigned user_sgpr_count, sgpr_count;
+	unsigned cut_mode;
+	uint32_t gs_mode, rsrc1, rsrc2;
+	uint64_t shader_va;
+
+	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
+	assert(ring_itemsize < (1 << 15));
+
+	si_pm4_delete_state(si_ctx, gs, shader->pm4);
+	state = si_pm4_alloc_state(si_ctx);
+	shader->pm4 = state;
+	if (!state)
+		return;
+
+	/* Pick the smallest cut mode that covers the maximum vertex count. */
+	if (max_verts > 512) {
+		assert(max_verts <= 1024);
+		cut_mode = V_028A40_GS_CUT_1024;
+	} else if (max_verts > 256) {
+		cut_mode = V_028A40_GS_CUT_512;
+	} else if (max_verts > 128) {
+		cut_mode = V_028A40_GS_CUT_256;
+	} else {
+		cut_mode = V_028A40_GS_CUT_128;
+	}
+
+	gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_G) |
+		  S_028A40_CUT_MODE(cut_mode) |
+		  S_028A40_ES_WRITE_OPTIMIZE(1) |
+		  S_028A40_GS_WRITE_OPTIMIZE(1);
+	si_pm4_set_reg(state, R_028A40_VGT_GS_MODE, gs_mode);
+
+	/* All three GSVS ring offsets receive the same item size. */
+	si_pm4_set_reg(state, R_028A60_VGT_GSVS_RING_OFFSET_1, ring_itemsize);
+	si_pm4_set_reg(state, R_028A64_VGT_GSVS_RING_OFFSET_2, ring_itemsize);
+	si_pm4_set_reg(state, R_028A68_VGT_GSVS_RING_OFFSET_3, ring_itemsize);
+
+	si_pm4_set_reg(state, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+		       shader->shader.nparam * (16 >> 2));
+	si_pm4_set_reg(state, R_028AB0_VGT_GSVS_RING_ITEMSIZE, ring_itemsize);
+
+	si_pm4_set_reg(state, R_028B38_VGT_GS_MAX_VERT_OUT, max_verts);
+
+	si_pm4_set_reg(state, R_028B5C_VGT_GS_VERT_ITEMSIZE, vert_itemsize);
+
+	/* The state references the shader buffer read-only. */
+	shader_va = r600_resource_va(ctx->screen, (void *)shader->bo);
+	si_pm4_add_bo(state, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
+	/* Program address: LO takes va >> 8, HI takes va >> 40. */
+	si_pm4_set_reg(state, R_00B220_SPI_SHADER_PGM_LO_GS, shader_va >> 8);
+	si_pm4_set_reg(state, R_00B224_SPI_SHADER_PGM_HI_GS, shader_va >> 40);
+
+	user_sgpr_count = SI_GS_NUM_USER_SGPR;
+	sgpr_count = shader->num_sgprs;
+	/*
+	 * The two SGPRs right after the user SGPRs are pre-loaded with
+	 * gs2vs_offset and gs_wave_id, so the shader needs at least that
+	 * many; when it reports fewer, also leave room for the last 2
+	 * reserved SGPRs, which are used for VCC.
+	 */
+	if (sgpr_count < user_sgpr_count + 2)
+		sgpr_count = user_sgpr_count + 2 + 2;
+	assert(sgpr_count <= 104);
+
+	/* Register counts are encoded as (count - 1) / 4 and (count - 1) / 8. */
+	rsrc1 = S_00B228_VGPRS((shader->num_vgprs - 1) / 4) |
+		S_00B228_SGPRS((sgpr_count - 1) / 8);
+	si_pm4_set_reg(state, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
+
+	rsrc2 = S_00B22C_USER_SGPR(user_sgpr_count);
+	si_pm4_set_reg(state, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
+
+	/* Flag the context with R600_CONTEXT_INV_SHADER_CACHE. */
+	si_ctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE;
+}
+