etnaviv: update Android build files

[mesa.git] / src / gallium / drivers / radeonsi / si_state_shaders.c
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c

index 04443db7a440964b7a55d4b3830b06b52f4e7ad8..69827d0774bcb2b825c510663560c6153fbb5990 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -27,7 +27,6 @@
  
  #include "compiler/nir/nir_serialize.h"
  #include "nir/tgsi_to_nir.h"
-#include "tgsi/tgsi_parse.h"
  #include "util/hash_table.h"
  #include "util/crc32.h"
  #include "util/u_async_debug.h"
@@ -42,24 +41,23 @@
  /* SHADER_CACHE */
  
  /**
- * Return the IR binary in a buffer. For TGSI the first 4 bytes contain its
- * size as integer.
+ * Return the IR key for the shader cache.
   */
-void *si_get_ir_binary(struct si_shader_selector *sel, bool ngg, bool es)
+void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
+                        unsigned char ir_sha1_cache_key[20])
  {
-       struct blob blob;
+       struct blob blob = {};
         unsigned ir_size;
         void *ir_binary;
  
-       if (sel->tokens) {
-               ir_binary = sel->tokens;
-               ir_size = tgsi_num_tokens(sel->tokens) *
-                                         sizeof(struct tgsi_token);
+       if (sel->nir_binary) {
+               ir_binary = sel->nir_binary;
+               ir_size = sel->nir_size;
         } else {
                 assert(sel->nir);
  
                 blob_init(&blob);
-               nir_serialize(&blob, sel->nir);
+               nir_serialize(&blob, sel->nir, true);
                 ir_binary = blob.data;
                 ir_size = blob.size;
         }
@@ -78,20 +76,18 @@ void *si_get_ir_binary(struct si_shader_selector *sel, bool ngg, bool es)
         if (sel->force_correct_derivs_after_kill)
                 shader_variant_flags |= 1 << 3;
  
-       unsigned size = 4 + 4 + ir_size + sizeof(sel->so);
-       char *result = (char*)MALLOC(size);
-       if (!result)
-               return NULL;
-
-       ((uint32_t*)result)[0] = size;
-       ((uint32_t*)result)[1] = shader_variant_flags;
-       memcpy(result + 8, ir_binary, ir_size);
-       memcpy(result + 8 + ir_size, &sel->so, sizeof(sel->so));
-
-       if (sel->nir)
+       struct mesa_sha1 ctx;
+       _mesa_sha1_init(&ctx);
+       _mesa_sha1_update(&ctx, &shader_variant_flags, 4);
+       _mesa_sha1_update(&ctx, ir_binary, ir_size);
+       if (sel->type == PIPE_SHADER_VERTEX ||
+           sel->type == PIPE_SHADER_TESS_EVAL ||
+           sel->type == PIPE_SHADER_GEOMETRY)
+               _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so));
+       _mesa_sha1_final(&ctx, ir_sha1_cache_key);
+
+       if (ir_binary == blob.data)
                 blob_finish(&blob);
-
-       return result;
  }
  
  /** Copy "data" to "ptr" and return the next dword following copied data. */
@@ -208,10 +204,9 @@ static bool si_load_shader_binary(struct si_shader *shader, void *binary)
  /**
   * Insert a shader into the cache. It's assumed the shader is not in the cache.
   * Use si_shader_cache_load_shader before calling this.
- *
- * Returns false on failure, in which case the ir_binary should be freed.
   */
-bool si_shader_cache_insert_shader(struct si_screen *sscreen, void *ir_binary,
+void si_shader_cache_insert_shader(struct si_screen *sscreen,
+                                  unsigned char ir_sha1_cache_key[20],
                                    struct si_shader *shader,
                                    bool insert_into_disk_cache)
  {
@@ -219,42 +214,41 @@ bool si_shader_cache_insert_shader(struct si_screen *sscreen, void *ir_binary,
         struct hash_entry *entry;
         uint8_t key[CACHE_KEY_SIZE];
  
-       entry = _mesa_hash_table_search(sscreen->shader_cache, ir_binary);
+       entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
         if (entry)
-               return false; /* already added */
+               return; /* already added */
  
         hw_binary = si_get_shader_binary(shader);
         if (!hw_binary)
-               return false;
+               return;
  
-       if (_mesa_hash_table_insert(sscreen->shader_cache, ir_binary,
+       if (_mesa_hash_table_insert(sscreen->shader_cache,
+                                   mem_dup(ir_sha1_cache_key, 20),
                                     hw_binary) == NULL) {
                 FREE(hw_binary);
-               return false;
+               return;
         }
  
         if (sscreen->disk_shader_cache && insert_into_disk_cache) {
-               disk_cache_compute_key(sscreen->disk_shader_cache, ir_binary,
-                                      *((uint32_t *)ir_binary), key);
+               disk_cache_compute_key(sscreen->disk_shader_cache,
+                                      ir_sha1_cache_key, 20, key);
                 disk_cache_put(sscreen->disk_shader_cache, key, hw_binary,
                                *((uint32_t *) hw_binary), NULL);
         }
-
-       return true;
  }
  
-bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary,
+bool si_shader_cache_load_shader(struct si_screen *sscreen,
+                                unsigned char ir_sha1_cache_key[20],
                                  struct si_shader *shader)
  {
         struct hash_entry *entry =
-               _mesa_hash_table_search(sscreen->shader_cache, ir_binary);
+               _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
         if (!entry) {
                 if (sscreen->disk_shader_cache) {
                         unsigned char sha1[CACHE_KEY_SIZE];
-                       size_t tg_size = *((uint32_t *) ir_binary);
  
                         disk_cache_compute_key(sscreen->disk_shader_cache,
-                                              ir_binary, tg_size, sha1);
+                                              ir_sha1_cache_key, 20, sha1);
  
                         size_t binary_size;
                         uint8_t *buffer =
@@ -285,16 +279,13 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary,
                         }
                         free(buffer);
  
-                       if (!si_shader_cache_insert_shader(sscreen, ir_binary,
-                                                          shader, false))
-                               FREE(ir_binary);
+                       si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key,
+                                                     shader, false);
                 } else {
                         return false;
                 }
         } else {
-               if (si_load_shader_binary(shader, entry->data))
-                       FREE(ir_binary);
-               else
+               if (!si_load_shader_binary(shader, entry->data))
                         return false;
         }
         p_atomic_inc(&sscreen->num_shader_cache_hits);
@@ -303,20 +294,14 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary,
  
  static uint32_t si_shader_cache_key_hash(const void *key)
  {
-       /* The first dword is the key size. */
-       return util_hash_crc32(key, *(uint32_t*)key);
+       /* Take the first dword of SHA1. */
+       return *(uint32_t*)key;
  }
  
  static bool si_shader_cache_key_equals(const void *a, const void *b)
  {
-       uint32_t *keya = (uint32_t*)a;
-       uint32_t *keyb = (uint32_t*)b;
-
-       /* The first dword is the key size. */
-       if (*keya != *keyb)
-               return false;
-
-       return memcmp(keya, keyb, *keya) == 0;
+       /* Compare SHA1s. */
+       return memcmp(a, b, 20) == 0;
  }
  
  static void si_destroy_shader_cache_entry(struct hash_entry *entry)
@@ -350,7 +335,7 @@ static void si_set_tesseval_regs(struct si_screen *sscreen,
                                  const struct si_shader_selector *tes,
                                  struct si_pm4_state *pm4)
  {
-       const struct tgsi_shader_info *info = &tes->info;
+       const struct si_shader_info *info = &tes->info;
         unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
         unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
         bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
@@ -472,8 +457,19 @@ static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader)
         }
  }
  
-static unsigned si_get_num_vs_user_sgprs(unsigned num_always_on_user_sgprs)
+static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader,
+                                        unsigned num_always_on_user_sgprs)
  {
+       struct si_shader_selector *vs = shader->previous_stage_sel ?
+                       shader->previous_stage_sel : shader->selector;
+       unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs;
+
+       /* 1 SGPR is reserved for the vertex buffer pointer. */
+       assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1);
+
+       if (num_vbos_in_user_sgprs)
+               return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4;
+
         /* Add the pointer to VBO descriptors. */
         return num_always_on_user_sgprs + 1;
  }
@@ -525,7 +521,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
                            S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) |
                            S_00B528_DX10_CLAMP(1) |
                            S_00B528_FLOAT_MODE(shader->config.float_mode);
-       shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR)) |
+       shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
                            S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
  }
  
@@ -551,7 +547,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
                 }
  
                 unsigned num_user_sgprs =
-                       si_get_num_vs_user_sgprs(GFX9_TCS_NUM_USER_SGPR);
+                       si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);
  
                 shader->config.rsrc2 =
                         S_00B42C_USER_SGPR(num_user_sgprs) |
@@ -635,7 +631,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
  
         if (shader->selector->type == PIPE_SHADER_VERTEX) {
                 vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
-               num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR);
+               num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
         } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
                 vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
                 num_user_sgprs = SI_TES_NUM_USER_SGPR;
@@ -902,7 +898,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
  
                 unsigned num_user_sgprs;
                 if (es_type == PIPE_SHADER_VERTEX)
-                       num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR);
+                       num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
                 else
                         num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
  
@@ -938,6 +934,12 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
                 si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
                 si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
  
+               if (sscreen->info.chip_class >= GFX10) {
+                       si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+                                      S_00B204_CU_EN(0xffff) |
+                                      S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
+               }
+
                 shader->ctx_reg.gs.vgt_gs_onchip_cntl =
                         S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
                         S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) |
@@ -967,6 +969,29 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
         }
  }
  
+static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value)
+{
+       enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC;
+
+       if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
+           sctx->tracked_regs.reg_value[reg] != value) {
+               struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+               if (sctx->family == CHIP_NAVI10 ||
+                   sctx->family == CHIP_NAVI12 ||
+                   sctx->family == CHIP_NAVI14) {
+                       /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
+                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+                       radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
+               }
+
+               radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value);
+
+               sctx->tracked_regs.reg_saved |= 0x1ull << reg;
+               sctx->tracked_regs.reg_value[reg] = value;
+       }
+}
+
  /* Common tail code for NGG primitive shaders. */
  static void gfx10_emit_shader_ngg_tail(struct si_context *sctx,
                                        struct si_shader *shader,
@@ -1011,6 +1036,9 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx,
  
         if (initial_cdw != sctx->gfx_cs->current.cdw)
                 sctx->context_roll = true;
+
+       /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
+       gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc);
  }
  
  static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
@@ -1109,11 +1137,11 @@ static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, bool ng
  static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader)
  {
         const struct si_shader_selector *gs_sel = shader->selector;
-       const struct tgsi_shader_info *gs_info = &gs_sel->info;
+       const struct si_shader_info *gs_info = &gs_sel->info;
         enum pipe_shader_type gs_type = shader->selector->type;
         const struct si_shader_selector *es_sel =
                 shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector;
-       const struct tgsi_shader_info *es_info = &es_sel->info;
+       const struct si_shader_info *es_info = &es_sel->info;
         enum pipe_shader_type es_type = es_sel->type;
         unsigned num_user_sgprs;
         unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
@@ -1146,7 +1174,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                         num_user_sgprs = SI_SGPR_VS_BLIT_DATA +
                                          es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
                 } else {
-                       num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR);
+                       num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
                 }
         } else {
                 assert(es_type == PIPE_SHADER_TESS_EVAL);
@@ -1164,11 +1192,13 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
          * pass edge flags for decomposed primitives (such as quads) to the PA
          * for the GL_LINE polygon mode to skip rendering lines on inner edges.
          */
-       if (gs_info->uses_invocationid || gs_type == PIPE_SHADER_VERTEX)
+       if (gs_info->uses_invocationid ||
+           (gs_type == PIPE_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader)))
                 gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */
-       else if (gs_info->uses_primid)
+       else if ((gs_type == PIPE_SHADER_GEOMETRY && gs_info->uses_primid) ||
+                (gs_type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
                 gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
-       else if (input_prim >= PIPE_PRIM_TRIANGLES)
+       else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader))
                 gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
         else
                 gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
@@ -1191,6 +1221,28 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                        S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
                        S_00B22C_LDS_SIZE(shader->config.lds_size));
  
+       /* Determine LATE_ALLOC_GS. */
+       unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
+       unsigned late_alloc_wave64; /* The limit is per SH. */
+
+       /* For Wave32, the hw will launch twice the number of late
+        * alloc waves, so 1 == 2x wave32.
+        *
+        * Don't use late alloc for NGG on Navi14 due to a hw bug.
+        */
+       if (sscreen->info.family == CHIP_NAVI14)
+               late_alloc_wave64 = 0;
+       else if (num_cu_per_sh <= 6)
+               late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
+       else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
+               late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
+       else
+               late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+
+       si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+                      S_00B204_CU_EN(0xffff) |
+                      S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
+
         nparams = MAX2(shader->info.nr_param_exports, 1);
         shader->ctx_reg.ngg.spi_vs_out_config =
                 S_0286C4_VS_EXPORT_COUNT(nparams - 1) |
@@ -1212,7 +1264,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
  
         shader->ctx_reg.ngg.vgt_primitiveid_en =
                 S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
-               S_028A84_NGG_DISABLE_PROVOK_REUSE(es_enable_prim_id);
+               S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id ||
+                                                 gs_sel->info.writes_primid);
  
         if (gs_type == PIPE_SHADER_GEOMETRY) {
                 shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4;
@@ -1248,26 +1301,53 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                 S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX);
         shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true);
  
-       shader->ge_cntl =
-               S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
-               S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |
-               S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
+       /* Oversubscribe PC. This improves performance when there are too many varyings. */
+       float oversub_pc_factor = 0.25;
  
-       /* Bug workaround for a possible hang with non-tessellation cases.
-        * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
-        *
-        * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
-        */
-       if ((sscreen->info.family == CHIP_NAVI10 ||
-            sscreen->info.family == CHIP_NAVI12 ||
-            sscreen->info.family == CHIP_NAVI14) &&
-           (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
-           shader->ngg.hw_max_esverts != 256) {
-               shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
-
-               if (shader->ngg.hw_max_esverts > 5) {
-                       shader->ge_cntl |=
-                               S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
+       if (shader->key.opt.ngg_culling) {
+               /* Be more aggressive with NGG culling. */
+               if (shader->info.nr_param_exports > 4)
+                       oversub_pc_factor = 1;
+               else if (shader->info.nr_param_exports > 2)
+                       oversub_pc_factor = 0.75;
+               else
+                       oversub_pc_factor = 0.5;
+       }
+
+       unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor;
+       shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(1) |
+                                         S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
+
+       if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
+               shader->ge_cntl =
+                       S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+                       S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
+       } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
+               shader->ge_cntl =
+                       S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+                       S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
+       } else {
+               shader->ge_cntl =
+                       S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+                       S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
+                       S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
+
+               /* Bug workaround for a possible hang with non-tessellation cases.
+                * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
+                *
+                * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
+                */
+               if ((sscreen->info.family == CHIP_NAVI10 ||
+                    sscreen->info.family == CHIP_NAVI12 ||
+                    sscreen->info.family == CHIP_NAVI14) &&
+                   (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
+                   shader->ngg.hw_max_esverts != 256) {
+                       shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
+
+                       if (shader->ngg.hw_max_esverts > 5) {
+                               shader->ge_cntl |=
+                                       S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
+                       }
                 }
         }
  
@@ -1326,9 +1406,6 @@ static void si_emit_shader_vs(struct si_context *sctx)
                                            SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
                                            shader->vgt_vertex_reuse_block_cntl);
  
-       if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll = true;
-
         /* Required programming for tessellation. (legacy pipeline only) */
         if (sctx->chip_class == GFX10 &&
             shader->selector->type == PIPE_SHADER_TESS_EVAL) {
@@ -1345,6 +1422,13 @@ static void si_emit_shader_vs(struct si_context *sctx)
                                                shader->pa_cl_vs_out_cntl,
                                                SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
         }
+
+       if (initial_cdw != sctx->gfx_cs->current.cdw)
+               sctx->context_roll = true;
+
+       /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
+       if (sctx->chip_class >= GFX10)
+               gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc);
  }
  
  /**
@@ -1357,7 +1441,7 @@ static void si_emit_shader_vs(struct si_context *sctx)
  static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
                           struct si_shader_selector *gs)
  {
-       const struct tgsi_shader_info *info = &shader->selector->info;
+       const struct si_shader_info *info = &shader->selector->info;
         struct si_pm4_state *pm4;
         unsigned num_user_sgprs, vgpr_comp_cnt;
         uint64_t va;
@@ -1413,7 +1497,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
                         num_user_sgprs = SI_SGPR_VS_BLIT_DATA +
                                          info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
                 } else {
-                       num_user_sgprs = si_get_num_vs_user_sgprs(SI_VS_NUM_USER_SGPR);
+                       num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
                 }
         } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
                 vgpr_comp_cnt = enable_prim_id ? 3 : 2;
@@ -1441,6 +1525,8 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
                         S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ?
                                                     V_02870C_SPI_SHADER_4COMP :
                                                     V_02870C_SPI_SHADER_NONE);
+       shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(1) |
+                                        S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1);
         shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, false);
  
         oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0;
@@ -1458,6 +1544,11 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
                          S_00B12C_OC_LDS_EN(oc_lds_en) |
                          S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
  
+       if (sscreen->info.chip_class >= GFX10)
+               rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
+       else if (sscreen->info.chip_class == GFX9)
+               rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
+
         if (sscreen->info.chip_class <= GFX9)
                 rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8);
  
@@ -1490,7 +1581,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
  
  static unsigned si_get_ps_num_interp(struct si_shader *ps)
  {
-       struct tgsi_shader_info *info = &ps->selector->info;
+       struct si_shader_info *info = &ps->selector->info;
         unsigned num_colors = !!(info->colors_read & 0x0f) +
                               !!(info->colors_read & 0xf0);
         unsigned num_interp = ps->selector->info.num_inputs +
@@ -1552,7 +1643,7 @@ static void si_emit_shader_ps(struct si_context *sctx)
  
  static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
  {
-       struct tgsi_shader_info *info = &shader->selector->info;
+       struct si_shader_info *info = &shader->selector->info;
         struct si_pm4_state *pm4;
         unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
         unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
@@ -1836,6 +1927,7 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx,
         uint64_t linked = outputs_written & inputs_read;
  
         key->opt.kill_outputs = ~linked & outputs_written;
+       key->opt.ngg_culling = sctx->ngg_culling;
  }
  
  /* Compute the key for the hw shader variant */
@@ -1889,7 +1981,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
                 key->part.tcs.epilog.prim_mode =
                         sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
                 key->part.tcs.epilog.invoc0_tess_factors_are_def =
-                       sel->tcs_info.tessfactors_are_def_in_all_invocs;
+                       sel->info.tessfactors_are_def_in_all_invocs;
                 key->part.tcs.epilog.tes_reads_tess_factors =
                         sctx->tes_shader.cso->info.reads_tess_factors;
  
@@ -2105,7 +2197,10 @@ static void si_build_shader_variant(struct si_shader *shader,
                 compiler = shader->compiler_ctx_state.compiler;
         }
  
-       if (unlikely(!si_shader_create(sscreen, compiler, shader, debug))) {
+       if (!compiler->passes)
+               si_init_compiler(sscreen, compiler);
+
+       if (unlikely(!si_create_shader_variant(sscreen, compiler, shader, debug))) {
                 PRINT_ERR("Failed to build shader variant (type=%u)\n",
                           sel->type);
                 shader->compilation_failed = true;
@@ -2159,8 +2254,8 @@ static bool si_check_missing_main_part(struct si_screen *sscreen,
                 main_part->key.as_ngg = key->as_ngg;
                 main_part->is_monolithic = false;
  
-               if (si_compile_tgsi_shader(sscreen, compiler_state->compiler,
-                                          main_part, &compiler_state->debug) != 0) {
+               if (!si_compile_shader(sscreen, compiler_state->compiler,
+                                      main_part, &compiler_state->debug)) {
                         FREE(main_part);
                         return false;
                 }
@@ -2420,7 +2515,7 @@ static int si_shader_select(struct pipe_context *ctx,
                                          &key, -1, false);
  }
  
-static void si_parse_next_shader_property(const struct tgsi_shader_info *info,
+static void si_parse_next_shader_property(const struct si_shader_info *info,
                                           bool streamout,
                                           struct si_shader_key *key)
  {
@@ -2472,8 +2567,25 @@ static void si_init_shader_selector_async(void *job, int thread_index)
         assert(thread_index < ARRAY_SIZE(sscreen->compiler));
         compiler = &sscreen->compiler[thread_index];
  
-       if (sel->nir)
-               si_lower_nir(sel);
+       if (!compiler->passes)
+               si_init_compiler(sscreen, compiler);
+
+       /* Serialize NIR to save memory. Monolithic shader variants
+        * have to deserialize NIR before compilation.
+        */
+       if (sel->nir) {
+               struct blob blob;
+                size_t size;
+
+               blob_init(&blob);
+               /* true = remove optional debugging data to increase
+                * the likehood of getting more shader cache hits.
+                * It also drops variable names, so we'll save more memory.
+                */
+               nir_serialize(&blob, sel->nir, true);
+               blob_finish_get_buffer(&blob, &sel->nir_binary, &size);
+               sel->nir_size = size;
+       }
  
         /* Compile the main shader part for use with a prolog and/or epilog.
          * If this fails, the driver will try to compile a monolithic shader
@@ -2481,7 +2593,7 @@ static void si_init_shader_selector_async(void *job, int thread_index)
          */
         if (!sscreen->use_monolithic_shaders) {
                 struct si_shader *shader = CALLOC_STRUCT(si_shader);
-               void *ir_binary = NULL;
+               unsigned char ir_sha1_cache_key[20];
  
                 if (!shader) {
                         fprintf(stderr, "radeonsi: can't allocate a main shader part\n");
@@ -2505,36 +2617,31 @@ static void si_init_shader_selector_async(void *job, int thread_index)
                      sel->type == PIPE_SHADER_GEOMETRY))
                         shader->key.as_ngg = 1;
  
-               if (sel->tokens || sel->nir) {
-                       ir_binary = si_get_ir_binary(sel, shader->key.as_ngg,
-                                                    shader->key.as_es);
+               if (sel->nir) {
+                       si_get_ir_cache_key(sel, shader->key.as_ngg,
+                                           shader->key.as_es, ir_sha1_cache_key);
                 }
  
                 /* Try to load the shader from the shader cache. */
                 simple_mtx_lock(&sscreen->shader_cache_mutex);
  
-               if (ir_binary &&
-                   si_shader_cache_load_shader(sscreen, ir_binary, shader)) {
+               if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
                         simple_mtx_unlock(&sscreen->shader_cache_mutex);
                         si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
                 } else {
                         simple_mtx_unlock(&sscreen->shader_cache_mutex);
  
                         /* Compile the shader if it hasn't been loaded from the cache. */
-                       if (si_compile_tgsi_shader(sscreen, compiler, shader,
-                                                  debug) != 0) {
+                       if (!si_compile_shader(sscreen, compiler, shader, debug)) {
                                 FREE(shader);
-                               FREE(ir_binary);
                                 fprintf(stderr, "radeonsi: can't compile a main shader part\n");
                                 return;
                         }
  
-                       if (ir_binary) {
-                               simple_mtx_lock(&sscreen->shader_cache_mutex);
-                               if (!si_shader_cache_insert_shader(sscreen, ir_binary, shader, true))
-                                       FREE(ir_binary);
-                               simple_mtx_unlock(&sscreen->shader_cache_mutex);
-                       }
+                       simple_mtx_lock(&sscreen->shader_cache_mutex);
+                       si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key,
+                                                     shader, true);
+                       simple_mtx_unlock(&sscreen->shader_cache_mutex);
                 }
  
                 *si_get_main_shader_part(sel, &shader->key) = shader;
@@ -2595,6 +2702,12 @@ static void si_init_shader_selector_async(void *job, int thread_index)
  
                 si_shader_vs(sscreen, sel->gs_copy_shader, sel);
         }
+
+       /* Free NIR. We only keep serialized NIR after this point. */
+       if (sel->nir) {
+               ralloc_free(sel->nir);
+               sel->nir = NULL;
+       }
  }
  
  void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
@@ -2629,7 +2742,7 @@ void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
  }
  
  /* Return descriptor slot usage masks from the given shader info. */
-void si_get_active_slot_masks(const struct tgsi_shader_info *info,
+void si_get_active_slot_masks(const struct si_shader_info *info,
                               uint32_t *const_and_shader_buffers,
                               uint64_t *samplers_and_images)
  {
@@ -2682,45 +2795,16 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
  
         sel->so = state->stream_output;
  
-       if (state->type == PIPE_SHADER_IR_TGSI &&
-           !sscreen->options.enable_nir) {
-               sel->tokens = tgsi_dup_tokens(state->tokens);
-               if (!sel->tokens) {
-                       FREE(sel);
-                       return NULL;
-               }
-
-               tgsi_scan_shader(state->tokens, &sel->info);
-               tgsi_scan_tess_ctrl(state->tokens, &sel->info, &sel->tcs_info);
-
-               /* Fixup for TGSI: Set which opcode uses which (i,j) pair. */
-               if (sel->info.uses_persp_opcode_interp_centroid)
-                       sel->info.uses_persp_centroid = true;
-
-               if (sel->info.uses_linear_opcode_interp_centroid)
-                       sel->info.uses_linear_centroid = true;
-
-               if (sel->info.uses_persp_opcode_interp_offset ||
-                   sel->info.uses_persp_opcode_interp_sample)
-                       sel->info.uses_persp_center = true;
-
-               if (sel->info.uses_linear_opcode_interp_offset ||
-                   sel->info.uses_linear_opcode_interp_sample)
-                       sel->info.uses_linear_center = true;
+       if (state->type == PIPE_SHADER_IR_TGSI) {
+               sel->nir = tgsi_to_nir(state->tokens, ctx->screen);
         } else {
-               if (state->type == PIPE_SHADER_IR_TGSI) {
-                       sel->nir = tgsi_to_nir(state->tokens, ctx->screen);
-               } else {
-                       assert(state->type == PIPE_SHADER_IR_NIR);
-                       sel->nir = state->ir.nir;
-               }
-
-               si_nir_lower_ps_inputs(sel->nir);
-               si_nir_opts(sel->nir);
-               si_nir_scan_shader(sel->nir, &sel->info);
-               si_nir_scan_tess_ctrl(sel->nir, &sel->tcs_info);
+               assert(state->type == PIPE_SHADER_IR_NIR);
+               sel->nir = state->ir.nir;
         }
  
+       si_nir_scan_shader(sel->nir, &sel->info);
+       si_nir_adjust_driver_locations(sel->nir);
+
         sel->type = sel->info.processor;
         p_atomic_inc(&sscreen->num_shaders_created);
         si_get_active_slot_masks(&sel->info,
@@ -2734,6 +2818,12 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
                         (sel->so.output[i].stream * 4);
         }
  
+       sel->num_vs_inputs = sel->type == PIPE_SHADER_VERTEX &&
+                            !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] ?
+                                    sel->info.num_inputs : 0;
+       sel->num_vbos_in_user_sgprs =
+               MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs);
+
         /* The prolog is a no-op if there are no inputs. */
         sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX &&
                                sel->info.num_inputs &&
@@ -2782,9 +2872,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
  
                 /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation. */
                 sel->tess_turns_off_ngg =
-                       (sscreen->info.family == CHIP_NAVI10 ||
-                        sscreen->info.family == CHIP_NAVI12 ||
-                        sscreen->info.family == CHIP_NAVI14) &&
+                       sscreen->info.chip_class == GFX10 &&
                         sel->gs_num_invocations * sel->gs_max_out_vertices > 256;
                 break;
  
@@ -2883,6 +2971,20 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
         default:;
         }
  
+       sel->ngg_culling_allowed =
+               sscreen->info.chip_class == GFX10 &&
+               sscreen->info.has_dedicated_vram &&
+               sscreen->use_ngg_culling &&
+               /* Disallow TES by default, because TessMark results are mixed. */
+               (sel->type == PIPE_SHADER_VERTEX ||
+                (sscreen->always_use_ngg_culling && sel->type == PIPE_SHADER_TESS_EVAL)) &&
+               sel->info.writes_position &&
+               !sel->info.writes_viewport_index && /* cull only against viewport 0 */
+               !sel->info.writes_memory &&
+               !sel->so.num_outputs &&
+               !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] &&
+               !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+
         /* PA_CL_VS_OUT_CNTL */
         if (sctx->chip_class <= GFX9)
                 sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, false);
@@ -3072,7 +3174,7 @@ bool si_update_ngg(struct si_context *sctx)
                         sctx->flags |= SI_CONTEXT_VGT_FLUSH;
  
                 sctx->ngg = new_ngg;
-               sctx->last_rast_prim = -1; /* reset this so that it gets updated */
+               sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
                 return true;
         }
         return false;
@@ -3095,7 +3197,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
         sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
  
         si_update_common_shader_state(sctx);
-       sctx->last_rast_prim = -1; /* reset this so that it gets updated */
+       sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
  
         ngg_changed = si_update_ngg(sctx);
         if (ngg_changed || enable_changed)
@@ -3149,7 +3251,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
         si_update_tess_uses_prim_id(sctx);
  
         si_update_common_shader_state(sctx);
-       sctx->last_rast_prim = -1; /* reset this so that it gets updated */
+       sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
  
         bool ngg_changed = si_update_ngg(sctx);
         if (ngg_changed || enable_changed)
@@ -3294,8 +3396,8 @@ void si_destroy_shader_selector(struct si_context *sctx,
  
         util_queue_fence_destroy(&sel->ready);
         simple_mtx_destroy(&sel->mutex);
-       free(sel->tokens);
         ralloc_free(sel->nir);
+       free(sel->nir_binary);
         free(sel);
  }
  
@@ -3311,7 +3413,7 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx,
                                      struct si_shader *vs, unsigned name,
                                      unsigned index, unsigned interpolate)
  {
-       struct tgsi_shader_info *vsinfo = &vs->selector->info;
+       struct si_shader_info *vsinfo = &vs->selector->info;
         unsigned j, offset, ps_input_cntl = 0;
  
         if (interpolate == TGSI_INTERPOLATE_CONSTANT ||
@@ -3370,7 +3472,7 @@ static void si_emit_spi_map(struct si_context *sctx)
  {
         struct si_shader *ps = sctx->ps_shader.current;
         struct si_shader *vs = si_get_vs_state(sctx);
-       struct tgsi_shader_info *psinfo = ps ? &ps->selector->info : NULL;
+       struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL;
         unsigned i, num_interp, num_written = 0, bcol_interp[2];
         unsigned spi_ps_input_cntl[32];
  
@@ -3862,9 +3964,10 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
         }
  
         if (key.u.ngg) {
-               stages |= S_028B54_PRIMGEN_EN(1);
-               if (key.u.streamout)
-                       stages |= S_028B54_NGG_WAVE_ID_EN(1);
+               stages |= S_028B54_PRIMGEN_EN(1) |
+                         S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) |
+                         S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
+                         S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough);
         } else if (key.u.gs)
                 stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
  
@@ -3904,6 +4007,9 @@ bool si_update_shaders(struct si_context *sctx)
                 old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
         int r;
  
+       if (!sctx->compiler.passes)
+               si_init_compiler(sctx->screen, &sctx->compiler);
+
         compiler_state.compiler = &sctx->compiler;
         compiler_state.debug = sctx->debug;
         compiler_state.is_debug_context = sctx->is_debug;
@@ -4014,6 +4120,15 @@ bool si_update_shaders(struct si_context *sctx)
                 }
         }
  
+       /* This must be done after the shader variant is selected. */
+       if (sctx->ngg) {
+               struct si_shader *vs = si_get_vs(sctx)->current;
+
+               key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
+               key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling &
+                                             SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
+       }
+
         si_update_vgt_shader_config(sctx, key);
  
         if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable)