freedreno/ir3: limit pre-fetched tex dest

[mesa.git] / src / freedreno / ir3 / ir3_nir_lower_tess.c
diff --git a/src/freedreno/ir3/ir3_nir_lower_tess.c b/src/freedreno/ir3/ir3_nir_lower_tess.c

index b4982503f0d568e1f3de2b86955fe861939762dc..9f4985bc34f4636e3ff99e0536b9ea0d0ccf35bc 100644 (file)
--- a/src/freedreno/ir3/ir3_nir_lower_tess.c
+++ b/src/freedreno/ir3/ir3_nir_lower_tess.c
@@ -26,6 +26,8 @@
  #include "compiler/nir/nir_builder.h"
  
  struct state {
+       uint32_t topology;
+
         struct primitive_map {
                 unsigned loc[32];
                 unsigned size[32];
@@ -36,10 +38,13 @@ struct state {
  
         nir_variable *vertex_count_var;
         nir_variable *emitted_vertex_var;
-       nir_variable *vertex_flags_var;
         nir_variable *vertex_flags_out;
  
-       nir_variable *output_vars[32];
+       struct exec_list old_outputs;
+       struct exec_list emit_outputs;
+
+       nir_ssa_def *outer_levels[4];
+       nir_ssa_def *inner_levels[2];
  };
  
  static nir_ssa_def *
@@ -70,7 +75,7 @@ build_local_primitive_id(nir_builder *b, struct state *state)
  static nir_variable *
  get_var(struct exec_list *list, int driver_location)
  {
-       nir_foreach_variable(v, list) {
+       nir_foreach_variable (v, list) {
                 if (v->data.driver_location == driver_location) {
                         return v;
                 }
@@ -85,21 +90,26 @@ build_local_offset(nir_builder *b, struct state *state,
  {
         nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
         nir_ssa_def *primitive_offset =
-               nir_imul(b, build_local_primitive_id(b, state), primitive_stride);
+               nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
         nir_ssa_def *attr_offset;
         nir_ssa_def *vertex_stride;
  
-       if (b->shader->info.stage == MESA_SHADER_VERTEX) {
+       switch (b->shader->info.stage) {
+       case MESA_SHADER_VERTEX:
+       case MESA_SHADER_TESS_EVAL:
                 vertex_stride = nir_imm_int(b, state->map.stride * 4);
                 attr_offset = nir_imm_int(b, state->map.loc[base] * 4);
-       } else if (b->shader->info.stage == MESA_SHADER_GEOMETRY) {
+               break;
+       case MESA_SHADER_TESS_CTRL:
+       case MESA_SHADER_GEOMETRY:
                 vertex_stride = nir_load_vs_vertex_stride_ir3(b);
                 attr_offset = nir_load_primitive_location_ir3(b, base);
-       } else {
+               break;
+       default:
                 unreachable("bad shader stage");
         }
  
-       nir_ssa_def *vertex_offset = nir_imul(b, vertex, vertex_stride);
+       nir_ssa_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);
  
         return nir_iadd(b, nir_iadd(b, primitive_offset, vertex_offset),
                         nir_iadd(b, attr_offset, offset));
@@ -137,7 +147,7 @@ replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
  static void
  build_primitive_map(nir_shader *shader, struct primitive_map *map, struct exec_list *list)
  {
-       nir_foreach_variable(var, list) {
+       nir_foreach_variable (var, list) {
                 switch (var->data.location) {
                 case VARYING_SLOT_TESS_LEVEL_OUTER:
                 case VARYING_SLOT_TESS_LEVEL_INNER:
@@ -169,9 +179,9 @@ build_primitive_map(nir_shader *shader, struct primitive_map *map, struct exec_l
  }
  
  static void
-lower_vs_block(nir_block *block, nir_builder *b, struct state *state)
+lower_block_to_explicit_output(nir_block *block, nir_builder *b, struct state *state)
  {
-       nir_foreach_instr_safe(instr, block) {
+       nir_foreach_instr_safe (instr, block) {
                 if (instr->type != nir_instr_type_intrinsic)
                         continue;
  
@@ -181,7 +191,14 @@ lower_vs_block(nir_block *block, nir_builder *b, struct state *state)
                 case nir_intrinsic_store_output: {
                         // src[] = { value, offset }.
  
-                       b->cursor = nir_before_instr(&intr->instr);
+                       /* nir_lower_io_to_temporaries replaces all access to output
+                        * variables with temp variables and then emits a nir_copy_var at
+                        * the end of the shader.  Thus, we should always get a full wrmask
+                        * here.
+                        */
+                       assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
+                       b->cursor = nir_instr_remove(&intr->instr);
  
                         nir_ssa_def *vertex_id = build_vertex_id(b, state);
                         nir_ssa_def *offset = build_local_offset(b, state, vertex_id, nir_intrinsic_base(intr),
@@ -189,10 +206,8 @@ lower_vs_block(nir_block *block, nir_builder *b, struct state *state)
                         nir_intrinsic_instr *store =
                                 nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_shared_ir3);
  
-                       nir_intrinsic_set_write_mask(store, MASK(intr->num_components));
                         store->src[0] = nir_src_for_ssa(intr->src[0].ssa);
                         store->src[1] = nir_src_for_ssa(offset);
-
                         store->num_components = intr->num_components;
  
                         nir_builder_instr_insert(b, &store->instr);
@@ -212,7 +227,7 @@ local_thread_id(nir_builder *b)
  }
  
  void
-ir3_nir_lower_vs_to_explicit_io(nir_shader *shader, struct ir3_shader *s)
+ir3_nir_lower_to_explicit_output(nir_shader *shader, struct ir3_shader *s, unsigned topology)
  {
         struct state state = { };
  
@@ -226,10 +241,13 @@ ir3_nir_lower_vs_to_explicit_io(nir_shader *shader, struct ir3_shader *s)
         nir_builder_init(&b, impl);
         b.cursor = nir_before_cf_list(&impl->body);
  
-       state.header = nir_load_gs_header_ir3(&b);
+       if (s->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
+               state.header = nir_load_tcs_header_ir3(&b);
+       else
+               state.header = nir_load_gs_header_ir3(&b);
  
-       nir_foreach_block_safe(block, impl)
-               lower_vs_block(block, &b, &state);
+       nir_foreach_block_safe (block, impl)
+               lower_block_to_explicit_output(block, &b, &state);
  
         nir_metadata_preserve(impl, nir_metadata_block_index |
                         nir_metadata_dominance);
@@ -237,90 +255,273 @@ ir3_nir_lower_vs_to_explicit_io(nir_shader *shader, struct ir3_shader *s)
         s->output_size = state.map.stride;
  }
  
+
  static void
-lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
+lower_block_to_explicit_input(nir_block *block, nir_builder *b, struct state *state)
  {
-       nir_intrinsic_instr *outputs[32] = {};
-
-       nir_foreach_instr_safe(instr, block) {
+       nir_foreach_instr_safe (instr, block) {
                 if (instr->type != nir_instr_type_intrinsic)
                         continue;
  
                 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
  
                 switch (intr->intrinsic) {
-               case nir_intrinsic_store_output: {
-                       // src[] = { value, offset }.
+               case nir_intrinsic_load_per_vertex_input: {
+                       // src[] = { vertex, offset }.
  
-                       uint32_t loc = nir_intrinsic_base(intr);
-                       outputs[loc] = intr;
+                       b->cursor = nir_before_instr(&intr->instr);
+
+                       nir_ssa_def *offset = build_local_offset(b, state,
+                                       intr->src[0].ssa, // this is typically gl_InvocationID
+                                       nir_intrinsic_base(intr),
+                                       intr->src[1].ssa);
+
+                       replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL, NULL);
                         break;
                 }
  
-               case nir_intrinsic_end_primitive: {
+               case nir_intrinsic_load_invocation_id: {
                         b->cursor = nir_before_instr(&intr->instr);
-                       nir_store_var(b, state->vertex_flags_var, nir_imm_int(b, 4), 0x1);
+
+                       nir_ssa_def *iid = build_invocation_id(b, state);
+                       nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(iid));
                         nir_instr_remove(&intr->instr);
                         break;
                 }
  
-               case nir_intrinsic_emit_vertex: {
+               default:
+                       break;
+               }
+       }
+}
  
-                       /* Load the vertex count */
-                       b->cursor = nir_before_instr(&intr->instr);
-                       nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);
+void
+ir3_nir_lower_to_explicit_input(nir_shader *shader)
+{
+       struct state state = { };
  
-                       nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));
+       nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+       assert(impl);
  
-                       for (uint32_t i = 0; i < ARRAY_SIZE(outputs); i++) {
-                               if (outputs[i]) {
-                                       nir_store_var(b, state->output_vars[i],
-                                                       outputs[i]->src[0].ssa,
-                                                       (1 << outputs[i]->num_components) - 1);
+       nir_builder b;
+       nir_builder_init(&b, impl);
+       b.cursor = nir_before_cf_list(&impl->body);
  
-                                       nir_instr_remove(&outputs[i]->instr);
-                               }
-                               outputs[i] = NULL;
-                       }
+       if (shader->info.stage == MESA_SHADER_GEOMETRY)
+               state.header = nir_load_gs_header_ir3(&b);
+       else
+               state.header = nir_load_tcs_header_ir3(&b);
+
+       nir_foreach_block_safe (block, impl)
+               lower_block_to_explicit_input(block, &b, &state);
+}
+
+
+static nir_ssa_def *
+build_per_vertex_offset(nir_builder *b, struct state *state,
+               nir_ssa_def *vertex, nir_ssa_def *offset, nir_variable *var)
+{
+       nir_ssa_def *primitive_id = nir_load_primitive_id(b);
+       nir_ssa_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
+       nir_ssa_def *patch_offset = nir_imul24(b, primitive_id, patch_stride);
+       nir_ssa_def *attr_offset;
+       int loc = var->data.driver_location;
+
+       switch (b->shader->info.stage) {
+       case MESA_SHADER_TESS_CTRL:
+               attr_offset = nir_imm_int(b, state->map.loc[loc]);
+               break;
+       case MESA_SHADER_TESS_EVAL:
+               attr_offset = nir_load_primitive_location_ir3(b, loc);
+               break;
+       default:
+               unreachable("bad shader state");
+       }
+
+       nir_ssa_def *attr_stride = nir_imm_int(b, state->map.size[loc]);
+       nir_ssa_def *vertex_offset = nir_imul24(b, vertex, attr_stride);
+
+       return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset),
+                       nir_iadd(b, vertex_offset, nir_ishl(b, offset, nir_imm_int(b, 2))));
+}
+
+static nir_ssa_def *
+build_patch_offset(nir_builder *b, struct state *state, nir_ssa_def *offset, nir_variable *var)
+{
+       debug_assert(var && var->data.patch);
+
+       return build_per_vertex_offset(b, state, nir_imm_int(b, 0), offset, var);
+}
+
+static nir_ssa_def *
+build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state)
+{
+       uint32_t inner_levels, outer_levels;
+       switch (state->topology) {
+       case IR3_TESS_TRIANGLES:
+               inner_levels = 1;
+               outer_levels = 3;
+               break;
+       case IR3_TESS_QUADS:
+               inner_levels = 2;
+               outer_levels = 4;
+               break;
+       case IR3_TESS_ISOLINES:
+               inner_levels = 0;
+               outer_levels = 2;
+               break;
+       default:
+               unreachable("bad");
+       }
+
+       const uint32_t patch_stride = 1 + inner_levels + outer_levels;
+
+       nir_ssa_def *primitive_id = nir_load_primitive_id(b);
+
+       nir_ssa_def *patch_offset = nir_imul24(b, primitive_id, nir_imm_int(b, patch_stride));
+
+       uint32_t offset;
+       switch (slot) {
+       case VARYING_SLOT_TESS_LEVEL_OUTER:
+               /* There's some kind of header dword, tess levels start at index 1. */
+               offset = 1;
+               break;
+       case VARYING_SLOT_TESS_LEVEL_INNER:
+               offset = 1 + outer_levels;
+               break;
+       default:
+               unreachable("bad");
+       }
  
+       return nir_iadd(b, patch_offset, nir_imm_int(b, offset));
+}
+
+static void
+lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
+{
+       nir_foreach_instr_safe (instr, block) {
+               if (instr->type != nir_instr_type_intrinsic)
+                       continue;
+
+               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+               switch (intr->intrinsic) {
+               case nir_intrinsic_control_barrier:
+               case nir_intrinsic_memory_barrier_tcs_patch:
+                       /* Hull shaders dispatch 32 wide so an entire patch will always
+                        * fit in a single warp and execute in lock-step.  Consequently,
+                        * we don't need to do anything for TCS barriers so just remove
+                        * the intrinsic. Otherwise we'll emit an actual barrier
+                        * instructions, which will deadlock.
+                        */
                         nir_instr_remove(&intr->instr);
+                       break;
  
-                       nir_store_var(b, state->emitted_vertex_var,
-                                       nir_iadd(b, nir_load_var(b, state->emitted_vertex_var), nir_imm_int(b, 1)), 0x1);
+               case nir_intrinsic_load_per_vertex_output: {
+                       // src[] = { vertex, offset }.
  
-                       nir_store_var(b, state->vertex_flags_out,
-                                       nir_load_var(b, state->vertex_flags_var), 0x1);
+                       b->cursor = nir_before_instr(&intr->instr);
  
-                       nir_pop_if(b, NULL);
+                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+                       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+                       nir_ssa_def *offset = build_per_vertex_offset(b, state,
+                                       intr->src[0].ssa, intr->src[1].ssa, var);
  
-                       /* Increment the vertex count by 1 */
-                       nir_store_var(b, state->vertex_count_var,
-                                       nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
-                       nir_store_var(b, state->vertex_flags_var, nir_imm_int(b, 0), 0x1);
+                       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
+                       break;
+               }
+
+               case nir_intrinsic_store_per_vertex_output: {
+                       // src[] = { value, vertex, offset }.
+
+                       b->cursor = nir_before_instr(&intr->instr);
+
+                       /* nir_lower_io_to_temporaries replaces all access to output
+                        * variables with temp variables and then emits a nir_copy_var at
+                        * the end of the shader.  Thus, we should always get a full wrmask
+                        * here.
+                        */
+                       assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
+                       nir_ssa_def *value = intr->src[0].ssa;
+                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+                       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+                       nir_ssa_def *offset = build_per_vertex_offset(b, state,
+                                       intr->src[1].ssa, intr->src[2].ssa, var);
+
+                       replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value, address,
+                                       nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr))));
  
                         break;
                 }
  
-               case nir_intrinsic_load_per_vertex_input: {
-                       // src[] = { vertex, offset }.
+               case nir_intrinsic_load_tess_level_inner:
+               case nir_intrinsic_load_tess_level_outer: {
+                       b->cursor = nir_before_instr(&intr->instr);
+
+                       gl_varying_slot slot;
+                       if (intr->intrinsic == nir_intrinsic_load_tess_level_inner)
+                               slot = VARYING_SLOT_TESS_LEVEL_INNER;
+                       else
+                               slot = VARYING_SLOT_TESS_LEVEL_OUTER;
+
+                       nir_ssa_def *address = nir_load_tess_factor_base_ir3(b);
+                       nir_ssa_def *offset = build_tessfactor_base(b, slot, state);
+
+                       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
+                       break;
+               }
+
+               case nir_intrinsic_load_output: {
+                       // src[] = { offset }.
+
+                       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
  
                         b->cursor = nir_before_instr(&intr->instr);
  
-                       nir_ssa_def *offset = build_local_offset(b, state,
-                                       intr->src[0].ssa, // this is typically gl_InvocationID
-                                       nir_intrinsic_base(intr),
-                                       intr->src[1].ssa);
+                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+                       nir_ssa_def *offset = build_patch_offset(b, state, intr->src[0].ssa, var);
  
-                       replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL, NULL);
+                       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
                         break;
                 }
  
-               case nir_intrinsic_load_invocation_id: {
+               case nir_intrinsic_store_output: {
+                       // src[] = { value, offset }.
+
+                       /* write patch output to bo */
+
+                       nir_variable *var = get_var(&b->shader->outputs, nir_intrinsic_base(intr));
+
+                       nir_ssa_def **levels = NULL;
+                       if (var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
+                               levels = state->outer_levels;
+                       else if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER)
+                               levels = state->inner_levels;
+
                         b->cursor = nir_before_instr(&intr->instr);
  
-                       nir_ssa_def *iid = build_invocation_id(b, state);
-                       nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(iid));
-                       nir_instr_remove(&intr->instr);
+                       if (levels) {
+                               for (int i = 0; i < 4; i++)
+                                       if (nir_intrinsic_write_mask(intr) & (1 << i))
+                                               levels[i] = nir_channel(b, intr->src[0].ssa, i);
+                               nir_instr_remove(&intr->instr);
+                       } else {
+                               nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+                               nir_ssa_def *offset = build_patch_offset(b, state, intr->src[1].ssa, var);
+
+                               debug_assert(nir_intrinsic_component(intr) == 0);
+
+                               /* nir_lower_io_to_temporaries replaces all access to output
+                                * variables with temp variables and then emits a nir_copy_var at
+                                * the end of the shader.  Thus, we should always get a full wrmask
+                                * here.
+                                */
+                               assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
+
+                               replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
+                                               intr->src[0].ssa, address, offset);
+                       }
                         break;
                 }
  
@@ -331,49 +532,320 @@ lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
  }
  
  static void
-emit_store_outputs(nir_builder *b, struct state *state)
+emit_tess_epilouge(nir_builder *b, struct state *state)
  {
-       /* This also stores the internally added vertex_flags output. */
+       nir_ssa_def *tessfactor_address = nir_load_tess_factor_base_ir3(b);
+       nir_ssa_def *levels[2];
+
+       if (!state->outer_levels[0])
+               return;
+
+       /* Then emit the epilogue that actually writes out the tessellation levels
+        * to the BOs.
+        */
+       switch (state->topology) {
+       case IR3_TESS_TRIANGLES:
+               levels[0] = nir_vec4(b, state->outer_levels[0], state->outer_levels[1],
+                               state->outer_levels[2], state->inner_levels[0]);
+               levels[1] = NULL;
+               break;
+       case IR3_TESS_QUADS:
+               levels[0] = nir_vec4(b, state->outer_levels[0], state->outer_levels[1],
+                               state->outer_levels[2], state->outer_levels[3]);
+               levels[1] = nir_vec2(b, state->inner_levels[0], state->inner_levels[1]);
+               break;
+       case IR3_TESS_ISOLINES:
+               levels[0] = nir_vec2(b, state->outer_levels[0], state->outer_levels[1]);
+               levels[1] = NULL;
+               break;
+       default:
+               unreachable("nope");
+       }
  
-       for (uint32_t i = 0; i < ARRAY_SIZE(state->output_vars); i++) {
-               if (!state->output_vars[i])
-                       continue;
+       nir_ssa_def *offset = build_tessfactor_base(b, VARYING_SLOT_TESS_LEVEL_OUTER, state);
  
-               nir_intrinsic_instr *store =
-                       nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+       nir_intrinsic_instr *store =
+               nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3);
  
-               nir_intrinsic_set_base(store, i);
-               store->src[0] = nir_src_for_ssa(nir_load_var(b, state->output_vars[i]));
-               store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
-               store->num_components = store->src[0].ssa->num_components;
+       store->src[0] = nir_src_for_ssa(levels[0]);
+       store->src[1] = nir_src_for_ssa(tessfactor_address);
+       store->src[2] = nir_src_for_ssa(offset);
+       nir_builder_instr_insert(b, &store->instr);
+       store->num_components = levels[0]->num_components;
  
+       if (levels[1]) {
+               store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_global_ir3);
+               offset = nir_iadd(b, offset, nir_imm_int(b, levels[0]->num_components));
+
+               store->src[0] = nir_src_for_ssa(levels[1]);
+               store->src[1] = nir_src_for_ssa(tessfactor_address);
+               store->src[2] = nir_src_for_ssa(offset);
                 nir_builder_instr_insert(b, &store->instr);
+               store->num_components = levels[1]->num_components;
         }
+
+       /* Finally, Insert endpatch instruction:
+        *
+        * TODO we should re-work this to use normal flow control.
+        */
+
+       nir_intrinsic_instr *end_patch =
+               nir_intrinsic_instr_create(b->shader, nir_intrinsic_end_patch_ir3);
+       nir_builder_instr_insert(b, &end_patch->instr);
  }
  
+void
+ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader *s, unsigned topology)
+{
+       struct state state = { .topology = topology };
+
+       if (shader_debug_enabled(shader->info.stage)) {
+               fprintf(stderr, "NIR (before tess lowering) for %s shader:\n",
+                               _mesa_shader_stage_to_string(shader->info.stage));
+               nir_print_shader(shader, stderr);
+       }
+
+       build_primitive_map(shader, &state.map, &shader->outputs);
+       memcpy(s->output_loc, state.map.loc, sizeof(s->output_loc));
+       s->output_size = state.map.stride;
+
+       nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+       assert(impl);
+
+       nir_builder b;
+       nir_builder_init(&b, impl);
+       b.cursor = nir_before_cf_list(&impl->body);
+
+       state.header = nir_load_tcs_header_ir3(&b);
+
+       nir_foreach_block_safe (block, impl)
+               lower_tess_ctrl_block(block, &b, &state);
+
+       /* Now move the body of the TCS into a conditional:
+        *
+        *   if (gl_InvocationID < num_vertices)
+        *     // body
+        *
+        */
+
+       nir_cf_list body;
+       nir_cf_extract(&body, nir_before_cf_list(&impl->body),
+                                  nir_after_cf_list(&impl->body));
+
+       b.cursor = nir_after_cf_list(&impl->body);
+
+       /* Re-emit the header, since the old one got moved into the if branch */
+       state.header = nir_load_tcs_header_ir3(&b);
+       nir_ssa_def *iid = build_invocation_id(&b, &state);
+
+       const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
+       nir_ssa_def *cond = nir_ult(&b, iid, nir_imm_int(&b, nvertices));
+
+       nir_if *nif = nir_push_if(&b, cond);
+
+       nir_cf_reinsert(&body, b.cursor);
+
+       b.cursor = nir_after_cf_list(&nif->then_list);
+
+       /* Insert conditional exit for threads invocation id != 0 */
+       nir_ssa_def *iid0_cond = nir_ieq(&b, iid, nir_imm_int(&b, 0));
+       nir_intrinsic_instr *cond_end =
+               nir_intrinsic_instr_create(shader, nir_intrinsic_cond_end_ir3);
+       cond_end->src[0] = nir_src_for_ssa(iid0_cond);
+       nir_builder_instr_insert(&b, &cond_end->instr);
+
+       emit_tess_epilouge(&b, &state);
+
+       nir_pop_if(&b, nif);
+
+       nir_metadata_preserve(impl, 0);
+}
+
+
  static void
-clean_up_split_vars(nir_shader *shader, struct exec_list *list)
+lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
  {
-       uint32_t components[32] = {};
+       nir_foreach_instr_safe (instr, block) {
+               if (instr->type != nir_instr_type_intrinsic)
+                       continue;
+
+               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+               switch (intr->intrinsic) {
+               case nir_intrinsic_load_tess_coord: {
+                       b->cursor = nir_after_instr(&intr->instr);
+                       nir_ssa_def *x = nir_channel(b, &intr->dest.ssa, 0);
+                       nir_ssa_def *y = nir_channel(b, &intr->dest.ssa, 1);
+                       nir_ssa_def *z;
+
+                       if (state->topology == IR3_TESS_TRIANGLES)
+                               z = nir_fsub(b, nir_fsub(b, nir_imm_float(b, 1.0f), y), x);
+                       else
+                               z = nir_imm_float(b, 0.0f);
+
+                       nir_ssa_def *coord = nir_vec3(b, x, y, z);
+
+                       nir_ssa_def_rewrite_uses_after(&intr->dest.ssa,
+                                       nir_src_for_ssa(coord),
+                                       b->cursor.instr);
+                       break;
+               }
+
+               case nir_intrinsic_load_per_vertex_input: {
+                       // src[] = { vertex, offset }.
  
-       nir_foreach_variable(var, list) {
-               uint32_t mask =
-                       ((1 << glsl_get_components(glsl_without_array(var->type))) - 1) << var->data.location_frac;
-               components[var->data.driver_location] |= mask;
+                       b->cursor = nir_before_instr(&intr->instr);
+
+                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+                       nir_variable *var = get_var(&b->shader->inputs, nir_intrinsic_base(intr));
+                       nir_ssa_def *offset = build_per_vertex_offset(b, state,
+                                       intr->src[0].ssa, intr->src[1].ssa, var);
+
+                       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
+                       break;
+               }
+
+               case nir_intrinsic_load_tess_level_inner:
+               case nir_intrinsic_load_tess_level_outer: {
+                               b->cursor = nir_before_instr(&intr->instr);
+
+                               gl_varying_slot slot;
+                               if (intr->intrinsic == nir_intrinsic_load_tess_level_inner)
+                                       slot = VARYING_SLOT_TESS_LEVEL_INNER;
+                               else
+                                       slot = VARYING_SLOT_TESS_LEVEL_OUTER;
+
+                               nir_ssa_def *address = nir_load_tess_factor_base_ir3(b);
+                               nir_ssa_def *offset = build_tessfactor_base(b, slot, state);
+
+                               /* Loading across a vec4 (16b) memory boundary is problematic
+                                * if we don't use components from the second vec4.  The tess
+                                * levels aren't guaranteed to be vec4 aligned and we don't
+                                * know which levels are actually used, so we load each
+                                * component individually.
+                                */
+                               nir_ssa_def *levels[4];
+                               for (unsigned i = 0; i < intr->num_components; i++) {
+                                       nir_intrinsic_instr *new_intr =
+                                               nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_global_ir3);
+
+                                       new_intr->src[0] = nir_src_for_ssa(address);
+                                       new_intr->src[1] = nir_src_for_ssa(nir_iadd(b, offset, nir_imm_int(b, i)));
+                                       new_intr->num_components = 1;
+                                       nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, 1, 32, NULL);
+                                       nir_builder_instr_insert(b, &new_intr->instr);
+                                       levels[i] = &new_intr->dest.ssa;
+                               }
+
+                               nir_ssa_def *v = nir_vec(b, levels, intr->num_components);
+
+                               nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(v));
+
+                               nir_instr_remove(&intr->instr);
+                               break;
+               }
+
+               case nir_intrinsic_load_input: {
+                       // src[] = { offset }.
+
+                       nir_variable *var = get_var(&b->shader->inputs, nir_intrinsic_base(intr));
+
+                       debug_assert(var->data.patch);
+
+                       b->cursor = nir_before_instr(&intr->instr);
+
+                       nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
+                       nir_ssa_def *offset = build_patch_offset(b, state, intr->src[0].ssa, var);
+
+                       replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
+                       break;
+               }
+
+               default:
+                       break;
+               }
         }
+}
+
+void
+ir3_nir_lower_tess_eval(nir_shader *shader, unsigned topology)
+{
+       struct state state = { .topology = topology };
+
+       if (shader_debug_enabled(shader->info.stage)) {
+               fprintf(stderr, "NIR (before tess lowering) for %s shader:\n",
+                               _mesa_shader_stage_to_string(shader->info.stage));
+               nir_print_shader(shader, stderr);
+       }
+
+       /* Build map of inputs so we have the sizes. */
+       build_primitive_map(shader, &state.map, &shader->inputs);
  
-       nir_foreach_variable_safe(var, list) {
-               uint32_t mask =
-                       ((1 << glsl_get_components(glsl_without_array(var->type))) - 1) << var->data.location_frac;
-               bool subset =
-                       (components[var->data.driver_location] | mask) != mask;
-               if (subset)
-                       exec_node_remove(&var->node);
+       nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+       assert(impl);
+
+       nir_builder b;
+       nir_builder_init(&b, impl);
+
+       nir_foreach_block_safe (block, impl)
+               lower_tess_eval_block(block, &b, &state);
+
+       nir_metadata_preserve(impl, 0);
+}
+
+static void
+lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
+{
+       nir_foreach_instr_safe (instr, block) {
+               if (instr->type != nir_instr_type_intrinsic)
+                       continue;
+
+               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+               switch (intr->intrinsic) {
+               case nir_intrinsic_end_primitive: {
+                       b->cursor = nir_before_instr(&intr->instr);
+                       nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
+                       nir_instr_remove(&intr->instr);
+                       break;
+               }
+
+               case nir_intrinsic_emit_vertex: {
+                       /* Load the vertex count */
+                       b->cursor = nir_before_instr(&intr->instr);
+                       nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);
+
+                       nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));
+
+                       foreach_two_lists(dest_node, &state->emit_outputs, src_node, &state->old_outputs) {
+                               nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
+                               nir_variable *src = exec_node_data(nir_variable, src_node, node);
+                               nir_copy_var(b, dest, src);
+                       }
+
+                       nir_instr_remove(&intr->instr);
+
+                       nir_store_var(b, state->emitted_vertex_var,
+                                       nir_iadd(b, nir_load_var(b, state->emitted_vertex_var), nir_imm_int(b, 1)), 0x1);
+
+                       nir_pop_if(b, NULL);
+
+                       /* Increment the vertex count by 1 */
+                       nir_store_var(b, state->vertex_count_var,
+                                       nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
+                       nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);
+
+                       break;
+               }
+
+               default:
+                       break;
+               }
         }
  }
  
  void
-ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s)
+ir3_nir_lower_gs(nir_shader *shader)
  {
         struct state state = { };
  
@@ -382,21 +854,17 @@ ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s)
                 nir_print_shader(shader, stderr);
         }
  
-       clean_up_split_vars(shader, &shader->inputs);
-       clean_up_split_vars(shader, &shader->outputs);
-
         build_primitive_map(shader, &state.map, &shader->inputs);
  
-       uint32_t loc = 0;
-       nir_foreach_variable(var, &shader->outputs) {
-               uint32_t end = var->data.driver_location + glsl_count_attribute_slots(var->type, false);
-               loc = MAX2(loc, end);
-       }
-
+       /* Create an output var for vertex_flags. This will be shadowed below,
+        * same way regular outputs get shadowed, and this variable will become a
+        * temporary.
+        */
         state.vertex_flags_out = nir_variable_create(shader, nir_var_shader_out,
                         glsl_uint_type(), "vertex_flags");
-       state.vertex_flags_out->data.driver_location = loc;
+       state.vertex_flags_out->data.driver_location = shader->num_outputs++;
         state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
+       state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;
  
         nir_function_impl *impl = nir_shader_get_entrypoint(shader);
         assert(impl);
@@ -407,27 +875,50 @@ ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s)
  
         state.header = nir_load_gs_header_ir3(&b);
  
-       nir_foreach_variable(var, &shader->outputs) {
-               state.output_vars[var->data.driver_location] = 
-                       nir_local_variable_create(impl, var->type,
-                                       ralloc_asprintf(var, "%s:gs-temp", var->name));
+       /* Generate two set of shadow vars for the output variables.  The first
+        * set replaces the real outputs and the second set (emit_outputs) we'll
+        * assign in the emit_vertex conditionals.  Then at the end of the shader
+        * we copy the emit_outputs to the real outputs, so that we get
+        * store_output in uniform control flow.
+        */
+       exec_list_move_nodes_to(&shader->outputs, &state.old_outputs);
+       exec_list_make_empty(&state.emit_outputs);
+       nir_foreach_variable(var, &state.old_outputs) {
+               /* Create a new output var by cloning the original output var and
+                * stealing the name.
+                */
+               nir_variable *output = nir_variable_clone(var, shader);
+               exec_list_push_tail(&shader->outputs, &output->node);
+
+               /* Rewrite the original output to be a shadow variable. */
+               var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
+               var->data.mode = nir_var_shader_temp;
+
+               /* Clone the shadow variable to create the emit shadow variable that
+                * we'll assign in the emit conditionals.
+                */
+               nir_variable *emit_output = nir_variable_clone(var, shader);
+               emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
+               exec_list_push_tail(&state.emit_outputs, &emit_output->node);
         }
  
+       /* During the shader we'll keep track of which vertex we're currently
+        * emitting for the EmitVertex test and how many vertices we emitted so we
+        * know to discard if didn't emit any.  In most simple shaders, this can
+        * all be statically determined and gets optimized away.
+        */
         state.vertex_count_var =
                 nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
         state.emitted_vertex_var =
                 nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");
-       state.vertex_flags_var =
-               nir_local_variable_create(impl, glsl_uint_type(), "vertex_flags");
-       state.vertex_flags_out = state.output_vars[state.vertex_flags_out->data.driver_location];
  
-       /* initialize to 0 */
+       /* Initialize to 0. */
         b.cursor = nir_before_cf_list(&impl->body);
         nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
         nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
-       nir_store_var(&b, state.vertex_flags_var, nir_imm_int(&b, 4), 0x1);
+       nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
  
-       nir_foreach_block_safe(block, impl)
+       nir_foreach_block_safe (block, impl)
                 lower_gs_block(block, &b, &state);
  
         set_foreach(impl->end_block->predecessors, block_entry) {
@@ -443,13 +934,62 @@ ir3_nir_lower_gs(nir_shader *shader, struct ir3_shader *s)
  
                 nir_builder_instr_insert(&b, &discard_if->instr);
  
-               emit_store_outputs(&b, &state);
+               foreach_two_lists(dest_node, &shader->outputs, src_node, &state.emit_outputs) {
+                       nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
+                       nir_variable *src = exec_node_data(nir_variable, src_node, node);
+                       nir_copy_var(&b, dest, src);
+               }
         }
  
+       exec_list_append(&shader->globals, &state.old_outputs);
+       exec_list_append(&shader->globals, &state.emit_outputs);
+
         nir_metadata_preserve(impl, 0);
  
+       nir_lower_global_vars_to_local(shader);
+       nir_split_var_copies(shader);
+       nir_lower_var_copies(shader);
+
+       nir_fixup_deref_modes(shader);
+
         if (shader_debug_enabled(shader->info.stage)) {
                 fprintf(stderr, "NIR (after gs lowering):\n");
                 nir_print_shader(shader, stderr);
         }
  }
+
+uint32_t
+ir3_link_geometry_stages(const struct ir3_shader_variant *producer,
+               const struct ir3_shader_variant *consumer,
+               uint32_t *locs)
+{
+       uint32_t num_loc = 0, factor;
+
+       switch (consumer->type) {
+       case MESA_SHADER_TESS_CTRL:
+       case MESA_SHADER_GEOMETRY:
+               /* These stages load with ldlw, which expects byte offsets. */
+               factor = 4;
+               break;
+       case MESA_SHADER_TESS_EVAL:
+               /* The tess eval shader uses ldg, which takes dword offsets. */
+               factor = 1;
+               break;
+       default:
+               unreachable("bad shader stage");
+       }
+
+       nir_foreach_variable(in_var, &consumer->shader->nir->inputs) {
+               nir_foreach_variable(out_var, &producer->shader->nir->outputs) {
+                       if (in_var->data.location == out_var->data.location) {
+                               locs[in_var->data.driver_location] =
+                                       producer->shader->output_loc[out_var->data.driver_location] * factor;
+
+                               debug_assert(num_loc <= in_var->data.driver_location + 1);
+                               num_loc = in_var->data.driver_location + 1;
+                       }
+               }
+       }
+
+       return num_loc;
+}