gallivm: add base instance sysval support
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_tgsi_soa.c
index d6625c422765cd9d811e988e32ffd27d0c127f38..a4606a67615fa7bc7036b733c5dd862f60daf01e 100644 (file)
@@ -60,6 +60,7 @@
 #include "lp_bld_misc.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_coro.h"
 #include "lp_bld_quad.h"
 #include "lp_bld_tgsi.h"
 #include "lp_bld_limits.h"
 #include "lp_bld_sample.h"
 #include "lp_bld_struct.h"
 
-/* SM 4.0 says that subroutines can nest 32 deep and 
- * we need one more for our main function */
-#define LP_MAX_NUM_FUNCS 33
-
 #define DUMP_GS_EMITS 0
 
 /*
@@ -104,10 +101,6 @@ emit_dump_reg(struct gallivm_state *gallivm,
    lp_build_print_value(gallivm, buf, value);
 }
 
-/*
- * Return the context for the current function.
- * (always 'main', if shader doesn't do any function calls)
- */
 static inline struct function_ctx *
 func_ctx(struct lp_exec_mask *mask)
 {
@@ -116,24 +109,6 @@ func_ctx(struct lp_exec_mask *mask)
    return &mask->function_stack[mask->function_stack_size - 1];
 }
 
-/*
- * Returns true if we're in a loop.
- * It's global, meaning that it returns true even if there's
- * no loop inside the current function, but we were inside
- * a loop inside another function, from which this one was called.
- */
-static inline boolean
-mask_has_loop(struct lp_exec_mask *mask)
-{
-   int i;
-   for (i = mask->function_stack_size - 1; i >= 0; --i) {
-      const struct function_ctx *ctx = &mask->function_stack[i];
-      if (ctx->loop_stack_size > 0)
-         return TRUE;
-   }
-   return FALSE;
-}
-
 /*
  * combine the execution mask if there is one with the current mask.
  */
@@ -153,370 +128,14 @@ mask_vec(struct lp_build_tgsi_context *bld_base)
                        exec_mask->exec_mask, "");
 }
 
-/*
- * Returns true if we're inside a switch statement.
- * It's global, meaning that it returns true even if there's
- * no switch in the current function, but we were inside
- * a switch inside another function, from which this one was called.
- */
-static inline boolean
-mask_has_switch(struct lp_exec_mask *mask)
-{
-   int i;
-   for (i = mask->function_stack_size - 1; i >= 0; --i) {
-      const struct function_ctx *ctx = &mask->function_stack[i];
-      if (ctx->switch_stack_size > 0)
-         return TRUE;
-   }
-   return FALSE;
-}
-
-/*
- * Returns true if we're inside a conditional.
- * It's global, meaning that it returns true even if there's
- * no conditional in the current function, but we were inside
- * a conditional inside another function, from which this one was called.
- */
-static inline boolean
-mask_has_cond(struct lp_exec_mask *mask)
-{
-   int i;
-   for (i = mask->function_stack_size - 1; i >= 0; --i) {
-      const struct function_ctx *ctx = &mask->function_stack[i];
-      if (ctx->cond_stack_size > 0)
-         return TRUE;
-   }
-   return FALSE;
-}
-
-
-/*
- * Initialize a function context at the specified index.
- */
-static void
-lp_exec_mask_function_init(struct lp_exec_mask *mask, int function_idx)
-{
-   LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context);
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx =  &mask->function_stack[function_idx];
-
-   ctx->cond_stack_size = 0;
-   ctx->loop_stack_size = 0;
-   ctx->switch_stack_size = 0;
-
-   if (function_idx == 0) {
-      ctx->ret_mask = mask->ret_mask;
-   }
-
-   ctx->loop_limiter = lp_build_alloca(mask->bld->gallivm,
-                                       int_type, "looplimiter");
-   LLVMBuildStore(
-      builder,
-      LLVMConstInt(int_type, LP_MAX_TGSI_LOOP_ITERATIONS, false),
-      ctx->loop_limiter);
-}
-
-static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
-{
-   mask->bld = bld;
-   mask->has_mask = FALSE;
-   mask->ret_in_main = FALSE;
-   /* For the main function */
-   mask->function_stack_size = 1;
-
-   mask->int_vec_type = lp_build_int_vec_type(bld->gallivm, mask->bld->type);
-   mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask =
-         mask->cond_mask = mask->switch_mask =
-         LLVMConstAllOnes(mask->int_vec_type);
-
-   mask->function_stack = CALLOC(LP_MAX_NUM_FUNCS,
-                                 sizeof(mask->function_stack[0]));
-   lp_exec_mask_function_init(mask, 0);
-}
-
-static void
-lp_exec_mask_fini(struct lp_exec_mask *mask)
-{
-   FREE(mask->function_stack);
-}
-
-static void lp_exec_mask_update(struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   boolean has_loop_mask = mask_has_loop(mask);
-   boolean has_cond_mask = mask_has_cond(mask);
-   boolean has_switch_mask = mask_has_switch(mask);
-   boolean has_ret_mask = mask->function_stack_size > 1 ||
-         mask->ret_in_main;
-
-   if (has_loop_mask) {
-      /*for loops we need to update the entire mask at runtime */
-      LLVMValueRef tmp;
-      assert(mask->break_mask);
-      tmp = LLVMBuildAnd(builder,
-                         mask->cont_mask,
-                         mask->break_mask,
-                         "maskcb");
-      mask->exec_mask = LLVMBuildAnd(builder,
-                                     mask->cond_mask,
-                                     tmp,
-                                     "maskfull");
-   } else
-      mask->exec_mask = mask->cond_mask;
-
-   if (has_switch_mask) {
-      mask->exec_mask = LLVMBuildAnd(builder,
-                                     mask->exec_mask,
-                                     mask->switch_mask,
-                                     "switchmask");
-   }
-
-   if (has_ret_mask) {
-      mask->exec_mask = LLVMBuildAnd(builder,
-                                     mask->exec_mask,
-                                     mask->ret_mask,
-                                     "callmask");
-   }
-
-   mask->has_mask = (has_cond_mask ||
-                     has_loop_mask ||
-                     has_switch_mask ||
-                     has_ret_mask);
-}
-
-static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
-                                   LLVMValueRef val)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx = func_ctx(mask);
-
-   if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING) {
-      ctx->cond_stack_size++;
-      return;
-   }
-   if (ctx->cond_stack_size == 0 && mask->function_stack_size == 1) {
-      assert(mask->cond_mask == LLVMConstAllOnes(mask->int_vec_type));
-   }
-   ctx->cond_stack[ctx->cond_stack_size++] = mask->cond_mask;
-   assert(LLVMTypeOf(val) == mask->int_vec_type);
-   mask->cond_mask = LLVMBuildAnd(builder,
-                                  mask->cond_mask,
-                                  val,
-                                  "");
-   lp_exec_mask_update(mask);
-}
-
-static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx = func_ctx(mask);
-   LLVMValueRef prev_mask;
-   LLVMValueRef inv_mask;
-
-   assert(ctx->cond_stack_size);
-   if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING)
-      return;
-   prev_mask = ctx->cond_stack[ctx->cond_stack_size - 1];
-   if (ctx->cond_stack_size == 1 && mask->function_stack_size == 1) {
-      assert(prev_mask == LLVMConstAllOnes(mask->int_vec_type));
-   }
-
-   inv_mask = LLVMBuildNot(builder, mask->cond_mask, "");
-
-   mask->cond_mask = LLVMBuildAnd(builder,
-                                  inv_mask,
-                                  prev_mask, "");
-   lp_exec_mask_update(mask);
-}
-
-static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
-{
-   struct function_ctx *ctx = func_ctx(mask);
-   assert(ctx->cond_stack_size);
-   --ctx->cond_stack_size;
-   if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING)
-      return;
-   mask->cond_mask = ctx->cond_stack[ctx->cond_stack_size];
-   lp_exec_mask_update(mask);
-}
-
-static void lp_exec_bgnloop(struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx = func_ctx(mask);
-
-   if (ctx->loop_stack_size >= LP_MAX_TGSI_NESTING) {
-      ++ctx->loop_stack_size;
-      return;
-   }
-
-   ctx->break_type_stack[ctx->loop_stack_size + ctx->switch_stack_size] =
-      ctx->break_type;
-   ctx->break_type = LP_EXEC_MASK_BREAK_TYPE_LOOP;
-
-   ctx->loop_stack[ctx->loop_stack_size].loop_block = ctx->loop_block;
-   ctx->loop_stack[ctx->loop_stack_size].cont_mask = mask->cont_mask;
-   ctx->loop_stack[ctx->loop_stack_size].break_mask = mask->break_mask;
-   ctx->loop_stack[ctx->loop_stack_size].break_var = ctx->break_var;
-   ++ctx->loop_stack_size;
-
-   ctx->break_var = lp_build_alloca(mask->bld->gallivm, mask->int_vec_type, "");
-   LLVMBuildStore(builder, mask->break_mask, ctx->break_var);
-
-   ctx->loop_block = lp_build_insert_new_block(mask->bld->gallivm, "bgnloop");
-
-   LLVMBuildBr(builder, ctx->loop_block);
-   LLVMPositionBuilderAtEnd(builder, ctx->loop_block);
-
-   mask->break_mask = LLVMBuildLoad(builder, ctx->break_var, "");
-
-   lp_exec_mask_update(mask);
-}
-
-static void lp_exec_break(struct lp_exec_mask *mask,
+static void lp_exec_tgsi_break(struct lp_exec_mask *mask,
                           struct lp_build_tgsi_context * bld_base)
 {
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx = func_ctx(mask);
-
-   if (ctx->break_type == LP_EXEC_MASK_BREAK_TYPE_LOOP) {
-      LLVMValueRef exec_mask = LLVMBuildNot(builder,
-                                            mask->exec_mask,
-                                            "break");
-
-      mask->break_mask = LLVMBuildAnd(builder,
-                                      mask->break_mask,
-                                      exec_mask, "break_full");
-   }
-   else {
-      enum tgsi_opcode opcode =
-         bld_base->instructions[bld_base->pc + 1].Instruction.Opcode;
-      boolean break_always = (opcode == TGSI_OPCODE_ENDSWITCH ||
-                              opcode == TGSI_OPCODE_CASE);
-
-
-      if (ctx->switch_in_default) {
-         /*
-          * stop default execution but only if this is an unconditional switch.
-          * (The condition here is not perfect since dead code after break is
-          * allowed but should be sufficient since false negatives are just
-          * unoptimized - so we don't have to pre-evaluate that).
-          */
-         if(break_always && ctx->switch_pc) {
-            bld_base->pc = ctx->switch_pc;
-            return;
-         }
-      }
-
-      if (break_always) {
-         mask->switch_mask = LLVMConstNull(mask->bld->int_vec_type);
-      }
-      else {
-         LLVMValueRef exec_mask = LLVMBuildNot(builder,
-                                               mask->exec_mask,
-                                               "break");
-         mask->switch_mask = LLVMBuildAnd(builder,
-                                          mask->switch_mask,
-                                          exec_mask, "break_switch");
-      }
-   }
-
-   lp_exec_mask_update(mask);
-}
-
-static void lp_exec_continue(struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   LLVMValueRef exec_mask = LLVMBuildNot(builder,
-                                         mask->exec_mask,
-                                         "");
-
-   mask->cont_mask = LLVMBuildAnd(builder,
-                                  mask->cont_mask,
-                                  exec_mask, "");
-
-   lp_exec_mask_update(mask);
-}
-
-
-static void lp_exec_endloop(struct gallivm_state *gallivm,
-                            struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx = func_ctx(mask);
-   LLVMBasicBlockRef endloop;
-   LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context);
-   LLVMTypeRef reg_type = LLVMIntTypeInContext(gallivm->context,
-                                               mask->bld->type.width *
-                                               mask->bld->type.length);
-   LLVMValueRef i1cond, i2cond, icond, limiter;
-
-   assert(mask->break_mask);
-
-   
-   assert(ctx->loop_stack_size);
-   if (ctx->loop_stack_size > LP_MAX_TGSI_NESTING) {
-      --ctx->loop_stack_size;
-      return;
-   }
-
-   /*
-    * Restore the cont_mask, but don't pop
-    */
-   mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size - 1].cont_mask;
-   lp_exec_mask_update(mask);
-
-   /*
-    * Unlike the continue mask, the break_mask must be preserved across loop
-    * iterations
-    */
-   LLVMBuildStore(builder, mask->break_mask, ctx->break_var);
-
-   /* Decrement the loop limiter */
-   limiter = LLVMBuildLoad(builder, ctx->loop_limiter, "");
-
-   limiter = LLVMBuildSub(
-      builder,
-      limiter,
-      LLVMConstInt(int_type, 1, false),
-      "");
-
-   LLVMBuildStore(builder, limiter, ctx->loop_limiter);
-
-   /* i1cond = (mask != 0) */
-   i1cond = LLVMBuildICmp(
-      builder,
-      LLVMIntNE,
-      LLVMBuildBitCast(builder, mask->exec_mask, reg_type, ""),
-      LLVMConstNull(reg_type), "i1cond");
-
-   /* i2cond = (looplimiter > 0) */
-   i2cond = LLVMBuildICmp(
-      builder,
-      LLVMIntSGT,
-      limiter,
-      LLVMConstNull(int_type), "i2cond");
-
-   /* if( i1cond && i2cond ) */
-   icond = LLVMBuildAnd(builder, i1cond, i2cond, "");
-
-   endloop = lp_build_insert_new_block(mask->bld->gallivm, "endloop");
-
-   LLVMBuildCondBr(builder,
-                   icond, ctx->loop_block, endloop);
-
-   LLVMPositionBuilderAtEnd(builder, endloop);
-
-   assert(ctx->loop_stack_size);
-   --ctx->loop_stack_size;
-   mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size].cont_mask;
-   mask->break_mask = ctx->loop_stack[ctx->loop_stack_size].break_mask;
-   ctx->loop_block = ctx->loop_stack[ctx->loop_stack_size].loop_block;
-   ctx->break_var = ctx->loop_stack[ctx->loop_stack_size].break_var;
-   ctx->break_type = ctx->break_type_stack[ctx->loop_stack_size +
-         ctx->switch_stack_size];
-
-   lp_exec_mask_update(mask);
+   enum tgsi_opcode opcode =
+      bld_base->instructions[bld_base->pc + 1].Instruction.Opcode;
+   bool break_always = (opcode == TGSI_OPCODE_ENDSWITCH ||
+                        opcode == TGSI_OPCODE_CASE);
+   lp_exec_break(mask, &bld_base->pc, break_always);
 }
 
 static void lp_exec_switch(struct lp_exec_mask *mask,
@@ -747,34 +366,6 @@ static void lp_exec_default(struct lp_exec_mask *mask,
 }
 
 
-/* stores val into an address pointed to by dst_ptr.
- * mask->exec_mask is used to figure out which bits of val
- * should be stored into the address
- * (0 means don't store this bit, 1 means do store).
- */
-static void lp_exec_mask_store(struct lp_exec_mask *mask,
-                               struct lp_build_context *bld_store,
-                               LLVMValueRef val,
-                               LLVMValueRef dst_ptr)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   LLVMValueRef exec_mask = mask->has_mask ? mask->exec_mask : NULL;
-
-   assert(lp_check_value(bld_store->type, val));
-   assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind);
-   assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val) ||
-          LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(dst_ptr))) == LLVMArrayTypeKind);
-
-   if (exec_mask) {
-      LLVMValueRef res, dst;
-
-      dst = LLVMBuildLoad(builder, dst_ptr, "");
-      res = lp_build_select(bld_store, exec_mask, val, dst);
-      LLVMBuildStore(builder, res, dst_ptr);
-   } else
-      LLVMBuildStore(builder, val, dst_ptr);
-}
-
 static void lp_exec_mask_call(struct lp_exec_mask *mask,
                               int func,
                               int *pc)
@@ -1580,7 +1171,7 @@ emit_fetch_gs_input(
       vertex_index = lp_build_const_int32(gallivm, reg->Dimension.Index);
    }
 
-   res = bld->gs_iface->fetch_input(bld->gs_iface, bld_base,
+   res = bld->gs_iface->fetch_input(bld->gs_iface, &bld_base->base,
                                     reg->Dimension.Indirect,
                                     vertex_index,
                                     reg->Register.Indirect,
@@ -1591,7 +1182,7 @@ emit_fetch_gs_input(
    if (tgsi_type_is_64bit(stype)) {
       LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle_in >> 16);
       LLVMValueRef res2;
-      res2 = bld->gs_iface->fetch_input(bld->gs_iface, bld_base,
+      res2 = bld->gs_iface->fetch_input(bld->gs_iface, &bld_base->base,
                                         reg->Dimension.Indirect,
                                         vertex_index,
                                         reg->Register.Indirect,
@@ -1690,6 +1281,7 @@ emit_fetch_system_value(
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef res;
    enum tgsi_opcode_type atype; // Actual type of the value
+   unsigned swizzle = swizzle_in & 0xffff;
 
    assert(!reg->Register.Indirect);
 
@@ -1714,6 +1306,11 @@ emit_fetch_system_value(
       atype = TGSI_TYPE_UNSIGNED;
       break;
 
+   case TGSI_SEMANTIC_BASEINSTANCE:
+      res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.base_instance);
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
    case TGSI_SEMANTIC_PRIMID:
       res = bld->system_values.prim_id;
       atype = TGSI_TYPE_UNSIGNED;
@@ -1729,6 +1326,25 @@ emit_fetch_system_value(
       atype = TGSI_TYPE_UNSIGNED;
       break;
 
+   case TGSI_SEMANTIC_THREAD_ID:
+      res = LLVMBuildExtractValue(gallivm->builder, bld->system_values.thread_id, swizzle, "");
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_BLOCK_ID:
+      res = lp_build_extract_broadcast(gallivm, lp_type_int_vec(32, 96), bld_base->uint_bld.type, bld->system_values.block_id, lp_build_const_int32(gallivm, swizzle));
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_GRID_SIZE:
+      res = lp_build_extract_broadcast(gallivm, lp_type_int_vec(32, 96), bld_base->uint_bld.type, bld->system_values.grid_size, lp_build_const_int32(gallivm, swizzle));
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_FACE:
+      res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.front_facing);
+      break;
+
    default:
       assert(!"unexpected semantic in emit_fetch_system_value");
       res = bld_base->base.zero;
@@ -2078,7 +1694,8 @@ lp_build_lod_property(
     * constant coords maybe).
     * There's at least hope for sample opcodes as well as size queries.
     */
-   if (reg->Register.File == TGSI_FILE_CONSTANT ||
+   if (inst->Instruction.Opcode == TGSI_OPCODE_TEX_LZ ||
+       reg->Register.File == TGSI_FILE_CONSTANT ||
        reg->Register.File == TGSI_FILE_IMMEDIATE) {
       lod_property = LP_SAMPLER_LOD_SCALAR;
    }
@@ -2203,8 +1820,10 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
    /* Note lod and especially projected are illegal in a LOT of cases */
    if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS ||
        modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
-      if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
-          inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
+      if (inst->Instruction.Opcode == TGSI_OPCODE_TEX_LZ) {
+         lod = bld->bld_base.base.zero;
+      } else if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
+                 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
          /* note that shadow cube array with bias/explicit lod does not exist */
          lod = lp_build_emit_fetch(&bld->bld_base, inst, 1, 0);
       }
@@ -2220,6 +1839,10 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
    }
 
+   if (sampler_op == LP_SAMPLER_OP_GATHER) {
+      uint32_t comp_val = inst->Src[sampler_reg].Register.SwizzleX;
+      sample_key |= (comp_val << LP_SAMPLER_GATHER_COMP_SHIFT);
+   }
    if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) {
       oow = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
       oow = lp_build_rcp(&bld->bld_base.base, oow);
@@ -2557,7 +2180,8 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
    /* always have lod except for buffers and msaa targets ? */
    if (target != TGSI_TEXTURE_BUFFER &&
        target != TGSI_TEXTURE_2D_MSAA &&
-       target != TGSI_TEXTURE_2D_ARRAY_MSAA) {
+       target != TGSI_TEXTURE_2D_ARRAY_MSAA &&
+       inst->Instruction.Opcode != TGSI_OPCODE_TXF_LZ) {
       sample_key |= LP_SAMPLER_LOD_EXPLICIT << LP_SAMPLER_LOD_CONTROL_SHIFT;
       explicit_lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
       lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
@@ -2998,6 +2622,8 @@ lp_emit_declaration_soa(
 
    }
    break;
+   case TGSI_FILE_MEMORY:
+      break;
    default:
       /* don't need to declare other vars */
       break;
@@ -3476,12 +3102,46 @@ load_emit(
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
    const struct tgsi_full_src_register *bufreg = &emit_data->inst->Src[0];
    unsigned buf = bufreg->Register.Index;
-   assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE);
+   assert(bufreg->Register.File == TGSI_FILE_BUFFER ||
+          bufreg->Register.File == TGSI_FILE_IMAGE ||
+          bufreg->Register.File == TGSI_FILE_MEMORY ||
+          bufreg->Register.File == TGSI_FILE_CONSTBUF);
+   bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY;
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
 
-   if (bufreg->Register.File == TGSI_FILE_IMAGE)
+   if (bufreg->Register.File == TGSI_FILE_IMAGE) {
       img_load_emit(action, bld_base, emit_data);
-   else if (0) {
+   } else if (bufreg->Register.File == TGSI_FILE_CONSTBUF) {
+      LLVMValueRef consts_ptr = bld->consts[buf];
+      LLVMValueRef num_consts = bld->consts_sizes[buf];
+
+      LLVMValueRef indirect_index;
+      LLVMValueRef overflow_mask;
+
+      indirect_index = lp_build_emit_fetch(bld_base, emit_data->inst, 1, 0);
+      indirect_index = lp_build_shr_imm(uint_bld, indirect_index, 4);
+
+      /* All fetches are from the same constant buffer, so
+       * we need to propagate the size to a vector to do a
+       * vector comparison */
+      num_consts = lp_build_broadcast_scalar(uint_bld, num_consts);
+
+      /* Gather values from the constant buffer */
+      unsigned chan_index;
+      TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) {
+         /* Construct a boolean vector telling us which channels
+          * overflow the bound constant buffer */
+         overflow_mask = lp_build_compare(gallivm, uint_bld->type, PIPE_FUNC_GEQUAL,
+                                          indirect_index, num_consts);
+
+         /* index_vec = indirect_index * 4 */
+         LLVMValueRef index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec = lp_build_add(uint_bld, index_vec,
+                                  lp_build_const_int_vec(gallivm, uint_bld->type, chan_index));
+
+         emit_data->output[chan_index] = build_gather(bld_base, consts_ptr, index_vec, overflow_mask, NULL);
+      }
+   } else if (0) {
       /* for indirect support with ARB_gpu_shader5 */
    } else {
       LLVMValueRef index;
@@ -3491,19 +3151,23 @@ load_emit(
       index = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, 0);
       index = lp_build_shr_imm(uint_bld, index, 2);
 
-      scalar_ptr = bld->ssbos[buf];
+      scalar_ptr = is_shared ? bld->shared_ptr : bld->ssbos[buf];
 
       LLVMValueRef ssbo_limit;
 
-      ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
-      ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      if (!is_shared) {
+         ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
+         ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      }
 
       TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) {
          LLVMValueRef loop_index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, chan_index));
 
          LLVMValueRef exec_mask = mask_vec(bld_base);
-         LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit);
-         exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+         if (!is_shared) {
+            LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit);
+            exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+         }
 
          LLVMValueRef result = lp_build_alloca(gallivm, uint_bld->vec_type, "");
          struct lp_build_loop_state loop_state;
@@ -3590,7 +3254,8 @@ store_emit(
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
    const struct tgsi_full_dst_register *bufreg = &emit_data->inst->Dst[0];
    unsigned buf = bufreg->Register.Index;
-   assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE);
+   assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE || bufreg->Register.File == TGSI_FILE_MEMORY);
+   bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY;
 
    if (bufreg->Register.File == TGSI_FILE_IMAGE) {
       img_store_emit(action, bld_base, emit_data);
@@ -3605,12 +3270,14 @@ store_emit(
       index = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, 0);
       index = lp_build_shr_imm(uint_bld, index, 2);
 
-      scalar_ptr = bld->ssbos[buf];
+      scalar_ptr = is_shared ? bld->shared_ptr : bld->ssbos[buf];
 
       LLVMValueRef ssbo_limit;
 
-      ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
-      ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      if (!is_shared) {
+         ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
+         ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      }
 
       TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) {
          LLVMValueRef loop_index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, chan_index));
@@ -3618,8 +3285,10 @@ store_emit(
          value = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, chan_index);
 
          LLVMValueRef exec_mask = mask_vec(bld_base);
-         LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit);
-         exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+         if (!is_shared) {
+            LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit);
+            exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+         }
 
          struct lp_build_loop_state loop_state;
          lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
@@ -3740,8 +3409,9 @@ atomic_emit(
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
    const struct tgsi_full_src_register *bufreg = &emit_data->inst->Src[0];
 
-   assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE);
+   assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE || bufreg->Register.File == TGSI_FILE_MEMORY);
    unsigned buf = bufreg->Register.Index;
+   bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY;
 
    LLVMAtomicRMWBinOp op;
    switch (emit_data->inst->Instruction.Opcode) {
@@ -3791,20 +3461,28 @@ atomic_emit(
       value = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 2, 0);
 
       index = lp_build_shr_imm(uint_bld, index, 2);
-      index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, emit_data->chan));
 
-      scalar_ptr = bld->ssbos[buf];
+      if (!is_shared) {
+         index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, emit_data->chan));
+         scalar_ptr = bld->ssbos[buf];
+      } else
+         scalar_ptr = bld->shared_ptr;
 
       LLVMValueRef atom_res = lp_build_alloca(gallivm,
                                               uint_bld->vec_type, "");
 
       LLVMValueRef ssbo_limit;
-      ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
-      ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      if (!is_shared) {
+         ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
+         ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      }
 
       LLVMValueRef exec_mask = mask_vec(bld_base);
-      LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, index, ssbo_limit);
-      exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+
+      if (!is_shared) {
+         LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, index, ssbo_limit);
+         exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+      }
 
       struct lp_build_loop_state loop_state;
       lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
@@ -3858,6 +3536,31 @@ atomic_emit(
    }
 }
 
+static void
+barrier_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct gallivm_state * gallivm = bld_base->base.gallivm;
+
+   LLVMBasicBlockRef resume = lp_build_insert_new_block(gallivm, "resume");
+
+   lp_build_coro_suspend_switch(gallivm, bld->coro, resume, false);
+   LLVMPositionBuilderAtEnd(gallivm->builder, resume);
+}
+
+static void
+membar_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMBuildFence(builder, LLVMAtomicOrderingSequentiallyConsistent, false, "");
+}
+
 static void
 increment_vec_ptr_by_mask(struct lp_build_tgsi_context * bld_base,
                           LLVMValueRef ptr,
@@ -3911,15 +3614,18 @@ emit_vertex(
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
 
    if (bld->gs_iface->emit_vertex) {
+      uint32_t imms_idx = emit_data->inst->Src[0].Register.SwizzleX;
+      LLVMValueRef stream_id = bld->immediates[0][imms_idx];
       LLVMValueRef mask = mask_vec(bld_base);
       LLVMValueRef total_emitted_vertices_vec =
          LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, "");
       mask = clamp_mask_to_max_output_vertices(bld, mask,
                                                total_emitted_vertices_vec);
       gather_outputs(bld);
-      bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base,
+      bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base.base,
                                  bld->outputs,
-                                 total_emitted_vertices_vec);
+                                 total_emitted_vertices_vec,
+                                 stream_id);
       increment_vec_ptr_by_mask(bld_base, bld->emitted_vertices_vec_ptr,
                                 mask);
       increment_vec_ptr_by_mask(bld_base, bld->total_emitted_vertices_vec_ptr,
@@ -3949,7 +3655,8 @@ end_primitive_masked(struct lp_build_tgsi_context * bld_base,
          LLVMBuildLoad(builder, bld->emitted_vertices_vec_ptr, "");
       LLVMValueRef emitted_prims_vec =
          LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, "");
-
+      LLVMValueRef total_emitted_vertices_vec =
+         LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, "");
       LLVMValueRef emitted_mask = lp_build_cmp(uint_bld, PIPE_FUNC_NOTEQUAL,
                                                emitted_vertices_vec,
                                                uint_bld->zero);
@@ -3959,9 +3666,11 @@ end_primitive_masked(struct lp_build_tgsi_context * bld_base,
          executes only on the paths that have unflushed vertices */
       mask = LLVMBuildAnd(builder, mask, emitted_mask, "");
 
-      bld->gs_iface->end_primitive(bld->gs_iface, &bld->bld_base,
+      bld->gs_iface->end_primitive(bld->gs_iface, &bld->bld_base.base,
+                                   total_emitted_vertices_vec,
                                    emitted_vertices_vec,
-                                   emitted_prims_vec);
+                                   emitted_prims_vec,
+                                   mask_vec(bld_base));
 
 #if DUMP_GS_EMITS
       lp_build_print_value(bld->bld_base.base.gallivm,
@@ -4034,7 +3743,7 @@ brk_emit(
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
-   lp_exec_break(&bld->exec_mask, bld_base);
+   lp_exec_tgsi_break(&bld->exec_mask, bld_base);
 }
 
 static void
@@ -4118,7 +3827,7 @@ bgnloop_emit(
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
-   lp_exec_bgnloop(&bld->exec_mask);
+   lp_exec_bgnloop(&bld->exec_mask, true);
 }
 
 static void
@@ -4304,7 +4013,6 @@ static void emit_epilogue(struct lp_build_tgsi_context * bld_base)
          LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, "");
 
       bld->gs_iface->gs_epilogue(bld->gs_iface,
-                                 &bld->bld_base,
                                  total_emitted_vertices_vec,
                                  emitted_prims_vec);
    } else {
@@ -4365,6 +4073,8 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.context_ptr = params->context_ptr;
    bld.thread_data_ptr = params->thread_data_ptr;
    bld.image = params->image;
+   bld.shared_ptr = params->shared_ptr;
+   bld.coro = params->coro;
 
    /*
     * If the number of temporaries is rather large then we just
@@ -4428,9 +4138,11 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.op_actions[TGSI_OPCODE_TXB].emit = txb_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXD].emit = txd_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXL].emit = txl_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TEX_LZ].emit = txl_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXP].emit = txp_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXF].emit = txf_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TXF_LZ].emit = txf_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TEX2].emit = tex2_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXB2].emit = txb2_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXL2].emit = txl2_emit;
@@ -4464,6 +4176,9 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.op_actions[TGSI_OPCODE_ATOMIMIN].emit = atomic_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_ATOMIMAX].emit = atomic_emit;
 
+   bld.bld_base.op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_BARRIER].emit = barrier_emit;
+
    if (params->gs_iface) {
       /* There's no specific value for this because it should always
        * be set, but apps using ext_geometry_shader4 quite often