gallivm: add base instance sysval support
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_tgsi_soa.c
index e5d0293b8f9cb53a6f7ee84d8ad720797d84a673..a4606a67615fa7bc7036b733c5dd862f60daf01e 100644 (file)
@@ -41,6 +41,7 @@
 #include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_info.h"
 #include "lp_bld_gather.h"
 #include "lp_bld_init.h"
 #include "lp_bld_logic.h"
+#include "lp_bld_misc.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_coro.h"
 #include "lp_bld_quad.h"
 #include "lp_bld_tgsi.h"
 #include "lp_bld_limits.h"
 #include "lp_bld_sample.h"
 #include "lp_bld_struct.h"
 
-/* SM 4.0 says that subroutines can nest 32 deep and 
- * we need one more for our main function */
-#define LP_MAX_NUM_FUNCS 33
-
 #define DUMP_GS_EMITS 0
 
 /*
@@ -95,17 +94,13 @@ emit_dump_reg(struct gallivm_state *gallivm,
 {
    char buf[32];
 
-   util_snprintf(buf, sizeof buf, "    %s[%u].%c = ",
-                 tgsi_file_name(file),
-                 index, "xyzw"[chan]);
+   snprintf(buf, sizeof buf, "    %s[%u].%c = ",
+            tgsi_file_name(file),
+            index, "xyzw"[chan]);
 
    lp_build_print_value(gallivm, buf, value);
 }
 
-/*
- * Return the context for the current function.
- * (always 'main', if shader doesn't do any function calls)
- */
 static inline struct function_ctx *
 func_ctx(struct lp_exec_mask *mask)
 {
@@ -115,386 +110,32 @@ func_ctx(struct lp_exec_mask *mask)
 }
 
 /*
- * Returns true if we're in a loop.
- * It's global, meaning that it returns true even if there's
- * no loop inside the current function, but we were inside
- * a loop inside another function, from which this one was called.
+ * combine the execution mask if there is one with the current mask.
  */
-static inline boolean
-mask_has_loop(struct lp_exec_mask *mask)
-{
-   int i;
-   for (i = mask->function_stack_size - 1; i >= 0; --i) {
-      const struct function_ctx *ctx = &mask->function_stack[i];
-      if (ctx->loop_stack_size > 0)
-         return TRUE;
-   }
-   return FALSE;
-}
-
-/*
- * Returns true if we're inside a switch statement.
- * It's global, meaning that it returns true even if there's
- * no switch in the current function, but we were inside
- * a switch inside another function, from which this one was called.
- */
-static inline boolean
-mask_has_switch(struct lp_exec_mask *mask)
-{
-   int i;
-   for (i = mask->function_stack_size - 1; i >= 0; --i) {
-      const struct function_ctx *ctx = &mask->function_stack[i];
-      if (ctx->switch_stack_size > 0)
-         return TRUE;
-   }
-   return FALSE;
-}
-
-/*
- * Returns true if we're inside a conditional.
- * It's global, meaning that it returns true even if there's
- * no conditional in the current function, but we were inside
- * a conditional inside another function, from which this one was called.
- */
-static inline boolean
-mask_has_cond(struct lp_exec_mask *mask)
-{
-   int i;
-   for (i = mask->function_stack_size - 1; i >= 0; --i) {
-      const struct function_ctx *ctx = &mask->function_stack[i];
-      if (ctx->cond_stack_size > 0)
-         return TRUE;
-   }
-   return FALSE;
-}
-
-
-/*
- * Initialize a function context at the specified index.
- */
-static void
-lp_exec_mask_function_init(struct lp_exec_mask *mask, int function_idx)
-{
-   LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context);
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx =  &mask->function_stack[function_idx];
-
-   ctx->cond_stack_size = 0;
-   ctx->loop_stack_size = 0;
-   ctx->switch_stack_size = 0;
-
-   if (function_idx == 0) {
-      ctx->ret_mask = mask->ret_mask;
-   }
-
-   ctx->loop_limiter = lp_build_alloca(mask->bld->gallivm,
-                                       int_type, "looplimiter");
-   LLVMBuildStore(
-      builder,
-      LLVMConstInt(int_type, LP_MAX_TGSI_LOOP_ITERATIONS, false),
-      ctx->loop_limiter);
-}
-
-static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
-{
-   mask->bld = bld;
-   mask->has_mask = FALSE;
-   mask->ret_in_main = FALSE;
-   /* For the main function */
-   mask->function_stack_size = 1;
-
-   mask->int_vec_type = lp_build_int_vec_type(bld->gallivm, mask->bld->type);
-   mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask =
-         mask->cond_mask = mask->switch_mask =
-         LLVMConstAllOnes(mask->int_vec_type);
-
-   mask->function_stack = CALLOC(LP_MAX_NUM_FUNCS,
-                                 sizeof(mask->function_stack[0]));
-   lp_exec_mask_function_init(mask, 0);
-}
-
-static void
-lp_exec_mask_fini(struct lp_exec_mask *mask)
-{
-   FREE(mask->function_stack);
-}
-
-static void lp_exec_mask_update(struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   boolean has_loop_mask = mask_has_loop(mask);
-   boolean has_cond_mask = mask_has_cond(mask);
-   boolean has_switch_mask = mask_has_switch(mask);
-   boolean has_ret_mask = mask->function_stack_size > 1 ||
-         mask->ret_in_main;
-
-   if (has_loop_mask) {
-      /*for loops we need to update the entire mask at runtime */
-      LLVMValueRef tmp;
-      assert(mask->break_mask);
-      tmp = LLVMBuildAnd(builder,
-                         mask->cont_mask,
-                         mask->break_mask,
-                         "maskcb");
-      mask->exec_mask = LLVMBuildAnd(builder,
-                                     mask->cond_mask,
-                                     tmp,
-                                     "maskfull");
-   } else
-      mask->exec_mask = mask->cond_mask;
-
-   if (has_switch_mask) {
-      mask->exec_mask = LLVMBuildAnd(builder,
-                                     mask->exec_mask,
-                                     mask->switch_mask,
-                                     "switchmask");
-   }
-
-   if (has_ret_mask) {
-      mask->exec_mask = LLVMBuildAnd(builder,
-                                     mask->exec_mask,
-                                     mask->ret_mask,
-                                     "callmask");
-   }
-
-   mask->has_mask = (has_cond_mask ||
-                     has_loop_mask ||
-                     has_switch_mask ||
-                     has_ret_mask);
-}
-
-static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
-                                   LLVMValueRef val)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx = func_ctx(mask);
-
-   if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING) {
-      ctx->cond_stack_size++;
-      return;
-   }
-   if (ctx->cond_stack_size == 0 && mask->function_stack_size == 1) {
-      assert(mask->cond_mask == LLVMConstAllOnes(mask->int_vec_type));
-   }
-   ctx->cond_stack[ctx->cond_stack_size++] = mask->cond_mask;
-   assert(LLVMTypeOf(val) == mask->int_vec_type);
-   mask->cond_mask = LLVMBuildAnd(builder,
-                                  mask->cond_mask,
-                                  val,
-                                  "");
-   lp_exec_mask_update(mask);
-}
-
-static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx = func_ctx(mask);
-   LLVMValueRef prev_mask;
-   LLVMValueRef inv_mask;
-
-   assert(ctx->cond_stack_size);
-   if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING)
-      return;
-   prev_mask = ctx->cond_stack[ctx->cond_stack_size - 1];
-   if (ctx->cond_stack_size == 1 && mask->function_stack_size == 1) {
-      assert(prev_mask == LLVMConstAllOnes(mask->int_vec_type));
-   }
-
-   inv_mask = LLVMBuildNot(builder, mask->cond_mask, "");
-
-   mask->cond_mask = LLVMBuildAnd(builder,
-                                  inv_mask,
-                                  prev_mask, "");
-   lp_exec_mask_update(mask);
-}
-
-static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
-{
-   struct function_ctx *ctx = func_ctx(mask);
-   assert(ctx->cond_stack_size);
-   --ctx->cond_stack_size;
-   if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING)
-      return;
-   mask->cond_mask = ctx->cond_stack[ctx->cond_stack_size];
-   lp_exec_mask_update(mask);
-}
-
-static void lp_exec_bgnloop(struct lp_exec_mask *mask)
+static LLVMValueRef
+mask_vec(struct lp_build_tgsi_context *bld_base)
 {
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx = func_ctx(mask);
-
-   if (ctx->loop_stack_size >= LP_MAX_TGSI_NESTING) {
-      ++ctx->loop_stack_size;
-      return;
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   struct lp_exec_mask *exec_mask = &bld->exec_mask;
+   LLVMValueRef bld_mask = bld->mask ? lp_build_mask_value(bld->mask) : NULL;
+   if (!exec_mask->has_mask) {
+      return bld_mask;
    }
-
-   ctx->break_type_stack[ctx->loop_stack_size + ctx->switch_stack_size] =
-      ctx->break_type;
-   ctx->break_type = LP_EXEC_MASK_BREAK_TYPE_LOOP;
-
-   ctx->loop_stack[ctx->loop_stack_size].loop_block = ctx->loop_block;
-   ctx->loop_stack[ctx->loop_stack_size].cont_mask = mask->cont_mask;
-   ctx->loop_stack[ctx->loop_stack_size].break_mask = mask->break_mask;
-   ctx->loop_stack[ctx->loop_stack_size].break_var = ctx->break_var;
-   ++ctx->loop_stack_size;
-
-   ctx->break_var = lp_build_alloca(mask->bld->gallivm, mask->int_vec_type, "");
-   LLVMBuildStore(builder, mask->break_mask, ctx->break_var);
-
-   ctx->loop_block = lp_build_insert_new_block(mask->bld->gallivm, "bgnloop");
-
-   LLVMBuildBr(builder, ctx->loop_block);
-   LLVMPositionBuilderAtEnd(builder, ctx->loop_block);
-
-   mask->break_mask = LLVMBuildLoad(builder, ctx->break_var, "");
-
-   lp_exec_mask_update(mask);
+   if (!bld_mask)
+      return exec_mask->exec_mask;
+   return LLVMBuildAnd(builder, lp_build_mask_value(bld->mask),
+                       exec_mask->exec_mask, "");
 }
 
-static void lp_exec_break(struct lp_exec_mask *mask,
+static void lp_exec_tgsi_break(struct lp_exec_mask *mask,
                           struct lp_build_tgsi_context * bld_base)
 {
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx = func_ctx(mask);
-
-   if (ctx->break_type == LP_EXEC_MASK_BREAK_TYPE_LOOP) {
-      LLVMValueRef exec_mask = LLVMBuildNot(builder,
-                                            mask->exec_mask,
-                                            "break");
-
-      mask->break_mask = LLVMBuildAnd(builder,
-                                      mask->break_mask,
-                                      exec_mask, "break_full");
-   }
-   else {
-      unsigned opcode = bld_base->instructions[bld_base->pc + 1].Instruction.Opcode;
-      boolean break_always = (opcode == TGSI_OPCODE_ENDSWITCH ||
-                              opcode == TGSI_OPCODE_CASE);
-
-
-      if (ctx->switch_in_default) {
-         /*
-          * stop default execution but only if this is an unconditional switch.
-          * (The condition here is not perfect since dead code after break is
-          * allowed but should be sufficient since false negatives are just
-          * unoptimized - so we don't have to pre-evaluate that).
-          */
-         if(break_always && ctx->switch_pc) {
-            bld_base->pc = ctx->switch_pc;
-            return;
-         }
-      }
-
-      if (break_always) {
-         mask->switch_mask = LLVMConstNull(mask->bld->int_vec_type);
-      }
-      else {
-         LLVMValueRef exec_mask = LLVMBuildNot(builder,
-                                               mask->exec_mask,
-                                               "break");
-         mask->switch_mask = LLVMBuildAnd(builder,
-                                          mask->switch_mask,
-                                          exec_mask, "break_switch");
-      }
-   }
-
-   lp_exec_mask_update(mask);
-}
-
-static void lp_exec_continue(struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   LLVMValueRef exec_mask = LLVMBuildNot(builder,
-                                         mask->exec_mask,
-                                         "");
-
-   mask->cont_mask = LLVMBuildAnd(builder,
-                                  mask->cont_mask,
-                                  exec_mask, "");
-
-   lp_exec_mask_update(mask);
-}
-
-
-static void lp_exec_endloop(struct gallivm_state *gallivm,
-                            struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   struct function_ctx *ctx = func_ctx(mask);
-   LLVMBasicBlockRef endloop;
-   LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context);
-   LLVMTypeRef reg_type = LLVMIntTypeInContext(gallivm->context,
-                                               mask->bld->type.width *
-                                               mask->bld->type.length);
-   LLVMValueRef i1cond, i2cond, icond, limiter;
-
-   assert(mask->break_mask);
-
-   
-   assert(ctx->loop_stack_size);
-   if (ctx->loop_stack_size > LP_MAX_TGSI_NESTING) {
-      --ctx->loop_stack_size;
-      return;
-   }
-
-   /*
-    * Restore the cont_mask, but don't pop
-    */
-   mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size - 1].cont_mask;
-   lp_exec_mask_update(mask);
-
-   /*
-    * Unlike the continue mask, the break_mask must be preserved across loop
-    * iterations
-    */
-   LLVMBuildStore(builder, mask->break_mask, ctx->break_var);
-
-   /* Decrement the loop limiter */
-   limiter = LLVMBuildLoad(builder, ctx->loop_limiter, "");
-
-   limiter = LLVMBuildSub(
-      builder,
-      limiter,
-      LLVMConstInt(int_type, 1, false),
-      "");
-
-   LLVMBuildStore(builder, limiter, ctx->loop_limiter);
-
-   /* i1cond = (mask != 0) */
-   i1cond = LLVMBuildICmp(
-      builder,
-      LLVMIntNE,
-      LLVMBuildBitCast(builder, mask->exec_mask, reg_type, ""),
-      LLVMConstNull(reg_type), "i1cond");
-
-   /* i2cond = (looplimiter > 0) */
-   i2cond = LLVMBuildICmp(
-      builder,
-      LLVMIntSGT,
-      limiter,
-      LLVMConstNull(int_type), "i2cond");
-
-   /* if( i1cond && i2cond ) */
-   icond = LLVMBuildAnd(builder, i1cond, i2cond, "");
-
-   endloop = lp_build_insert_new_block(mask->bld->gallivm, "endloop");
-
-   LLVMBuildCondBr(builder,
-                   icond, ctx->loop_block, endloop);
-
-   LLVMPositionBuilderAtEnd(builder, endloop);
-
-   assert(ctx->loop_stack_size);
-   --ctx->loop_stack_size;
-   mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size].cont_mask;
-   mask->break_mask = ctx->loop_stack[ctx->loop_stack_size].break_mask;
-   ctx->loop_block = ctx->loop_stack[ctx->loop_stack_size].loop_block;
-   ctx->break_var = ctx->loop_stack[ctx->loop_stack_size].break_var;
-   ctx->break_type = ctx->break_type_stack[ctx->loop_stack_size +
-         ctx->switch_stack_size];
-
-   lp_exec_mask_update(mask);
+   enum tgsi_opcode opcode =
+      bld_base->instructions[bld_base->pc + 1].Instruction.Opcode;
+   bool break_always = (opcode == TGSI_OPCODE_ENDSWITCH ||
+                        opcode == TGSI_OPCODE_CASE);
+   lp_exec_break(mask, &bld_base->pc, break_always);
 }
 
 static void lp_exec_switch(struct lp_exec_mask *mask,
@@ -630,7 +271,7 @@ static boolean default_analyse_is_last(struct lp_exec_mask *mask,
    }
 
    while (pc != ~0u && pc < bld_base->num_instructions) {
-      unsigned opcode = bld_base->instructions[pc].Instruction.Opcode;
+      enum tgsi_opcode opcode = bld_base->instructions[pc].Instruction.Opcode;
       switch (opcode) {
       case TGSI_OPCODE_CASE:
          if (curr_switch_stack == ctx->switch_stack_size) {
@@ -648,6 +289,8 @@ static boolean default_analyse_is_last(struct lp_exec_mask *mask,
          }
          curr_switch_stack--;
          break;
+      default:
+         ; /* nothing */
       }
       pc++;
    }
@@ -700,7 +343,8 @@ static void lp_exec_default(struct lp_exec_mask *mask,
        * which just gets rid of all case statements appearing together with
        * default (or could do switch analysis at switch start time instead).
        */
-      unsigned opcode = bld_base->instructions[bld_base->pc - 1].Instruction.Opcode;
+      enum tgsi_opcode opcode =
+         bld_base->instructions[bld_base->pc - 1].Instruction.Opcode;
       boolean ft_into = (opcode != TGSI_OPCODE_BRK &&
                          opcode != TGSI_OPCODE_SWITCH);
       /*
@@ -722,33 +366,6 @@ static void lp_exec_default(struct lp_exec_mask *mask,
 }
 
 
-/* stores val into an address pointed to by dst_ptr.
- * mask->exec_mask is used to figure out which bits of val
- * should be stored into the address
- * (0 means don't store this bit, 1 means do store).
- */
-static void lp_exec_mask_store(struct lp_exec_mask *mask,
-                               struct lp_build_context *bld_store,
-                               LLVMValueRef val,
-                               LLVMValueRef dst_ptr)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   LLVMValueRef exec_mask = mask->has_mask ? mask->exec_mask : NULL;
-
-   assert(lp_check_value(bld_store->type, val));
-   assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind);
-   assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val));
-
-   if (exec_mask) {
-      LLVMValueRef res, dst;
-
-      dst = LLVMBuildLoad(builder, dst_ptr, "");
-      res = lp_build_select(bld_store, exec_mask, val, dst);
-      LLVMBuildStore(builder, res, dst_ptr);
-   } else
-      LLVMBuildStore(builder, val, dst_ptr);
-}
-
 static void lp_exec_mask_call(struct lp_exec_mask *mask,
                               int func,
                               int *pc)
@@ -848,7 +465,14 @@ get_file_ptr(struct lp_build_tgsi_soa_context *bld,
 
    if (bld->indirect_files & (1 << file)) {
       LLVMValueRef lindex = lp_build_const_int32(bld->bld_base.base.gallivm, index * 4 + chan);
-      return LLVMBuildGEP(builder, var_of_array, &lindex, 1, "");
+      if (LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(var_of_array))) == LLVMArrayTypeKind) {
+         LLVMValueRef gep[2];
+         gep[0] = lp_build_const_int32(bld->bld_base.base.gallivm, 0);
+         gep[1] = lindex;
+         return LLVMBuildGEP(builder, var_of_array, gep, 2, "");
+      } else {
+         return LLVMBuildGEP(builder, var_of_array, &lindex, 1, "");
+      }
    }
    else {
       assert(index <= bld->bld_base.info->file_max[file]);
@@ -1047,7 +671,8 @@ emit_mask_scatter(struct lp_build_tgsi_soa_context *bld,
 static LLVMValueRef
 get_indirect_index(struct lp_build_tgsi_soa_context *bld,
                    unsigned reg_file, unsigned reg_index,
-                   const struct tgsi_ind_register *indirect_reg)
+                   const struct tgsi_ind_register *indirect_reg,
+                   int index_limit)
 {
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
    struct lp_build_context *uint_bld = &bld->bld_base.uint_bld;
@@ -1094,9 +719,9 @@ get_indirect_index(struct lp_build_tgsi_soa_context *bld,
     * larger than the declared size but smaller than the buffer size.
     */
    if (reg_file != TGSI_FILE_CONSTANT) {
+      assert(index_limit >= 0);
       max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm,
-                                         uint_bld->type,
-                                         bld->bld_base.info->file_max[reg_file]);
+                                         uint_bld->type, index_limit);
 
       assert(!uint_bld->type.sign);
       index = lp_build_min(uint_bld, index, max_index);
@@ -1178,7 +803,7 @@ emit_fetch_constant(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld_base->base.gallivm;
@@ -1188,6 +813,7 @@ emit_fetch_constant(
    LLVMValueRef consts_ptr;
    LLVMValueRef num_consts;
    LLVMValueRef res;
+   unsigned swizzle = swizzle_in & 0xffff;
 
    /* XXX: Handle fetching xyzw components as a vector */
    assert(swizzle != ~0u);
@@ -1212,7 +838,8 @@ emit_fetch_constant(
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
                                           reg->Register.Index,
-                                          &reg->Indirect);
+                                          &reg->Indirect,
+                                          bld->bld_base.info->file_max[reg->Register.File]);
 
       /* All fetches are from the same constant buffer, so
        * we need to propagate the size to a vector to do a
@@ -1229,7 +856,7 @@ emit_fetch_constant(
 
       if (tgsi_type_is_64bit(stype)) {
          LLVMValueRef swizzle_vec2;
-         swizzle_vec2 = lp_build_const_int_vec(gallivm, uint_bld->type, swizzle + 1);
+         swizzle_vec2 = lp_build_const_int_vec(gallivm, uint_bld->type, swizzle_in >> 16);
          index_vec2 = lp_build_shl_imm(uint_bld, indirect_index, 2);
          index_vec2 = lp_build_add(uint_bld, index_vec2, swizzle_vec2);
       }
@@ -1244,21 +871,42 @@ emit_fetch_constant(
 
       scalar_ptr = LLVMBuildGEP(builder, consts_ptr,
                                 &index, 1, "");
-      if (stype == TGSI_TYPE_DOUBLE) {
-         LLVMTypeRef dptr_type = LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0);
-         scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, dptr_type, "");
-         bld_broad = &bld_base->dbl_bld;
-      } else if (stype == TGSI_TYPE_UNSIGNED64) {
-         LLVMTypeRef u64ptr_type = LLVMPointerType(LLVMInt64TypeInContext(gallivm->context), 0);
-         scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, u64ptr_type, "");
-         bld_broad = &bld_base->uint64_bld;
-      } else if (stype == TGSI_TYPE_SIGNED64) {
-         LLVMTypeRef i64ptr_type = LLVMPointerType(LLVMInt64TypeInContext(gallivm->context), 0);
-         scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, i64ptr_type, "");
-         bld_broad = &bld_base->int64_bld;
+
+      if (tgsi_type_is_64bit(stype) && ((swizzle_in >> 16) != swizzle + 1)) {
+
+         LLVMValueRef scalar2, scalar2_ptr;
+         LLVMValueRef shuffles[2];
+         index = lp_build_const_int32(gallivm, reg->Register.Index * 4 + (swizzle_in >> 16));
+
+         scalar2_ptr = LLVMBuildGEP(builder, consts_ptr,
+                                    &index, 1, "");
+
+         scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+         scalar2 = LLVMBuildLoad(builder, scalar2_ptr, "");
+         shuffles[0] = lp_build_const_int32(gallivm, 0);
+         shuffles[1] = lp_build_const_int32(gallivm, 1);
+
+         res = LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2));
+         res = LLVMBuildInsertElement(builder, res, scalar, shuffles[0], "");
+         res = LLVMBuildInsertElement(builder, res, scalar2, shuffles[1], "");
+      } else {
+        if (stype == TGSI_TYPE_DOUBLE) {
+           LLVMTypeRef dptr_type = LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0);
+           scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, dptr_type, "");
+           bld_broad = &bld_base->dbl_bld;
+        } else if (stype == TGSI_TYPE_UNSIGNED64) {
+           LLVMTypeRef u64ptr_type = LLVMPointerType(LLVMInt64TypeInContext(gallivm->context), 0);
+           scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, u64ptr_type, "");
+           bld_broad = &bld_base->uint64_bld;
+        } else if (stype == TGSI_TYPE_SIGNED64) {
+           LLVMTypeRef i64ptr_type = LLVMPointerType(LLVMInt64TypeInContext(gallivm->context), 0);
+           scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, i64ptr_type, "");
+           bld_broad = &bld_base->int64_bld;
+        }
+        scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+        res = lp_build_broadcast_scalar(bld_broad, scalar);
       }
-      scalar = LLVMBuildLoad(builder, scalar_ptr, "");
-      res = lp_build_broadcast_scalar(bld_broad, scalar);
+
    }
 
    if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE || stype == TGSI_TYPE_SIGNED64 || stype == TGSI_TYPE_UNSIGNED64) {
@@ -1272,9 +920,9 @@ emit_fetch_constant(
 /**
  * Fetch 64-bit values from two separate channels.
  * 64-bit values are stored split across two channels, like xy and zw.
- * This function creates a set of 16 floats,
+ * This function creates a set of vec_length*2 floats,
  * extracts the values from the two channels,
- * puts them in the correct place, then casts to 8 64-bits.
+ * puts them in the correct place, then casts to vec_length 64-bits.
  */
 static LLVMValueRef
 emit_fetch_64bit(
@@ -1289,9 +937,9 @@ emit_fetch_64bit(
    LLVMValueRef res;
    struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
    int i;
-   LLVMValueRef shuffles[16];
+   LLVMValueRef shuffles[2 * (LP_MAX_VECTOR_WIDTH/32)];
    int len = bld_base->base.type.length * 2;
-   assert(len <= 16);
+   assert(len <= (2 * (LP_MAX_VECTOR_WIDTH/32)));
 
    for (i = 0; i < bld_base->base.type.length * 2; i+=2) {
       shuffles[i] = lp_build_const_int32(gallivm, i / 2);
@@ -1307,12 +955,13 @@ emit_fetch_immediate(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef res = NULL;
+   unsigned swizzle = swizzle_in & 0xffff;
 
    if (bld->use_immediates_array || reg->Register.Indirect) {
       LLVMValueRef imms_array;
@@ -1329,7 +978,8 @@ emit_fetch_immediate(
          indirect_index = get_indirect_index(bld,
                                              reg->Register.File,
                                              reg->Register.Index,
-                                             &reg->Indirect);
+                                             &reg->Indirect,
+                                             bld->bld_base.info->file_max[reg->Register.File]);
          /*
           * Unlike for other reg classes, adding pixel offsets is unnecessary -
           * immediates are stored as full vectors (FIXME??? - might be better
@@ -1343,26 +993,25 @@ emit_fetch_immediate(
          if (tgsi_type_is_64bit(stype))
             index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
                                               indirect_index,
-                                              swizzle + 1,
+                                              swizzle_in >> 16,
                                               FALSE);
          /* Gather values from the immediate register array */
          res = build_gather(bld_base, imms_array, index_vec, NULL, index_vec2);
       } else {
-         LLVMValueRef lindex = lp_build_const_int32(gallivm,
-                                        reg->Register.Index * 4 + swizzle);
-         LLVMValueRef imms_ptr =  LLVMBuildGEP(builder,
-                                                bld->imms_array, &lindex, 1, "");
+         LLVMValueRef gep[2];
+         gep[0] = lp_build_const_int32(gallivm, 0);
+         gep[1] = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle);
+         LLVMValueRef imms_ptr = LLVMBuildGEP(builder,
+                                              bld->imms_array, gep, 2, "");
          res = LLVMBuildLoad(builder, imms_ptr, "");
 
          if (tgsi_type_is_64bit(stype)) {
-            LLVMValueRef lindex1;
             LLVMValueRef imms_ptr2;
             LLVMValueRef res2;
-
-            lindex1 = lp_build_const_int32(gallivm,
-                                           reg->Register.Index * 4 + swizzle + 1);
+            gep[1] = lp_build_const_int32(gallivm,
+                                          reg->Register.Index * 4 + (swizzle_in >> 16));
             imms_ptr2 = LLVMBuildGEP(builder,
-                                      bld->imms_array, &lindex1, 1, "");
+                                     bld->imms_array, gep, 2, "");
             res2 = LLVMBuildLoad(builder, imms_ptr2, "");
             res = emit_fetch_64bit(bld_base, stype, res, res2);
          }
@@ -1371,7 +1020,7 @@ emit_fetch_immediate(
    else {
       res = bld->immediates[reg->Register.Index][swizzle];
       if (tgsi_type_is_64bit(stype))
-         res = emit_fetch_64bit(bld_base, stype, res, bld->immediates[reg->Register.Index][swizzle + 1]);
+         res = emit_fetch_64bit(bld_base, stype, res, bld->immediates[reg->Register.Index][swizzle_in >> 16]);
    }
 
    if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || tgsi_type_is_64bit(stype)) {
@@ -1386,12 +1035,13 @@ emit_fetch_input(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef res;
+   unsigned swizzle = swizzle_in & 0xffff;
 
    if (reg->Register.Indirect) {
       LLVMValueRef indirect_index;
@@ -1403,7 +1053,8 @@ emit_fetch_input(
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
                                           reg->Register.Index,
-                                          &reg->Indirect);
+                                          &reg->Indirect,
+                                          bld->bld_base.info->file_max[reg->Register.File]);
 
       index_vec = get_soa_array_offsets(&bld_base->uint_bld,
                                         indirect_index,
@@ -1412,7 +1063,7 @@ emit_fetch_input(
       if (tgsi_type_is_64bit(stype)) {
          index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
                                            indirect_index,
-                                           swizzle + 1,
+                                           swizzle_in >> 16,
                                            TRUE);
       }
       /* cast inputs_array pointer to float* */
@@ -1435,7 +1086,7 @@ emit_fetch_input(
             LLVMValueRef res2;
 
             lindex1 = lp_build_const_int32(gallivm,
-                                           reg->Register.Index * 4 + swizzle + 1);
+                                           reg->Register.Index * 4 + (swizzle_in >> 16));
             input_ptr2 = LLVMBuildGEP(builder,
                                       bld->inputs_array, &lindex1, 1, "");
             res2 = LLVMBuildLoad(builder, input_ptr2, "");
@@ -1445,7 +1096,7 @@ emit_fetch_input(
       else {
          res = bld->inputs[reg->Register.Index][swizzle];
          if (tgsi_type_is_64bit(stype))
-            res = emit_fetch_64bit(bld_base, stype, res, bld->inputs[reg->Register.Index][swizzle + 1]);
+            res = emit_fetch_64bit(bld_base, stype, res, bld->inputs[reg->Register.Index][swizzle_in >> 16]);
       }
    }
 
@@ -1465,7 +1116,7 @@ emit_fetch_gs_input(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
@@ -1473,6 +1124,7 @@ emit_fetch_gs_input(
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef attrib_index = NULL;
    LLVMValueRef vertex_index = NULL;
+   unsigned swizzle = swizzle_in & 0xffff;
    LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle);
    LLVMValueRef res;
 
@@ -1488,24 +1140,38 @@ emit_fetch_gs_input(
    }
 
    if (reg->Register.Indirect) {
+      /*
+       * XXX: this is possibly not quite the right value, since file_max may be
+       * larger than the max attrib index, due to it being the max of declared
+       * inputs AND the max vertices per prim (which is 6 for tri adj).
+       * It should however be safe to use (since we always allocate
+       * PIPE_MAX_SHADER_INPUTS (80) for it, which is overallocated quite a bit).
+       */
+      int index_limit = info->file_max[reg->Register.File];
       attrib_index = get_indirect_index(bld,
                                         reg->Register.File,
                                         reg->Register.Index,
-                                        &reg->Indirect);
+                                        &reg->Indirect,
+                                        index_limit);
    } else {
       attrib_index = lp_build_const_int32(gallivm, reg->Register.Index);
    }
 
    if (reg->Dimension.Indirect) {
+      /*
+       * A fixed 6 should do as well (which is what we allocate).
+       */
+      int index_limit = u_vertices_per_prim(info->properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
       vertex_index = get_indirect_index(bld,
                                         reg->Register.File,
                                         reg->Dimension.Index,
-                                        &reg->DimIndirect);
+                                        &reg->DimIndirect,
+                                        index_limit);
    } else {
       vertex_index = lp_build_const_int32(gallivm, reg->Dimension.Index);
    }
 
-   res = bld->gs_iface->fetch_input(bld->gs_iface, bld_base,
+   res = bld->gs_iface->fetch_input(bld->gs_iface, &bld_base->base,
                                     reg->Dimension.Indirect,
                                     vertex_index,
                                     reg->Register.Indirect,
@@ -1514,9 +1180,9 @@ emit_fetch_gs_input(
 
    assert(res);
    if (tgsi_type_is_64bit(stype)) {
-      LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle + 1);
+      LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle_in >> 16);
       LLVMValueRef res2;
-      res2 = bld->gs_iface->fetch_input(bld->gs_iface, bld_base,
+      res2 = bld->gs_iface->fetch_input(bld->gs_iface, &bld_base->base,
                                         reg->Dimension.Indirect,
                                         vertex_index,
                                         reg->Register.Indirect,
@@ -1538,12 +1204,13 @@ emit_fetch_temporary(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef res;
+   unsigned swizzle = swizzle_in & 0xffff;
 
    if (reg->Register.Indirect) {
       LLVMValueRef indirect_index;
@@ -1554,7 +1221,8 @@ emit_fetch_temporary(
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
                                           reg->Register.Index,
-                                          &reg->Indirect);
+                                          &reg->Indirect,
+                                          bld->bld_base.info->file_max[reg->Register.File]);
 
       index_vec = get_soa_array_offsets(&bld_base->uint_bld,
                                         indirect_index,
@@ -1563,7 +1231,7 @@ emit_fetch_temporary(
       if (tgsi_type_is_64bit(stype)) {
                index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
                                                   indirect_index,
-                                                  swizzle + 1,
+                                                  swizzle_in >> 16,
                                                   TRUE);
       }
 
@@ -1582,7 +1250,7 @@ emit_fetch_temporary(
       if (tgsi_type_is_64bit(stype)) {
          LLVMValueRef temp_ptr2, res2;
 
-         temp_ptr2 = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle + 1);
+         temp_ptr2 = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle_in >> 16);
          res2 = LLVMBuildLoad(builder, temp_ptr2, "");
          res = emit_fetch_64bit(bld_base, stype, res, res2);
       }
@@ -1605,7 +1273,7 @@ emit_fetch_system_value(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
@@ -1613,6 +1281,7 @@ emit_fetch_system_value(
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef res;
    enum tgsi_opcode_type atype; // Actual type of the value
+   unsigned swizzle = swizzle_in & 0xffff;
 
    assert(!reg->Register.Indirect);
 
@@ -1637,6 +1306,11 @@ emit_fetch_system_value(
       atype = TGSI_TYPE_UNSIGNED;
       break;
 
+   case TGSI_SEMANTIC_BASEINSTANCE:
+      res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.base_instance);
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
    case TGSI_SEMANTIC_PRIMID:
       res = bld->system_values.prim_id;
       atype = TGSI_TYPE_UNSIGNED;
@@ -1647,6 +1321,30 @@ emit_fetch_system_value(
       atype = TGSI_TYPE_UNSIGNED;
       break;
 
+   case TGSI_SEMANTIC_HELPER_INVOCATION:
+      res = LLVMBuildNot(gallivm->builder, lp_build_mask_value(bld->mask), "");
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_THREAD_ID:
+      res = LLVMBuildExtractValue(gallivm->builder, bld->system_values.thread_id, swizzle, "");
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_BLOCK_ID:
+      res = lp_build_extract_broadcast(gallivm, lp_type_int_vec(32, 96), bld_base->uint_bld.type, bld->system_values.block_id, lp_build_const_int32(gallivm, swizzle));
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_GRID_SIZE:
+      res = lp_build_extract_broadcast(gallivm, lp_type_int_vec(32, 96), bld_base->uint_bld.type, bld->system_values.grid_size, lp_build_const_int32(gallivm, swizzle));
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_FACE:
+      res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.front_facing);
+      break;
+
    default:
       assert(!"unexpected semantic in emit_fetch_system_value");
       res = bld_base->base.zero;
@@ -1691,7 +1389,7 @@ emit_fetch_deriv(
 }
 
 /**
- * store an array of 8 64-bit into two arrays of 8 floats
+ * store an array of vec-length 64-bit into two arrays of vec_length floats
  * i.e.
  * value is d0, d1, d2, d3 etc.
  * each 64-bit has high and low pieces x, y
@@ -1710,8 +1408,8 @@ emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base,
    struct lp_build_context *float_bld = &bld_base->base;
    unsigned i;
    LLVMValueRef temp, temp2;
-   LLVMValueRef shuffles[8];
-   LLVMValueRef shuffles2[8];
+   LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH/32];
+   LLVMValueRef shuffles2[LP_MAX_VECTOR_WIDTH/32];
 
    for (i = 0; i < bld_base->base.type.length; i++) {
       shuffles[i] = lp_build_const_int32(gallivm, i * 2);
@@ -1774,7 +1472,8 @@ emit_store_chan(
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
                                           reg->Register.Index,
-                                          &reg->Indirect);
+                                          &reg->Indirect,
+                                          bld->bld_base.info->file_max[reg->Register.File]);
    } else {
       assert(reg->Register.Index <=
                              bld_base->info->file_max[reg->Register.File]);
@@ -1995,12 +1694,13 @@ lp_build_lod_property(
     * constant coords maybe).
     * There's at least hope for sample opcodes as well as size queries.
     */
-   if (reg->Register.File == TGSI_FILE_CONSTANT ||
+   if (inst->Instruction.Opcode == TGSI_OPCODE_TEX_LZ ||
+       reg->Register.File == TGSI_FILE_CONSTANT ||
        reg->Register.File == TGSI_FILE_IMMEDIATE) {
       lod_property = LP_SAMPLER_LOD_SCALAR;
    }
    else if (bld_base->info->processor == PIPE_SHADER_FRAGMENT) {
-      if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) {
+      if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD) {
          lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
       }
       else {
@@ -2120,8 +1820,10 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
    /* Note lod and especially projected are illegal in a LOT of cases */
    if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS ||
        modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
-      if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
-          inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
+      if (inst->Instruction.Opcode == TGSI_OPCODE_TEX_LZ) {
+         lod = bld->bld_base.base.zero;
+      } else if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
+                 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
          /* note that shadow cube array with bias/explicit lod does not exist */
          lod = lp_build_emit_fetch(&bld->bld_base, inst, 1, 0);
       }
@@ -2137,6 +1839,10 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
    }
 
+   if (sampler_op == LP_SAMPLER_OP_GATHER) {
+      uint32_t comp_val = inst->Src[sampler_reg].Register.SwizzleX;
+      sample_key |= (comp_val << LP_SAMPLER_GATHER_COMP_SHIFT);
+   }
    if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) {
       oow = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
       oow = lp_build_rcp(&bld->bld_base.base, oow);
@@ -2188,7 +1894,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
        * cases exist in practice.
        */
       if (bld->bld_base.info->processor == PIPE_SHADER_FRAGMENT) {
-         if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) {
+         if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD) {
             lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
          }
          else {
@@ -2357,7 +2063,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
        * cases exist in practice.
        */
       if (bld->bld_base.info->processor == PIPE_SHADER_FRAGMENT) {
-         if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) {
+         if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD) {
             lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
          }
          else {
@@ -2474,7 +2180,8 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
    /* always have lod except for buffers and msaa targets ? */
    if (target != TGSI_TEXTURE_BUFFER &&
        target != TGSI_TEXTURE_2D_MSAA &&
-       target != TGSI_TEXTURE_2D_ARRAY_MSAA) {
+       target != TGSI_TEXTURE_2D_ARRAY_MSAA &&
+       inst->Instruction.Opcode != TGSI_OPCODE_TXF_LZ) {
       sample_key |= LP_SAMPLER_LOD_EXPLICIT << LP_SAMPLER_LOD_CONTROL_SHIFT;
       explicit_lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
       lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
@@ -2610,7 +2317,7 @@ near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
    unsigned i;
 
    for (i = 0; i < 5; i++) {
-      unsigned opcode;
+      enum tgsi_opcode opcode;
 
       if (pc + i >= bld->bld_base.info->num_instructions)
          return TRUE;
@@ -2902,8 +2609,21 @@ lp_emit_declaration_soa(
       bld->consts_sizes[idx2D] =
          lp_build_array_get(gallivm, bld->const_sizes_ptr, index2D);
    }
-      break;
+   break;
+   case TGSI_FILE_BUFFER:
+   {
+      unsigned idx = decl->Range.First;
+      LLVMValueRef index = lp_build_const_int32(gallivm, idx);
+      assert(idx < LP_MAX_TGSI_SHADER_BUFFERS);
+      bld->ssbos[idx] =
+         lp_build_array_get(gallivm, bld->ssbo_ptr, index);
+      bld->ssbo_sizes[idx] =
+         lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, index);
 
+   }
+   break;
+   case TGSI_FILE_MEMORY:
+      break;
    default:
       /* don't need to declare other vars */
       break;
@@ -2953,13 +2673,14 @@ void lp_emit_immediate_soa(
       unsigned index = bld->num_immediates;
       struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
       LLVMBuilderRef builder = gallivm->builder;
+      LLVMValueRef gep[2];
+      gep[0] = lp_build_const_int32(gallivm, 0);
 
       assert(bld->indirect_files & (1 << TGSI_FILE_IMMEDIATE));
       for (i = 0; i < 4; ++i ) {
-         LLVMValueRef lindex = lp_build_const_int32(
-                  bld->bld_base.base.gallivm, index * 4 + i);
+         gep[1] = lp_build_const_int32(gallivm, index * 4 + i);
          LLVMValueRef imm_ptr = LLVMBuildGEP(builder,
-                                             bld->imms_array, &lindex, 1, "");
+                                             bld->imms_array, gep, 2, "");
          LLVMBuildStore(builder, imms[i], imm_ptr);
       }
    } else {
@@ -2975,11 +2696,12 @@ void lp_emit_immediate_soa(
          unsigned index = bld->num_immediates;
          struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
          LLVMBuilderRef builder = gallivm->builder;
+         LLVMValueRef gep[2];
+         gep[0] = lp_build_const_int32(gallivm, 0);
          for (i = 0; i < 4; ++i ) {
-            LLVMValueRef lindex = lp_build_const_int32(
-                     bld->bld_base.base.gallivm, index * 4 + i);
+            gep[1] = lp_build_const_int32(gallivm, index * 4 + i);
             LLVMValueRef imm_ptr = LLVMBuildGEP(builder,
-                                                bld->imms_array, &lindex, 1, "");
+                                                bld->imms_array, gep, 2, "");
             LLVMBuildStore(builder,
                            bld->immediates[index][i],
                            imm_ptr);
@@ -3284,18 +3006,559 @@ sviewinfo_emit(
    emit_size_query(bld, emit_data->inst, emit_data->output, TRUE);
 }
 
-static LLVMValueRef
-mask_vec(struct lp_build_tgsi_context *bld_base)
+static void
+lod_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+               FALSE, LP_SAMPLER_OP_LODQ, emit_data->output);
+}
+
+static void target_to_dims_layer(unsigned target,
+                                 unsigned *dims,
+                                 unsigned *layer_coord)
+{
+   *layer_coord = 0;
+   switch (target) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_BUFFER:
+      *dims = 1;
+      break;
+   case TGSI_TEXTURE_1D_ARRAY:
+      *layer_coord = 1;
+      *dims = 1;
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+      *dims = 2;
+      break;
+   case TGSI_TEXTURE_2D_ARRAY:
+      *layer_coord = 2;
+      *dims = 2;
+      break;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      *dims = 3;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+}
+
+static void
+img_load_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct lp_img_params params;
+   LLVMValueRef coords[5];
+   LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type);
+   unsigned dims;
+   unsigned target = emit_data->inst->Memory.Texture;
+   unsigned layer_coord;
+
+   target_to_dims_layer(target, &dims, &layer_coord);
+
+   for (unsigned i = 0; i < dims; i++) {
+      coords[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, i);
+   }
+   for (unsigned i = dims; i < 5; i++) {
+      coords[i] = coord_undef;
+   }
+   if (layer_coord)
+      coords[2] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, layer_coord);
+
+   memset(&params, 0, sizeof(params));
+
+   params.type = bld->bld_base.base.type;
+   params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
+   params.coords = coords;
+   params.outdata = emit_data->output;
+   params.target = tgsi_to_pipe_tex_target(target);
+   params.image_index = emit_data->inst->Src[0].Register.Index;
+   params.img_op = LP_IMG_LOAD;
+   bld->image->emit_op(bld->image,
+                         bld->bld_base.base.gallivm,
+                         &params);
+}
+
+static void
+load_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct gallivm_state * gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   struct lp_exec_mask *exec_mask = &bld->exec_mask;
+   const struct tgsi_full_src_register *bufreg = &emit_data->inst->Src[0];
+   unsigned buf = bufreg->Register.Index;
+   assert(bufreg->Register.File == TGSI_FILE_BUFFER ||
+          bufreg->Register.File == TGSI_FILE_IMAGE ||
+          bufreg->Register.File == TGSI_FILE_MEMORY ||
+          bufreg->Register.File == TGSI_FILE_CONSTBUF);
+   bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
 
-   if (!exec_mask->has_mask) {
-      return lp_build_mask_value(bld->mask);
+   if (bufreg->Register.File == TGSI_FILE_IMAGE) {
+      img_load_emit(action, bld_base, emit_data);
+   } else if (bufreg->Register.File == TGSI_FILE_CONSTBUF) {
+      LLVMValueRef consts_ptr = bld->consts[buf];
+      LLVMValueRef num_consts = bld->consts_sizes[buf];
+
+      LLVMValueRef indirect_index;
+      LLVMValueRef overflow_mask;
+
+      indirect_index = lp_build_emit_fetch(bld_base, emit_data->inst, 1, 0);
+      indirect_index = lp_build_shr_imm(uint_bld, indirect_index, 4);
+
+      /* All fetches are from the same constant buffer, so
+       * we need to propagate the size to a vector to do a
+       * vector comparison */
+      num_consts = lp_build_broadcast_scalar(uint_bld, num_consts);
+
+      /* Gather values from the constant buffer */
+      unsigned chan_index;
+      TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) {
+         /* Construct a boolean vector telling us which channels
+          * overflow the bound constant buffer */
+         overflow_mask = lp_build_compare(gallivm, uint_bld->type, PIPE_FUNC_GEQUAL,
+                                          indirect_index, num_consts);
+
+         /* index_vec = indirect_index * 4 */
+         LLVMValueRef index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec = lp_build_add(uint_bld, index_vec,
+                                  lp_build_const_int_vec(gallivm, uint_bld->type, chan_index));
+
+         emit_data->output[chan_index] = build_gather(bld_base, consts_ptr, index_vec, overflow_mask, NULL);
+      }
+   } else if (0) {
+      /* for indirect support with ARB_gpu_shader5 */
+   } else {
+      LLVMValueRef index;
+      LLVMValueRef scalar, scalar_ptr;
+      unsigned chan_index;
+
+      index = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, 0);
+      index = lp_build_shr_imm(uint_bld, index, 2);
+
+      scalar_ptr = is_shared ? bld->shared_ptr : bld->ssbos[buf];
+
+      LLVMValueRef ssbo_limit;
+
+      if (!is_shared) {
+         ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
+         ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      }
+
+      TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) {
+         LLVMValueRef loop_index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, chan_index));
+
+         LLVMValueRef exec_mask = mask_vec(bld_base);
+         if (!is_shared) {
+            LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit);
+            exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+         }
+
+         LLVMValueRef result = lp_build_alloca(gallivm, uint_bld->vec_type, "");
+         struct lp_build_loop_state loop_state;
+         lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+
+         struct lp_build_if_state ifthen;
+         LLVMValueRef cond, temp_res;
+
+         loop_index = LLVMBuildExtractElement(gallivm->builder, loop_index,
+                                              loop_state.counter, "");
+
+         cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
+         cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+
+         lp_build_if(&ifthen, gallivm, cond);
+         scalar = lp_build_pointer_get(builder, scalar_ptr, loop_index);
+
+         temp_res = LLVMBuildLoad(builder, result, "");
+         temp_res = LLVMBuildInsertElement(builder, temp_res, scalar, loop_state.counter, "");
+         LLVMBuildStore(builder, temp_res, result);
+         lp_build_else(&ifthen);
+         temp_res = LLVMBuildLoad(builder, result, "");
+         temp_res = LLVMBuildInsertElement(builder, temp_res, lp_build_const_int32(gallivm, 0), loop_state.counter, "");
+         LLVMBuildStore(builder, temp_res, result);
+         lp_build_endif(&ifthen);
+         lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length),
+                                NULL, LLVMIntUGE);
+         emit_data->output[chan_index] = LLVMBuildLoad(gallivm->builder, result, "");
+      }
    }
-   return LLVMBuildAnd(builder, lp_build_mask_value(bld->mask),
-                       exec_mask->exec_mask, "");
+}
+
+static void
+img_store_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct lp_img_params params;
+   LLVMValueRef coords[5];
+   LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type);
+   unsigned dims;
+   unsigned target = emit_data->inst->Memory.Texture;
+   unsigned layer_coord;
+
+   target_to_dims_layer(target, &dims, &layer_coord);
+   for (unsigned i = 0; i < dims; i++) {
+      coords[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, i);
+   }
+   for (unsigned i = dims; i < 5; i++) {
+      coords[i] = coord_undef;
+   }
+   if (layer_coord)
+      coords[2] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, layer_coord);
+   memset(&params, 0, sizeof(params));
+
+   params.type = bld->bld_base.base.type;
+   params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
+   params.coords = coords;
+   params.outdata = NULL;
+   params.exec_mask = mask_vec(bld_base);
+   params.target = tgsi_to_pipe_tex_target(target);
+   params.image_index = emit_data->inst->Dst[0].Register.Index;
+   params.img_op = LP_IMG_STORE;
+   for (unsigned i = 0; i < 4; i++)
+      params.indata[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, i);
+
+   bld->image->emit_op(bld->image,
+                       bld->bld_base.base.gallivm,
+                       &params);
+}
+
+static void
+store_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct gallivm_state * gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   const struct tgsi_full_dst_register *bufreg = &emit_data->inst->Dst[0];
+   unsigned buf = bufreg->Register.Index;
+   assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE || bufreg->Register.File == TGSI_FILE_MEMORY);
+   bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY;
+
+   if (bufreg->Register.File == TGSI_FILE_IMAGE) {
+      img_store_emit(action, bld_base, emit_data);
+   } else if (0) {
+
+   } else {
+      LLVMValueRef index;  /* index into the const buffer */
+      LLVMValueRef scalar_ptr;
+      LLVMValueRef value;
+      unsigned chan_index;
+
+      index = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, 0);
+      index = lp_build_shr_imm(uint_bld, index, 2);
+
+      scalar_ptr = is_shared ? bld->shared_ptr : bld->ssbos[buf];
+
+      LLVMValueRef ssbo_limit;
+
+      if (!is_shared) {
+         ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
+         ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      }
+
+      TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) {
+         LLVMValueRef loop_index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, chan_index));
+
+         value = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, chan_index);
+
+         LLVMValueRef exec_mask = mask_vec(bld_base);
+         if (!is_shared) {
+            LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit);
+            exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+         }
+
+         struct lp_build_loop_state loop_state;
+         lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+
+         LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, value,
+                                                          loop_state.counter, "");
+         value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, uint_bld->elem_type, "");
+
+         struct lp_build_if_state ifthen;
+         LLVMValueRef cond;
+
+         loop_index = LLVMBuildExtractElement(gallivm->builder, loop_index,
+                                              loop_state.counter, "");
+
+         cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
+         cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+         lp_build_if(&ifthen, gallivm, cond);
+
+         lp_build_pointer_set(builder, scalar_ptr, loop_index, value_ptr);
+
+         lp_build_endif(&ifthen);
+         lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length),
+                                NULL, LLVMIntUGE);
+      }
+   }
+}
+
+static void
+resq_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   const struct tgsi_full_src_register *bufreg = &emit_data->inst->Src[0];
+
+   unsigned buf = bufreg->Register.Index;
+   assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE);
+
+   if (bufreg->Register.File == TGSI_FILE_IMAGE) {
+      unsigned target = emit_data->inst->Memory.Texture;
+      struct lp_sampler_size_query_params params = { 0 };
+      params.int_type = bld->bld_base.int_bld.type;
+      params.texture_unit = buf;
+      params.target = tgsi_to_pipe_tex_target(target);
+      params.context_ptr = bld->context_ptr;
+      params.sizes_out = emit_data->output;
+
+      bld->image->emit_size_query(bld->image,
+                                  bld->bld_base.base.gallivm,
+                                  &params);
+   } else {
+      LLVMValueRef num_ssbo = bld->ssbo_sizes[buf];
+
+      emit_data->output[emit_data->chan] = lp_build_broadcast_scalar(uint_bld, num_ssbo);
+   }
+}
+
+static void
+img_atomic_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data,
+   LLVMAtomicRMWBinOp op)
+{
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct lp_img_params params;
+   LLVMValueRef coords[5];
+   LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type);
+   unsigned dims;
+   unsigned layer_coord;
+   unsigned target = emit_data->inst->Memory.Texture;
+
+   target_to_dims_layer(target, &dims, &layer_coord);
+
+   for (unsigned i = 0; i < dims; i++) {
+      coords[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, i);
+   }
+   for (unsigned i = dims; i < 5; i++) {
+      coords[i] = coord_undef;
+   }
+   if (layer_coord)
+      coords[2] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, layer_coord);
+   memset(&params, 0, sizeof(params));
+
+   params.type = bld->bld_base.base.type;
+   params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
+   params.exec_mask = mask_vec(bld_base);
+   params.image_index = emit_data->inst->Src[0].Register.Index;
+   params.coords = coords;
+   params.target = tgsi_to_pipe_tex_target(target);
+   params.op = op;
+   params.outdata = emit_data->output;
+   params.img_op = (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) ? LP_IMG_ATOMIC_CAS : LP_IMG_ATOMIC;
+
+   for (unsigned i = 0; i < 4; i++)
+      params.indata[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 2, i);
+   if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+      for (unsigned i = 0; i < 4; i++)
+         params.indata2[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 3, i);
+   }
+   bld->image->emit_op(bld->image,
+                       bld->bld_base.base.gallivm,
+                       &params);
+}
+
+static void
+atomic_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct gallivm_state * gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   const struct tgsi_full_src_register *bufreg = &emit_data->inst->Src[0];
+
+   assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE || bufreg->Register.File == TGSI_FILE_MEMORY);
+   unsigned buf = bufreg->Register.Index;
+   bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY;
+
+   LLVMAtomicRMWBinOp op;
+   switch (emit_data->inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ATOMUADD:
+      op = LLVMAtomicRMWBinOpAdd;
+      break;
+   case TGSI_OPCODE_ATOMXCHG:
+      op = LLVMAtomicRMWBinOpXchg;
+      break;
+   case TGSI_OPCODE_ATOMAND:
+      op = LLVMAtomicRMWBinOpAnd;
+      break;
+   case TGSI_OPCODE_ATOMOR:
+      op = LLVMAtomicRMWBinOpOr;
+      break;
+   case TGSI_OPCODE_ATOMXOR:
+      op = LLVMAtomicRMWBinOpXor;
+      break;
+   case TGSI_OPCODE_ATOMUMIN:
+      op = LLVMAtomicRMWBinOpUMin;
+      break;
+   case TGSI_OPCODE_ATOMUMAX:
+      op = LLVMAtomicRMWBinOpUMax;
+      break;
+   case TGSI_OPCODE_ATOMIMIN:
+      op = LLVMAtomicRMWBinOpMin;
+      break;
+   case TGSI_OPCODE_ATOMIMAX:
+      op = LLVMAtomicRMWBinOpMax;
+      break;
+   case TGSI_OPCODE_ATOMCAS:
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   if (bufreg->Register.File == TGSI_FILE_IMAGE) {
+      img_atomic_emit(action, bld_base, emit_data, op);
+   } else if (0) {
+   } else {
+      LLVMValueRef index;  /* index into the const buffer */
+      LLVMValueRef scalar, scalar_ptr;
+      LLVMValueRef value;
+
+      index = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, 0);
+      value = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 2, 0);
+
+      index = lp_build_shr_imm(uint_bld, index, 2);
+
+      if (!is_shared) {
+         index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, emit_data->chan));
+         scalar_ptr = bld->ssbos[buf];
+      } else
+         scalar_ptr = bld->shared_ptr;
+
+      LLVMValueRef atom_res = lp_build_alloca(gallivm,
+                                              uint_bld->vec_type, "");
+
+      LLVMValueRef ssbo_limit;
+      if (!is_shared) {
+         ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
+         ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      }
+
+      LLVMValueRef exec_mask = mask_vec(bld_base);
+
+      if (!is_shared) {
+         LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, index, ssbo_limit);
+         exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+      }
+
+      struct lp_build_loop_state loop_state;
+      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+
+      LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, value,
+                                                       loop_state.counter, "");
+      value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, uint_bld->elem_type, "");
+
+      index = LLVMBuildExtractElement(gallivm->builder, index,
+                                      loop_state.counter, "");
+
+      scalar_ptr = LLVMBuildGEP(builder, scalar_ptr,
+                                &index, 1, "");
+
+      struct lp_build_if_state ifthen;
+      LLVMValueRef cond, temp_res;
+
+      cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
+      cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+      lp_build_if(&ifthen, gallivm, cond);
+
+      if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+         LLVMValueRef cas_src = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 3, 0);
+         LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, cas_src,
+                                                            loop_state.counter, "");
+         cas_src_ptr = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, uint_bld->elem_type, "");
+         scalar = LLVMBuildAtomicCmpXchg(builder, scalar_ptr, value_ptr,
+                                         cas_src_ptr,
+                                         LLVMAtomicOrderingSequentiallyConsistent,
+                                         LLVMAtomicOrderingSequentiallyConsistent,
+                                         false);
+         scalar = LLVMBuildExtractValue(gallivm->builder, scalar, 0, "");
+      } else {
+         scalar = LLVMBuildAtomicRMW(builder, op,
+                                     scalar_ptr, value_ptr,
+                                     LLVMAtomicOrderingSequentiallyConsistent,
+                                     false);
+      }
+      temp_res = LLVMBuildLoad(builder, atom_res, "");
+      temp_res = LLVMBuildInsertElement(builder, temp_res, scalar, loop_state.counter, "");
+      LLVMBuildStore(builder, temp_res, atom_res);
+      lp_build_else(&ifthen);
+      temp_res = LLVMBuildLoad(builder, atom_res, "");
+      temp_res = LLVMBuildInsertElement(builder, temp_res, lp_build_const_int32(gallivm, 0), loop_state.counter, "");
+      LLVMBuildStore(builder, temp_res, atom_res);
+      lp_build_endif(&ifthen);
+
+      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length),
+                             NULL, LLVMIntUGE);
+      emit_data->output[emit_data->chan] = LLVMBuildLoad(gallivm->builder, atom_res, "");
+   }
+}
+
+static void
+barrier_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct gallivm_state * gallivm = bld_base->base.gallivm;
+
+   LLVMBasicBlockRef resume = lp_build_insert_new_block(gallivm, "resume");
+
+   lp_build_coro_suspend_switch(gallivm, bld->coro, resume, false);
+   LLVMPositionBuilderAtEnd(gallivm->builder, resume);
+}
+
+static void
+membar_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMBuildFence(builder, LLVMAtomicOrderingSequentiallyConsistent, false, "");
 }
 
 static void
@@ -3351,15 +3614,18 @@ emit_vertex(
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
 
    if (bld->gs_iface->emit_vertex) {
+      uint32_t imms_idx = emit_data->inst->Src[0].Register.SwizzleX;
+      LLVMValueRef stream_id = bld->immediates[0][imms_idx];
       LLVMValueRef mask = mask_vec(bld_base);
       LLVMValueRef total_emitted_vertices_vec =
          LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, "");
       mask = clamp_mask_to_max_output_vertices(bld, mask,
                                                total_emitted_vertices_vec);
       gather_outputs(bld);
-      bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base,
+      bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base.base,
                                  bld->outputs,
-                                 total_emitted_vertices_vec);
+                                 total_emitted_vertices_vec,
+                                 stream_id);
       increment_vec_ptr_by_mask(bld_base, bld->emitted_vertices_vec_ptr,
                                 mask);
       increment_vec_ptr_by_mask(bld_base, bld->total_emitted_vertices_vec_ptr,
@@ -3389,7 +3655,8 @@ end_primitive_masked(struct lp_build_tgsi_context * bld_base,
          LLVMBuildLoad(builder, bld->emitted_vertices_vec_ptr, "");
       LLVMValueRef emitted_prims_vec =
          LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, "");
-
+      LLVMValueRef total_emitted_vertices_vec =
+         LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, "");
       LLVMValueRef emitted_mask = lp_build_cmp(uint_bld, PIPE_FUNC_NOTEQUAL,
                                                emitted_vertices_vec,
                                                uint_bld->zero);
@@ -3399,9 +3666,11 @@ end_primitive_masked(struct lp_build_tgsi_context * bld_base,
          executes only on the paths that have unflushed vertices */
       mask = LLVMBuildAnd(builder, mask, emitted_mask, "");
 
-      bld->gs_iface->end_primitive(bld->gs_iface, &bld->bld_base,
+      bld->gs_iface->end_primitive(bld->gs_iface, &bld->bld_base.base,
+                                   total_emitted_vertices_vec,
                                    emitted_vertices_vec,
-                                   emitted_prims_vec);
+                                   emitted_prims_vec,
+                                   mask_vec(bld_base));
 
 #if DUMP_GS_EMITS
       lp_build_print_value(bld->bld_base.base.gallivm,
@@ -3474,7 +3743,7 @@ brk_emit(
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
-   lp_exec_break(&bld->exec_mask, bld_base);
+   lp_exec_tgsi_break(&bld->exec_mask, bld_base);
 }
 
 static void
@@ -3558,7 +3827,7 @@ bgnloop_emit(
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
-   lp_exec_bgnloop(&bld->exec_mask);
+   lp_exec_bgnloop(&bld->exec_mask, true);
 }
 
 static void
@@ -3633,12 +3902,10 @@ static void emit_prologue(struct lp_build_tgsi_context * bld_base)
    struct gallivm_state * gallivm = bld_base->base.gallivm;
 
    if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
-      LLVMValueRef array_size =
-         lp_build_const_int32(gallivm,
-                         bld_base->info->file_max[TGSI_FILE_TEMPORARY] * 4 + 4);
-      bld->temps_array = lp_build_array_alloca(gallivm,
-                                              bld_base->base.vec_type, array_size,
-                                              "temp_array");
+      unsigned array_size = bld_base->info->file_max[TGSI_FILE_TEMPORARY] * 4 + 4;
+      bld->temps_array = lp_build_alloca_undef(gallivm,
+                                               LLVMArrayType(bld_base->base.vec_type, array_size),
+                                               "temp_array");
    }
 
    if (bld->indirect_files & (1 << TGSI_FILE_OUTPUT)) {
@@ -3651,11 +3918,9 @@ static void emit_prologue(struct lp_build_tgsi_context * bld_base)
    }
 
    if (bld->indirect_files & (1 << TGSI_FILE_IMMEDIATE)) {
-      LLVMValueRef array_size =
-         lp_build_const_int32(gallivm,
-                         bld_base->info->file_max[TGSI_FILE_IMMEDIATE] * 4 + 4);
-      bld->imms_array = lp_build_array_alloca(gallivm,
-                                              bld_base->base.vec_type, array_size,
+      unsigned array_size = bld_base->info->file_max[TGSI_FILE_IMMEDIATE] * 4 + 4;
+      bld->imms_array = lp_build_alloca_undef(gallivm,
+                                              LLVMArrayType(bld_base->base.vec_type, array_size),
                                               "imms_array");
    }
 
@@ -3748,7 +4013,6 @@ static void emit_epilogue(struct lp_build_tgsi_context * bld_base)
          LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, "");
 
       bld->gs_iface->gs_epilogue(bld->gs_iface,
-                                 &bld->bld_base,
                                  total_emitted_vertices_vec,
                                  emitted_prims_vec);
    } else {
@@ -3759,21 +4023,11 @@ static void emit_epilogue(struct lp_build_tgsi_context * bld_base)
 void
 lp_build_tgsi_soa(struct gallivm_state *gallivm,
                   const struct tgsi_token *tokens,
-                  struct lp_type type,
-                  struct lp_build_mask_context *mask,
-                  LLVMValueRef consts_ptr,
-                  LLVMValueRef const_sizes_ptr,
-                  const struct lp_bld_tgsi_system_values *system_values,
-                  const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS],
-                  LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
-                  LLVMValueRef context_ptr,
-                  LLVMValueRef thread_data_ptr,
-                  struct lp_build_sampler_soa *sampler,
-                  const struct tgsi_shader_info *info,
-                  const struct lp_build_tgsi_gs_iface *gs_iface)
+                  const struct lp_build_tgsi_params *params,
+                  LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS])
 {
    struct lp_build_tgsi_soa_context bld;
-
+   struct lp_type type = params->type;
    struct lp_type res_type;
 
    assert(type.length <= LP_MAX_VECTOR_LENGTH);
@@ -3806,23 +4060,28 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
       int64_type.width *= 2;
       lp_build_context_init(&bld.bld_base.int64_bld, gallivm, int64_type);
    }
-   bld.mask = mask;
-   bld.inputs = inputs;
+   bld.mask = params->mask;
+   bld.inputs = params->inputs;
    bld.outputs = outputs;
-   bld.consts_ptr = consts_ptr;
-   bld.const_sizes_ptr = const_sizes_ptr;
-   bld.sampler = sampler;
-   bld.bld_base.info = info;
-   bld.indirect_files = info->indirect_files;
-   bld.context_ptr = context_ptr;
-   bld.thread_data_ptr = thread_data_ptr;
+   bld.consts_ptr = params->consts_ptr;
+   bld.const_sizes_ptr = params->const_sizes_ptr;
+   bld.ssbo_ptr = params->ssbo_ptr;
+   bld.ssbo_sizes_ptr = params->ssbo_sizes_ptr;
+   bld.sampler = params->sampler;
+   bld.bld_base.info = params->info;
+   bld.indirect_files = params->info->indirect_files;
+   bld.context_ptr = params->context_ptr;
+   bld.thread_data_ptr = params->thread_data_ptr;
+   bld.image = params->image;
+   bld.shared_ptr = params->shared_ptr;
+   bld.coro = params->coro;
 
    /*
     * If the number of temporaries is rather large then we just
     * allocate them as an array right from the start and treat
     * like indirect temporaries.
     */
-   if (info->file_max[TGSI_FILE_TEMPORARY] >= LP_MAX_INLINED_TEMPS) {
+   if (params->info->file_max[TGSI_FILE_TEMPORARY] >= LP_MAX_INLINED_TEMPS) {
       bld.indirect_files |= (1 << TGSI_FILE_TEMPORARY);
    }
    /*
@@ -3831,7 +4090,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
     * a dynamically allocated array.
     */
    bld.use_immediates_array =
-         (info->file_max[TGSI_FILE_IMMEDIATE] >= LP_MAX_INLINED_IMMEDIATES);
+         (params->info->file_max[TGSI_FILE_IMMEDIATE] >= LP_MAX_INLINED_IMMEDIATES);
    if (bld.use_immediates_array) {
       bld.indirect_files |= (1 << TGSI_FILE_IMMEDIATE);
    }
@@ -3879,9 +4138,11 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.op_actions[TGSI_OPCODE_TXB].emit = txb_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXD].emit = txd_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXL].emit = txl_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TEX_LZ].emit = txl_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXP].emit = txp_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXF].emit = txf_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TXF_LZ].emit = txf_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TEX2].emit = tex2_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXB2].emit = txb2_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXL2].emit = txl2_emit;
@@ -3898,8 +4159,27 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_L].emit = sample_l_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_GATHER4].emit = gather4_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_SVIEWINFO].emit = sviewinfo_emit;
-
-   if (gs_iface) {
+   bld.bld_base.op_actions[TGSI_OPCODE_LOD].emit = lod_emit;
+
+   bld.bld_base.op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_STORE].emit = store_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
+
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMUADD].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMXCHG].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMCAS].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMAND].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMOR].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMXOR].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMUMIN].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMUMAX].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMIMIN].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMIMAX].emit = atomic_emit;
+
+   bld.bld_base.op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_BARRIER].emit = barrier_emit;
+
+   if (params->gs_iface) {
       /* There's no specific value for this because it should always
        * be set, but apps using ext_geometry_shader4 quite often
        * were forgetting so we're using MAX_VERTEX_VARYING from
@@ -3909,13 +4189,13 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
 
       /* inputs are always indirect with gs */
       bld.indirect_files |= (1 << TGSI_FILE_INPUT);
-      bld.gs_iface = gs_iface;
+      bld.gs_iface = params->gs_iface;
       bld.bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_gs_input;
       bld.bld_base.op_actions[TGSI_OPCODE_EMIT].emit = emit_vertex;
       bld.bld_base.op_actions[TGSI_OPCODE_ENDPRIM].emit = end_primitive;
 
       max_output_vertices =
-            info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
+         params->info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
       if (!max_output_vertices)
          max_output_vertices = 32;
 
@@ -3926,7 +4206,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
 
    lp_exec_mask_init(&bld.exec_mask, &bld.bld_base.int_bld);
 
-   bld.system_values = *system_values;
+   bld.system_values = *params->system_values;
 
    lp_build_tgsi_llvm(&bld.bld_base, tokens);