gallivm: add base instance sysval support
[mesa.git] src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 5fc47ed155babb973953134807a285f754a931bd..a4606a67615fa7bc7036b733c5dd862f60daf01e 100644
@@ -1,7 +1,7 @@
 /**************************************************************************
  * 
  * Copyright 2009 VMware, Inc.
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2007-2008 VMware, Inc.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -19,7 +19,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_strings.h"
 #include "lp_bld_tgsi_action.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_gather.h"
 #include "lp_bld_init.h"
 #include "lp_bld_logic.h"
+#include "lp_bld_misc.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_coro.h"
 #include "lp_bld_quad.h"
 #include "lp_bld_tgsi.h"
 #include "lp_bld_limits.h"
 
 #define DUMP_GS_EMITS 0
 
-static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
-{
-   LLVMTypeRef int_type = LLVMInt32TypeInContext(bld->gallivm->context);
-   LLVMBuilderRef builder = bld->gallivm->builder;
-
-   mask->bld = bld;
-   mask->has_mask = FALSE;
-   mask->ret_in_main = FALSE;
-   mask->cond_stack_size = 0;
-   mask->loop_stack_size = 0;
-   mask->call_stack_size = 0;
-   mask->switch_stack_size = 0;
-
-   mask->int_vec_type = lp_build_int_vec_type(bld->gallivm, mask->bld->type);
-   mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask =
-         mask->cond_mask = mask->switch_mask =
-         LLVMConstAllOnes(mask->int_vec_type);
-
-   mask->loop_limiter = lp_build_alloca(bld->gallivm, int_type, "looplimiter");
-
-   LLVMBuildStore(
-      builder,
-      LLVMConstInt(int_type, LP_MAX_TGSI_LOOP_ITERATIONS, false),
-      mask->loop_limiter);
-}
-
-static void lp_exec_mask_update(struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-
-   if (mask->loop_stack_size) {
-      /*for loops we need to update the entire mask at runtime */
-      LLVMValueRef tmp;
-      assert(mask->break_mask);
-      tmp = LLVMBuildAnd(builder,
-                         mask->cont_mask,
-                         mask->break_mask,
-                         "maskcb");
-      mask->exec_mask = LLVMBuildAnd(builder,
-                                     mask->cond_mask,
-                                     tmp,
-                                     "maskfull");
-   } else
-      mask->exec_mask = mask->cond_mask;
-
-   if (mask->switch_stack_size) {
-      mask->exec_mask = LLVMBuildAnd(builder,
-                                     mask->exec_mask,
-                                     mask->switch_mask,
-                                     "switchmask");
-   }
-
-   if (mask->call_stack_size || mask->ret_in_main) {
-      mask->exec_mask = LLVMBuildAnd(builder,
-                                     mask->exec_mask,
-                                     mask->ret_mask,
-                                     "callmask");
-   }
-
-   mask->has_mask = (mask->cond_stack_size > 0 ||
-                     mask->loop_stack_size > 0 ||
-                     mask->call_stack_size > 0 ||
-                     mask->switch_stack_size > 0 ||
-                     mask->ret_in_main);
-}
-
-static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
-                                   LLVMValueRef val)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
+/*
+ * If non-zero, the generated LLVM IR will print intermediate results on every TGSI
+ * instruction.
+ *
+ * TODO:
+ * - take execution masks into consideration
+ * - debug control-flow instructions
+ */
+#define DEBUG_EXECUTION 0
 
-   assert(mask->cond_stack_size < LP_MAX_TGSI_NESTING);
-   if (mask->cond_stack_size == 0) {
-      assert(mask->cond_mask == LLVMConstAllOnes(mask->int_vec_type));
-   }
-   mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask;
-   assert(LLVMTypeOf(val) == mask->int_vec_type);
-   mask->cond_mask = LLVMBuildAnd(builder,
-                                  mask->cond_mask,
-                                  val,
-                                  "");
-   lp_exec_mask_update(mask);
-}
 
-static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask)
+/*
+ * Emit code to print a register value.
+ */
+static void
+emit_dump_reg(struct gallivm_state *gallivm,
+              unsigned file,
+              unsigned index,
+              unsigned chan,
+              LLVMValueRef value)
 {
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   LLVMValueRef prev_mask;
-   LLVMValueRef inv_mask;
-
-   assert(mask->cond_stack_size);
-   prev_mask = mask->cond_stack[mask->cond_stack_size - 1];
-   if (mask->cond_stack_size == 1) {
-      assert(prev_mask == LLVMConstAllOnes(mask->int_vec_type));
-   }
+   char buf[32];
 
-   inv_mask = LLVMBuildNot(builder, mask->cond_mask, "");
+   snprintf(buf, sizeof buf, "    %s[%u].%c = ",
+            tgsi_file_name(file),
+            index, "xyzw"[chan]);
 
-   mask->cond_mask = LLVMBuildAnd(builder,
-                                  inv_mask,
-                                  prev_mask, "");
-   lp_exec_mask_update(mask);
+   lp_build_print_value(gallivm, buf, value);
 }
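
For illustration only (an assumption, not part of this patch): with DEBUG_EXECUTION set to a non-zero value, the translation would call emit_dump_reg for each written channel, which makes the generated IR print the value at run time. The variables in the sketch ('gallivm', 'value') are assumed to be in scope at such a call site.

   if (DEBUG_EXECUTION) {
      /* assumed example: the current instruction just wrote 'value' to TEMP[1].x */
      emit_dump_reg(gallivm, TGSI_FILE_TEMPORARY, 1, 0 /* chan .x */, value);
   }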
 
-static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
+static inline struct function_ctx *
+func_ctx(struct lp_exec_mask *mask)
 {
-   assert(mask->cond_stack_size);
-   mask->cond_mask = mask->cond_stack[--mask->cond_stack_size];
-   lp_exec_mask_update(mask);
+   assert(mask->function_stack_size > 0);
+   assert(mask->function_stack_size <= LP_MAX_NUM_FUNCS);
+   return &mask->function_stack[mask->function_stack_size - 1];
 }
 
-static void lp_exec_bgnloop(struct lp_exec_mask *mask)
+/*
+ * Combine the execution mask, if there is one, with the current mask.
+ */
+static LLVMValueRef
+mask_vec(struct lp_build_tgsi_context *bld_base)
 {
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-
-   if (mask->loop_stack_size == 0) {
-      assert(mask->loop_block == NULL);
-      assert(mask->cont_mask == LLVMConstAllOnes(mask->int_vec_type));
-      assert(mask->break_mask == LLVMConstAllOnes(mask->int_vec_type));
-      assert(mask->break_var == NULL);
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   struct lp_exec_mask *exec_mask = &bld->exec_mask;
+   LLVMValueRef bld_mask = bld->mask ? lp_build_mask_value(bld->mask) : NULL;
+   if (!exec_mask->has_mask) {
+      return bld_mask;
    }
-
-   assert(mask->loop_stack_size < LP_MAX_TGSI_NESTING);
-
-   mask->break_type_stack[mask->loop_stack_size + mask->switch_stack_size] =
-      mask->break_type;
-   mask->break_type = LP_EXEC_MASK_BREAK_TYPE_LOOP;
-
-   mask->loop_stack[mask->loop_stack_size].loop_block = mask->loop_block;
-   mask->loop_stack[mask->loop_stack_size].cont_mask = mask->cont_mask;
-   mask->loop_stack[mask->loop_stack_size].break_mask = mask->break_mask;
-   mask->loop_stack[mask->loop_stack_size].break_var = mask->break_var;
-   ++mask->loop_stack_size;
-
-   mask->break_var = lp_build_alloca(mask->bld->gallivm, mask->int_vec_type, "");
-   LLVMBuildStore(builder, mask->break_mask, mask->break_var);
-
-   mask->loop_block = lp_build_insert_new_block(mask->bld->gallivm, "bgnloop");
-
-   LLVMBuildBr(builder, mask->loop_block);
-   LLVMPositionBuilderAtEnd(builder, mask->loop_block);
-
-   mask->break_mask = LLVMBuildLoad(builder, mask->break_var, "");
-
-   lp_exec_mask_update(mask);
+   if (!bld_mask)
+      return exec_mask->exec_mask;
+   return LLVMBuildAnd(builder, lp_build_mask_value(bld->mask),
+                       exec_mask->exec_mask, "");
 }
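
A minimal sketch of how the combined mask is typically consumed (illustration under assumed variables 'value', 'ptr', 'builder' and 'bld'; not code from this patch): a masked store loads the previous contents, selects per lane, and stores the merged vector back.

   LLVMValueRef mask = mask_vec(bld_base);
   if (mask) {
      /* keep old lanes where the mask is 0, take new lanes where it is ~0 */
      LLVMValueRef current = LLVMBuildLoad(builder, ptr, "");
      LLVMValueRef merged = lp_build_select(&bld->bld_base.base, mask,
                                            value, current);
      LLVMBuildStore(builder, merged, ptr);
   } else {
      LLVMBuildStore(builder, value, ptr);
   }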
 
-static void lp_exec_break(struct lp_exec_mask *mask,
+static void lp_exec_tgsi_break(struct lp_exec_mask *mask,
                           struct lp_build_tgsi_context * bld_base)
 {
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-
-   if (mask->break_type == LP_EXEC_MASK_BREAK_TYPE_LOOP) {
-      LLVMValueRef exec_mask = LLVMBuildNot(builder,
-                                            mask->exec_mask,
-                                            "break");
-
-      mask->break_mask = LLVMBuildAnd(builder,
-                                      mask->break_mask,
-                                      exec_mask, "break_full");
-   }
-   else {
-      unsigned opcode = bld_base->instructions[bld_base->pc + 1].Instruction.Opcode;
-      boolean break_always = (opcode == TGSI_OPCODE_ENDSWITCH ||
-                              opcode == TGSI_OPCODE_CASE);
-
-
-      if (mask->switch_in_default) {
-         /*
-          * stop default execution but only if this is an unconditional switch.
-          * (The condition here is not perfect since dead code after break is
-          * allowed but should be sufficient since false negatives are just
-          * unoptimized - so we don't have to pre-evaluate that).
-          */
-         if(break_always && mask->switch_pc) {
-            bld_base->pc = mask->switch_pc;
-            return;
-         }
-      }
-
-      if (break_always) {
-         mask->switch_mask = LLVMConstNull(mask->bld->int_vec_type);
-      }
-      else {
-         LLVMValueRef exec_mask = LLVMBuildNot(builder,
-                                               mask->exec_mask,
-                                               "break");
-         mask->switch_mask = LLVMBuildAnd(builder,
-                                          mask->switch_mask,
-                                          exec_mask, "break_switch");
-      }
-   }
-
-   lp_exec_mask_update(mask);
+   enum tgsi_opcode opcode =
+      bld_base->instructions[bld_base->pc + 1].Instruction.Opcode;
+   bool break_always = (opcode == TGSI_OPCODE_ENDSWITCH ||
+                        opcode == TGSI_OPCODE_CASE);
+   lp_exec_break(mask, &bld_base->pc, break_always);
 }
 
-static void lp_exec_break_condition(struct lp_exec_mask *mask,
-                                    LLVMValueRef cond)
+static void lp_exec_switch(struct lp_exec_mask *mask,
+                           LLVMValueRef switchval)
 {
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   LLVMValueRef cond_mask = LLVMBuildAnd(builder,
-                                         mask->exec_mask,
-                                         cond, "cond_mask");
-   cond_mask = LLVMBuildNot(builder, cond_mask, "break_cond");
+   struct function_ctx *ctx = func_ctx(mask);
 
-   if (mask->break_type == LP_EXEC_MASK_BREAK_TYPE_LOOP) {
-      mask->break_mask = LLVMBuildAnd(builder,
-                                      mask->break_mask,
-                                      cond_mask, "breakc_full");
-   }
-   else {
-      mask->switch_mask = LLVMBuildAnd(builder,
-                                       mask->switch_mask,
-                                       cond_mask, "breakc_switch");
+   if (ctx->switch_stack_size >= LP_MAX_TGSI_NESTING ||
+       ctx->loop_stack_size > LP_MAX_TGSI_NESTING) {
+      ctx->switch_stack_size++;
+      return;
    }
 
-   lp_exec_mask_update(mask);
-}
-
-static void lp_exec_continue(struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   LLVMValueRef exec_mask = LLVMBuildNot(builder,
-                                         mask->exec_mask,
-                                         "");
-
-   mask->cont_mask = LLVMBuildAnd(builder,
-                                  mask->cont_mask,
-                                  exec_mask, "");
-
-   lp_exec_mask_update(mask);
-}
-
-
-static void lp_exec_endloop(struct gallivm_state *gallivm,
-                            struct lp_exec_mask *mask)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   LLVMBasicBlockRef endloop;
-   LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context);
-   LLVMTypeRef reg_type = LLVMIntTypeInContext(gallivm->context,
-                                               mask->bld->type.width *
-                                               mask->bld->type.length);
-   LLVMValueRef i1cond, i2cond, icond, limiter;
-
-   assert(mask->break_mask);
-
-   /*
-    * Restore the cont_mask, but don't pop
-    */
-   assert(mask->loop_stack_size);
-   mask->cont_mask = mask->loop_stack[mask->loop_stack_size - 1].cont_mask;
-   lp_exec_mask_update(mask);
-
-   /*
-    * Unlike the continue mask, the break_mask must be preserved across loop
-    * iterations
-    */
-   LLVMBuildStore(builder, mask->break_mask, mask->break_var);
-
-   /* Decrement the loop limiter */
-   limiter = LLVMBuildLoad(builder, mask->loop_limiter, "");
-
-   limiter = LLVMBuildSub(
-      builder,
-      limiter,
-      LLVMConstInt(int_type, 1, false),
-      "");
-
-   LLVMBuildStore(builder, limiter, mask->loop_limiter);
+   ctx->break_type_stack[ctx->loop_stack_size + ctx->switch_stack_size] =
+      ctx->break_type;
+   ctx->break_type = LP_EXEC_MASK_BREAK_TYPE_SWITCH;
 
-   /* i1cond = (mask != 0) */
-   i1cond = LLVMBuildICmp(
-      builder,
-      LLVMIntNE,
-      LLVMBuildBitCast(builder, mask->exec_mask, reg_type, ""),
-      LLVMConstNull(reg_type), "i1cond");
+   ctx->switch_stack[ctx->switch_stack_size].switch_mask = mask->switch_mask;
+   ctx->switch_stack[ctx->switch_stack_size].switch_val = ctx->switch_val;
+   ctx->switch_stack[ctx->switch_stack_size].switch_mask_default = ctx->switch_mask_default;
+   ctx->switch_stack[ctx->switch_stack_size].switch_in_default = ctx->switch_in_default;
+   ctx->switch_stack[ctx->switch_stack_size].switch_pc = ctx->switch_pc;
+   ctx->switch_stack_size++;
 
-   /* i2cond = (looplimiter > 0) */
-   i2cond = LLVMBuildICmp(
-      builder,
-      LLVMIntSGT,
-      limiter,
-      LLVMConstNull(int_type), "i2cond");
-
-   /* if( i1cond && i2cond ) */
-   icond = LLVMBuildAnd(builder, i1cond, i2cond, "");
-
-   endloop = lp_build_insert_new_block(mask->bld->gallivm, "endloop");
-
-   LLVMBuildCondBr(builder,
-                   icond, mask->loop_block, endloop);
-
-   LLVMPositionBuilderAtEnd(builder, endloop);
-
-   assert(mask->loop_stack_size);
-   --mask->loop_stack_size;
-   mask->loop_block = mask->loop_stack[mask->loop_stack_size].loop_block;
-   mask->cont_mask = mask->loop_stack[mask->loop_stack_size].cont_mask;
-   mask->break_mask = mask->loop_stack[mask->loop_stack_size].break_mask;
-   mask->break_var = mask->loop_stack[mask->loop_stack_size].break_var;
-   mask->break_type = mask->break_type_stack[mask->loop_stack_size + mask->switch_stack_size];
-
-   lp_exec_mask_update(mask);
-}
-
-static void lp_exec_switch(struct lp_exec_mask *mask,
-                           LLVMValueRef switchval)
-{
-   mask->break_type_stack[mask->loop_stack_size + mask->switch_stack_size] =
-      mask->break_type;
-   mask->break_type = LP_EXEC_MASK_BREAK_TYPE_SWITCH;
-
-   mask->switch_stack[mask->switch_stack_size].switch_val = mask->switch_val;
-   mask->switch_stack[mask->switch_stack_size].switch_mask = mask->switch_mask;
-   mask->switch_stack[mask->switch_stack_size].switch_mask_default = mask->switch_mask_default;
-   mask->switch_stack[mask->switch_stack_size].switch_in_default = mask->switch_in_default;
-   mask->switch_stack[mask->switch_stack_size].switch_pc = mask->switch_pc;
-   mask->switch_stack_size++;
-
-   mask->switch_val = switchval;
    mask->switch_mask = LLVMConstNull(mask->int_vec_type);
-   mask->switch_mask_default = LLVMConstNull(mask->int_vec_type);
-   mask->switch_in_default = false;
-   mask->switch_pc = 0;
+   ctx->switch_val = switchval;
+   ctx->switch_mask_default = LLVMConstNull(mask->int_vec_type);
+   ctx->switch_in_default = false;
+   ctx->switch_pc = 0;
 
    lp_exec_mask_update(mask);
 }
@@ -400,44 +173,50 @@ static void lp_exec_endswitch(struct lp_exec_mask *mask,
                               struct lp_build_tgsi_context * bld_base)
 {
    LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
+
+   if (ctx->switch_stack_size > LP_MAX_TGSI_NESTING) {
+      ctx->switch_stack_size--;
+      return;
+   }
 
    /* check if there's deferred default if so do it now */
-   if (mask->switch_pc && !mask->switch_in_default) {
+   if (ctx->switch_pc && !ctx->switch_in_default) {
       LLVMValueRef prevmask, defaultmask;
       unsigned tmp_pc;
-      prevmask = mask->switch_stack[mask->switch_stack_size - 1].switch_mask;
-      defaultmask = LLVMBuildNot(builder, mask->switch_mask_default, "sw_default_mask");
+      prevmask = ctx->switch_stack[ctx->switch_stack_size - 1].switch_mask;
+      defaultmask = LLVMBuildNot(builder, ctx->switch_mask_default, "sw_default_mask");
       mask->switch_mask = LLVMBuildAnd(builder, prevmask, defaultmask, "sw_mask");
-      mask->switch_in_default = true;
+      ctx->switch_in_default = true;
 
       lp_exec_mask_update(mask);
 
-      assert(bld_base->instructions[mask->switch_pc - 1].Instruction.Opcode ==
+      assert(bld_base->instructions[ctx->switch_pc - 1].Instruction.Opcode ==
              TGSI_OPCODE_DEFAULT);
 
       tmp_pc = bld_base->pc;
-      bld_base->pc = mask->switch_pc;
+      bld_base->pc = ctx->switch_pc;
       /*
        * re-purpose switch_pc to point to here again, since we stop execution of
        * the deferred default after next break.
        */
-      mask->switch_pc = tmp_pc - 1;
+      ctx->switch_pc = tmp_pc - 1;
 
       return;
    }
 
-   else if (mask->switch_pc && mask->switch_in_default) {
-      assert(bld_base->pc == mask->switch_pc + 1);
+   else if (ctx->switch_pc && ctx->switch_in_default) {
+      assert(bld_base->pc == ctx->switch_pc + 1);
    }
 
-   mask->switch_stack_size--;
-   mask->switch_val = mask->switch_stack[mask->switch_stack_size].switch_val;
-   mask->switch_mask = mask->switch_stack[mask->switch_stack_size].switch_mask;
-   mask->switch_mask_default = mask->switch_stack[mask->switch_stack_size].switch_mask_default;
-   mask->switch_in_default = mask->switch_stack[mask->switch_stack_size].switch_in_default;
-   mask->switch_pc = mask->switch_stack[mask->switch_stack_size].switch_pc;
+   ctx->switch_stack_size--;
+   mask->switch_mask = ctx->switch_stack[ctx->switch_stack_size].switch_mask;
+   ctx->switch_val = ctx->switch_stack[ctx->switch_stack_size].switch_val;
+   ctx->switch_mask_default = ctx->switch_stack[ctx->switch_stack_size].switch_mask_default;
+   ctx->switch_in_default = ctx->switch_stack[ctx->switch_stack_size].switch_in_default;
+   ctx->switch_pc = ctx->switch_stack[ctx->switch_stack_size].switch_pc;
 
-   mask->break_type = mask->break_type_stack[mask->loop_stack_size + mask->switch_stack_size];
+   ctx->break_type = ctx->break_type_stack[ctx->loop_stack_size + ctx->switch_stack_size];
 
    lp_exec_mask_update(mask);
 }
@@ -446,15 +225,20 @@ static void lp_exec_case(struct lp_exec_mask *mask,
                          LLVMValueRef caseval)
 {
    LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
 
    LLVMValueRef casemask, prevmask;
 
+   if (ctx->switch_stack_size > LP_MAX_TGSI_NESTING) {
+      return;
+   }
+
    /* skipping case mask evaluation here is NOT optional (not in all cases anyway). */
-   if (!mask->switch_in_default) {
-      prevmask = mask->switch_stack[mask->switch_stack_size - 1].switch_mask;
-      casemask = lp_build_cmp(mask->bld, PIPE_FUNC_EQUAL, caseval, mask->switch_val);
-      mask->switch_mask_default = LLVMBuildOr(builder, casemask,
-                                              mask->switch_mask_default, "sw_default_mask");
+   if (!ctx->switch_in_default) {
+      prevmask = ctx->switch_stack[ctx->switch_stack_size - 1].switch_mask;
+      casemask = lp_build_cmp(mask->bld, PIPE_FUNC_EQUAL, caseval, ctx->switch_val);
+      ctx->switch_mask_default = LLVMBuildOr(builder, casemask,
+                                             ctx->switch_mask_default, "sw_default_mask");
       casemask = LLVMBuildOr(builder, casemask, mask->switch_mask, "");
       mask->switch_mask = LLVMBuildAnd(builder, casemask, prevmask, "sw_mask");
 
@@ -474,18 +258,23 @@ static boolean default_analyse_is_last(struct lp_exec_mask *mask,
                                        int *default_pc_start)
 {
    unsigned pc = bld_base->pc;
-   unsigned curr_switch_stack = mask->switch_stack_size;
+   struct function_ctx *ctx = func_ctx(mask);
+   int curr_switch_stack = ctx->switch_stack_size;
+
+   if (ctx->switch_stack_size > LP_MAX_TGSI_NESTING) {
+      return false;
+   }
 
    /* skip over case statements which are together with default */
    while (bld_base->instructions[pc].Instruction.Opcode == TGSI_OPCODE_CASE) {
       pc++;
    }
 
-   while (pc != -1 && pc < bld_base->num_instructions) {
-      unsigned opcode = bld_base->instructions[pc].Instruction.Opcode;
+   while (pc != ~0u && pc < bld_base->num_instructions) {
+      enum tgsi_opcode opcode = bld_base->instructions[pc].Instruction.Opcode;
       switch (opcode) {
       case TGSI_OPCODE_CASE:
-         if (curr_switch_stack == mask->switch_stack_size) {
+         if (curr_switch_stack == ctx->switch_stack_size) {
             *default_pc_start = pc - 1;
             return false;
          }
@@ -494,12 +283,14 @@ static boolean default_analyse_is_last(struct lp_exec_mask *mask,
          curr_switch_stack++;
          break;
       case TGSI_OPCODE_ENDSWITCH:
-         if (curr_switch_stack == mask->switch_stack_size) {
+         if (curr_switch_stack == ctx->switch_stack_size) {
             *default_pc_start = pc - 1;
             return true;
          }
          curr_switch_stack--;
          break;
+      default:
+         ; /* nothing */
       }
       pc++;
    }
@@ -512,10 +303,15 @@ static void lp_exec_default(struct lp_exec_mask *mask,
                             struct lp_build_tgsi_context * bld_base)
 {
    LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
 
    int default_exec_pc;
    boolean default_is_last;
 
+   if (ctx->switch_stack_size > LP_MAX_TGSI_NESTING) {
+      return;
+   }
+
    /*
    * This is a messy opcode, because it may not always be at the end and
     * there can be fallthrough in and out of it.
@@ -530,11 +326,11 @@ static void lp_exec_default(struct lp_exec_mask *mask,
     */
    if (default_is_last) {
       LLVMValueRef prevmask, defaultmask;
-      prevmask = mask->switch_stack[mask->switch_stack_size - 1].switch_mask;
-      defaultmask = LLVMBuildNot(builder, mask->switch_mask_default, "sw_default_mask");
+      prevmask = ctx->switch_stack[ctx->switch_stack_size - 1].switch_mask;
+      defaultmask = LLVMBuildNot(builder, ctx->switch_mask_default, "sw_default_mask");
       defaultmask = LLVMBuildOr(builder, defaultmask, mask->switch_mask, "");
       mask->switch_mask = LLVMBuildAnd(builder, prevmask, defaultmask, "sw_mask");
-      mask->switch_in_default = true;
+      ctx->switch_in_default = true;
 
       lp_exec_mask_update(mask);
    }
@@ -547,8 +343,9 @@ static void lp_exec_default(struct lp_exec_mask *mask,
        * which just gets rid of all case statements appearing together with
        * default (or could do switch analysis at switch start time instead).
        */
-      unsigned opcode = bld_base->instructions[bld_base->pc - 1].Instruction.Opcode;
-      boolean ft_into = (opcode != TGSI_OPCODE_BRK ||
+      enum tgsi_opcode opcode =
+         bld_base->instructions[bld_base->pc - 1].Instruction.Opcode;
+      boolean ft_into = (opcode != TGSI_OPCODE_BRK &&
                          opcode != TGSI_OPCODE_SWITCH);
       /*
        * If it is not last statement and there was no fallthrough into it,
@@ -561,7 +358,7 @@ static void lp_exec_default(struct lp_exec_mask *mask,
        * do the same as with the former case, except instead of skipping the code
        * just execute it without updating the mask, then go back and re-execute.
        */
-      mask->switch_pc = bld_base->pc;
+      ctx->switch_pc = bld_base->pc;
       if (!ft_into) {
          bld_base->pc = default_exec_pc;
       }
@@ -569,68 +366,37 @@ static void lp_exec_default(struct lp_exec_mask *mask,
 }
 
 
-/* stores val into an address pointed to by dst_ptr.
- * mask->exec_mask is used to figure out which bits of val
- * should be stored into the address
- * (0 means don't store this bit, 1 means do store).
- */
-static void lp_exec_mask_store(struct lp_exec_mask *mask,
-                               struct lp_build_context *bld_store,
-                               LLVMValueRef pred,
-                               LLVMValueRef val,
-                               LLVMValueRef dst_ptr)
-{
-   LLVMBuilderRef builder = mask->bld->gallivm->builder;
-
-   assert(lp_check_value(bld_store->type, val));
-   assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind);
-   assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val));
-
-   /* Mix the predicate and execution mask */
-   if (mask->has_mask) {
-      if (pred) {
-         pred = LLVMBuildAnd(builder, pred, mask->exec_mask, "");
-      } else {
-         pred = mask->exec_mask;
-      }
-   }
-
-   if (pred) {
-      LLVMValueRef res, dst;
-
-      dst = LLVMBuildLoad(builder, dst_ptr, "");
-      res = lp_build_select(bld_store, pred, val, dst);
-      LLVMBuildStore(builder, res, dst_ptr);
-   } else
-      LLVMBuildStore(builder, val, dst_ptr);
-}
-
 static void lp_exec_mask_call(struct lp_exec_mask *mask,
                               int func,
                               int *pc)
 {
-   assert(mask->call_stack_size < LP_MAX_TGSI_NESTING);
-   mask->call_stack[mask->call_stack_size].pc = *pc;
-   mask->call_stack[mask->call_stack_size].ret_mask = mask->ret_mask;
-   mask->call_stack_size++;
+   if (mask->function_stack_size >= LP_MAX_NUM_FUNCS) {
+      return;
+   }
+
+   lp_exec_mask_function_init(mask, mask->function_stack_size);
+   mask->function_stack[mask->function_stack_size].pc = *pc;
+   mask->function_stack[mask->function_stack_size].ret_mask = mask->ret_mask;
+   mask->function_stack_size++;
    *pc = func;
 }
 
 static void lp_exec_mask_ret(struct lp_exec_mask *mask, int *pc)
 {
    LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
    LLVMValueRef exec_mask;
 
-   if (mask->cond_stack_size == 0 &&
-       mask->loop_stack_size == 0 &&
-       mask->switch_stack_size == 0 &&
-       mask->call_stack_size == 0) {
+   if (ctx->cond_stack_size == 0 &&
+       ctx->loop_stack_size == 0 &&
+       ctx->switch_stack_size == 0 &&
+       mask->function_stack_size == 1) {
       /* returning from main() */
       *pc = -1;
       return;
    }
 
-   if (mask->call_stack_size == 0) {
+   if (mask->function_stack_size == 1) {
       /*
        * This requires special handling since we need to ensure
        * we don't drop the mask even if we have no call stack
@@ -656,14 +422,65 @@ static void lp_exec_mask_bgnsub(struct lp_exec_mask *mask)
 
 static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc)
 {
-   assert(mask->call_stack_size);
-   mask->call_stack_size--;
-   *pc = mask->call_stack[mask->call_stack_size].pc;
-   mask->ret_mask = mask->call_stack[mask->call_stack_size].ret_mask;
+   struct function_ctx *ctx;
+
+   assert(mask->function_stack_size > 1);
+   assert(mask->function_stack_size <= LP_MAX_NUM_FUNCS);
+
+   ctx = func_ctx(mask);
+   mask->function_stack_size--;
+
+   *pc = ctx->pc;
+   mask->ret_mask = ctx->ret_mask;
+
    lp_exec_mask_update(mask);
 }
 
 
+static LLVMValueRef
+get_file_ptr(struct lp_build_tgsi_soa_context *bld,
+             unsigned file,
+             int index,
+             unsigned chan)
+{
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   LLVMValueRef (*array_of_vars)[TGSI_NUM_CHANNELS];
+   LLVMValueRef var_of_array;
+
+   switch (file) {
+   case TGSI_FILE_TEMPORARY:
+      array_of_vars = bld->temps;
+      var_of_array = bld->temps_array;
+      break;
+   case TGSI_FILE_OUTPUT:
+      array_of_vars = bld->outputs;
+      var_of_array = bld->outputs_array;
+      break;
+   default:
+      assert(0);
+      return NULL;
+   }
+
+   assert(chan < 4);
+
+   if (bld->indirect_files & (1 << file)) {
+      LLVMValueRef lindex = lp_build_const_int32(bld->bld_base.base.gallivm, index * 4 + chan);
+      if (LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(var_of_array))) == LLVMArrayTypeKind) {
+         LLVMValueRef gep[2];
+         gep[0] = lp_build_const_int32(bld->bld_base.base.gallivm, 0);
+         gep[1] = lindex;
+         return LLVMBuildGEP(builder, var_of_array, gep, 2, "");
+      } else {
+         return LLVMBuildGEP(builder, var_of_array, &lindex, 1, "");
+      }
+   }
+   else {
+      assert(index <= bld->bld_base.info->file_max[file]);
+      return array_of_vars[index][chan];
+   }
+}
+
+
 /**
  * Return pointer to a temporary register channel (src or dest).
  * Note that indirect addressing cannot be handled here.
@@ -675,15 +492,7 @@ lp_get_temp_ptr_soa(struct lp_build_tgsi_soa_context *bld,
              unsigned index,
              unsigned chan)
 {
-   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   assert(chan < 4);
-   if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
-      LLVMValueRef lindex = lp_build_const_int32(bld->bld_base.base.gallivm, index * 4 + chan);
-      return LLVMBuildGEP(builder, bld->temps_array, &lindex, 1, "");
-   }
-   else {
-      return bld->temps[index][chan];
-   }
+   return get_file_ptr(bld, TGSI_FILE_TEMPORARY, index, chan);
 }
 
 /**
@@ -697,16 +506,7 @@ lp_get_output_ptr(struct lp_build_tgsi_soa_context *bld,
                unsigned index,
                unsigned chan)
 {
-   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   assert(chan < 4);
-   if (bld->indirect_files & (1 << TGSI_FILE_OUTPUT)) {
-      LLVMValueRef lindex = lp_build_const_int32(bld->bld_base.base.gallivm,
-                                                 index * 4 + chan);
-      return LLVMBuildGEP(builder, bld->outputs_array, &lindex, 1, "");
-   }
-   else {
-      return bld->outputs[index][chan];
-   }
+   return get_file_ptr(bld, TGSI_FILE_OUTPUT, index, chan);
 }
 
 /*
@@ -735,26 +535,84 @@ gather_outputs(struct lp_build_tgsi_soa_context * bld)
  * with a little work.
  */
 static LLVMValueRef
-build_gather(struct lp_build_context *bld,
+build_gather(struct lp_build_tgsi_context *bld_base,
              LLVMValueRef base_ptr,
-             LLVMValueRef indexes)
+             LLVMValueRef indexes,
+             LLVMValueRef overflow_mask,
+             LLVMValueRef indexes2)
 {
-   LLVMBuilderRef builder = bld->gallivm->builder;
-   LLVMValueRef res = bld->undef;
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   struct lp_build_context *bld = &bld_base->base;
+   LLVMValueRef res;
    unsigned i;
 
+   if (indexes2)
+      res = LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2));
+   else
+      res = bld->undef;
+   /*
+    * overflow_mask is a vector telling us which channels
+    * in the vector overflowed. We use the overflow behavior for
+    * constant buffers which is defined as:
+    * Out of bounds access to constant buffer returns 0 in all
+    * components. Out of bounds behavior is always with respect
+    * to the size of the buffer bound at that slot.
+    */
+
+   if (overflow_mask) {
+      /*
+       * We avoid per-element control flow here (also due to llvm going crazy,
+       * though I suspect it's better anyway since overflow is likely rare).
+       * Note that since we still fetch from buffers even if num_elements was
+       * zero (in this case we'll fetch from index zero) the jit func callers
+       * MUST provide valid fake constant buffers of size 4x32 (the values do
+       * not matter), otherwise we'd still need (not per element though)
+       * control flow.
+       */
+      indexes = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes);
+      if (indexes2)
+         indexes2 = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes2);
+   }
+
    /*
     * Loop over elements of index_vec, load scalar value, insert it into 'res'.
     */
-   for (i = 0; i < bld->type.length; i++) {
-      LLVMValueRef ii = lp_build_const_int32(bld->gallivm, i);
-      LLVMValueRef index = LLVMBuildExtractElement(builder,
-                                                   indexes, ii, "");
-      LLVMValueRef scalar_ptr = LLVMBuildGEP(builder, base_ptr,
-                                             &index, 1, "gather_ptr");
-      LLVMValueRef scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+   for (i = 0; i < bld->type.length * (indexes2 ? 2 : 1); i++) {
+      LLVMValueRef si, di;
+      LLVMValueRef index;
+      LLVMValueRef scalar_ptr, scalar;
+
+      di = lp_build_const_int32(bld->gallivm, i);
+      if (indexes2)
+         si = lp_build_const_int32(bld->gallivm, i >> 1);
+      else
+         si = di;
+
+      if (indexes2 && (i & 1)) {
+         index = LLVMBuildExtractElement(builder,
+                                         indexes2, si, "");
+      } else {
+         index = LLVMBuildExtractElement(builder,
+                                         indexes, si, "");
+      }
+      scalar_ptr = LLVMBuildGEP(builder, base_ptr,
+                                &index, 1, "gather_ptr");
+      scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+
+      res = LLVMBuildInsertElement(builder, res, scalar, di, "");
+   }
 
-      res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
+   if (overflow_mask) {
+      if (indexes2) {
+         res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
+         overflow_mask = LLVMBuildSExt(builder, overflow_mask,
+                                       bld_base->dbl_bld.int_vec_type, "");
+         res = lp_build_select(&bld_base->dbl_bld, overflow_mask,
+                               bld_base->dbl_bld.zero, res);
+      } else
+         res = lp_build_select(bld, overflow_mask, bld->zero, res);
    }
 
    return res;
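
A scalar model of the overflow handling in build_gather may help (illustration only; the real code works on LLVM vectors, and the names 'length', 'overflow', 'index' are assumptions): out-of-bounds lanes are redirected to element 0 so the load stays valid, and their results are zeroed afterwards.

   for (i = 0; i < length; i++) {
      unsigned idx = overflow[i] ? 0 : index[i];  /* keep the load in bounds */
      res[i] = base_ptr[idx];
      if (overflow[i])
         res[i] = 0.0f;   /* OOB constant buffer reads return 0 */
   }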
@@ -769,22 +627,12 @@ emit_mask_scatter(struct lp_build_tgsi_soa_context *bld,
                   LLVMValueRef base_ptr,
                   LLVMValueRef indexes,
                   LLVMValueRef values,
-                  struct lp_exec_mask *mask,
-                  LLVMValueRef pred)
+                  struct lp_exec_mask *mask)
 {
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    unsigned i;
-
-   /* Mix the predicate and execution mask */
-   if (mask->has_mask) {
-      if (pred) {
-         pred = LLVMBuildAnd(builder, pred, mask->exec_mask, "");
-      }
-      else {
-         pred = mask->exec_mask;
-      }
-   }
+   LLVMValueRef pred = mask->has_mask ? mask->exec_mask : NULL;
 
    /*
     * Loop over elements of index_vec, store scalar value.
@@ -823,7 +671,8 @@ emit_mask_scatter(struct lp_build_tgsi_soa_context *bld,
 static LLVMValueRef
 get_indirect_index(struct lp_build_tgsi_soa_context *bld,
                    unsigned reg_file, unsigned reg_index,
-                   const struct tgsi_ind_register *indirect_reg)
+                   const struct tgsi_ind_register *indirect_reg,
+                   int index_limit)
 {
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
    struct lp_build_context *uint_bld = &bld->bld_base.uint_bld;
@@ -860,12 +709,23 @@ get_indirect_index(struct lp_build_tgsi_soa_context *bld,
 
    index = lp_build_add(uint_bld, base, rel);
 
-   max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm,
-                                      uint_bld->type,
-                                      bld->bld_base.info->file_max[reg_file]);
+   /*
+    * emit_fetch_constant handles constant buffer overflow so this code
+    * is pointless for them.
+    * Furthermore the D3D10 spec in section 6.5 says:
+    * If the constant buffer bound to a slot is larger than the size
+    * declared in the shader for that slot, implementations are allowed
+    * to return incorrect data (not necessarily 0) for indices that are
+    * larger than the declared size but smaller than the buffer size.
+    */
+   if (reg_file != TGSI_FILE_CONSTANT) {
+      assert(index_limit >= 0);
+      max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm,
+                                         uint_bld->type, index_limit);
 
-   assert(!uint_bld->type.sign);
-   index = lp_build_min(uint_bld, index, max_index);
+      assert(!uint_bld->type.sign);
+      index = lp_build_min(uint_bld, index, max_index);
+   }
 
    return index;
 }
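
As a worked example of the clamp (assumed numbers, not from the patch): for TEMP[3 + ADDR[0].x] in a shader declaring 8 temporaries, the caller passes index_limit == 7 and each lane ends up as min(3 + addr_lane, 7). A typical call, mirroring how the temporary-register fetch path uses it:

   /* hypothetical caller, clamping against the declared TEMP range */
   index = get_indirect_index(bld, TGSI_FILE_TEMPORARY, reg->Register.Index,
                              &reg->Indirect,
                              bld->bld_base.info->file_max[TGSI_FILE_TEMPORARY]);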
@@ -887,8 +747,16 @@ stype_to_fetch(struct lp_build_tgsi_context * bld_base,
    case TGSI_TYPE_SIGNED:
       bld_fetch = &bld_base->int_bld;
       break;
-   case TGSI_TYPE_VOID:
    case TGSI_TYPE_DOUBLE:
+      bld_fetch = &bld_base->dbl_bld;
+      break;
+   case TGSI_TYPE_UNSIGNED64:
+      bld_fetch = &bld_base->uint64_bld;
+      break;
+   case TGSI_TYPE_SIGNED64:
+      bld_fetch = &bld_base->int64_bld;
+      break;
+   case TGSI_TYPE_VOID:
    default:
       assert(0);
       bld_fetch = NULL;
@@ -917,7 +785,7 @@ get_soa_array_offsets(struct lp_build_context *uint_bld,
 
    if (need_perelement_offset) {
       LLVMValueRef pixel_offsets;
-      int i;
+      unsigned i;
      /* build pixel offset vector: {0, 1, 2, 3, ...} */
       pixel_offsets = uint_bld->undef;
       for (i = 0; i < uint_bld->type.length; i++) {
@@ -935,19 +803,20 @@ emit_fetch_constant(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
    unsigned dimension = 0;
-   LLVMValueRef dimension_index;
    LLVMValueRef consts_ptr;
+   LLVMValueRef num_consts;
    LLVMValueRef res;
+   unsigned swizzle = swizzle_in & 0xffff;
 
    /* XXX: Handle fetching xyzw components as a vector */
-   assert(swizzle != ~0);
+   assert(swizzle != ~0u);
 
    if (reg->Register.Dimension) {
       assert(!reg->Dimension.Indirect);
@@ -955,94 +824,208 @@ emit_fetch_constant(
       assert(dimension < LP_MAX_TGSI_CONST_BUFFERS);
    }
 
-   dimension_index = lp_build_const_int32(gallivm, dimension);
-   consts_ptr = lp_build_array_get(gallivm, bld->consts_ptr, dimension_index);
+   consts_ptr = bld->consts[dimension];
+   num_consts = bld->consts_sizes[dimension];
 
    if (reg->Register.Indirect) {
       LLVMValueRef indirect_index;
       LLVMValueRef swizzle_vec =
          lp_build_const_int_vec(gallivm, uint_bld->type, swizzle);
       LLVMValueRef index_vec;  /* index into the const buffer */
+      LLVMValueRef overflow_mask;
+      LLVMValueRef index_vec2 = NULL;
 
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
                                           reg->Register.Index,
-                                          &reg->Indirect);
+                                          &reg->Indirect,
+                                          bld->bld_base.info->file_max[reg->Register.File]);
+
+      /* All fetches are from the same constant buffer, so
+       * we need to propagate the size to a vector to do a
+       * vector comparison */
+      num_consts = lp_build_broadcast_scalar(uint_bld, num_consts);
+      /* Construct a boolean vector telling us which channels
+       * overflow the bound constant buffer */
+      overflow_mask = lp_build_compare(gallivm, uint_bld->type, PIPE_FUNC_GEQUAL,
+                                       indirect_index, num_consts);
 
       /* index_vec = indirect_index * 4 + swizzle */
       index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
       index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
 
+      if (tgsi_type_is_64bit(stype)) {
+         LLVMValueRef swizzle_vec2;
+         swizzle_vec2 = lp_build_const_int_vec(gallivm, uint_bld->type, swizzle_in >> 16);
+         index_vec2 = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec2 = lp_build_add(uint_bld, index_vec2, swizzle_vec2);
+      }
       /* Gather values from the constant buffer */
-      res = build_gather(&bld_base->base, consts_ptr, index_vec);
+      res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask, index_vec2);
    }
    else {
       LLVMValueRef index;  /* index into the const buffer */
       LLVMValueRef scalar, scalar_ptr;
-
+      struct lp_build_context *bld_broad = &bld_base->base;
       index = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle);
 
       scalar_ptr = LLVMBuildGEP(builder, consts_ptr,
                                 &index, 1, "");
-      scalar = LLVMBuildLoad(builder, scalar_ptr, "");
-      res = lp_build_broadcast_scalar(&bld_base->base, scalar);
+
+      if (tgsi_type_is_64bit(stype) && ((swizzle_in >> 16) != swizzle + 1)) {
+
+         LLVMValueRef scalar2, scalar2_ptr;
+         LLVMValueRef shuffles[2];
+         index = lp_build_const_int32(gallivm, reg->Register.Index * 4 + (swizzle_in >> 16));
+
+         scalar2_ptr = LLVMBuildGEP(builder, consts_ptr,
+                                    &index, 1, "");
+
+         scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+         scalar2 = LLVMBuildLoad(builder, scalar2_ptr, "");
+         shuffles[0] = lp_build_const_int32(gallivm, 0);
+         shuffles[1] = lp_build_const_int32(gallivm, 1);
+
+         res = LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2));
+         res = LLVMBuildInsertElement(builder, res, scalar, shuffles[0], "");
+         res = LLVMBuildInsertElement(builder, res, scalar2, shuffles[1], "");
+      } else {
+        if (stype == TGSI_TYPE_DOUBLE) {
+           LLVMTypeRef dptr_type = LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0);
+           scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, dptr_type, "");
+           bld_broad = &bld_base->dbl_bld;
+        } else if (stype == TGSI_TYPE_UNSIGNED64) {
+           LLVMTypeRef u64ptr_type = LLVMPointerType(LLVMInt64TypeInContext(gallivm->context), 0);
+           scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, u64ptr_type, "");
+           bld_broad = &bld_base->uint64_bld;
+        } else if (stype == TGSI_TYPE_SIGNED64) {
+           LLVMTypeRef i64ptr_type = LLVMPointerType(LLVMInt64TypeInContext(gallivm->context), 0);
+           scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, i64ptr_type, "");
+           bld_broad = &bld_base->int64_bld;
+        }
+        scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+        res = lp_build_broadcast_scalar(bld_broad, scalar);
+      }
+
    }
 
-   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE || stype == TGSI_TYPE_SIGNED64 || stype == TGSI_TYPE_UNSIGNED64) {
       struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
       res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
    }
+
    return res;
 }
 
+/**
+ * Fetch 64-bit values from two separate channels.
+ * 64-bit values are stored split across two channels, like xy and zw.
+ * This function builds a vector of vec_length*2 floats, places the values
+ * from the two channels in their correct interleaved positions, and then
+ * bitcasts the result to vec_length 64-bit values.
+ */
+static LLVMValueRef
+emit_fetch_64bit(
+   struct lp_build_tgsi_context * bld_base,
+   enum tgsi_opcode_type stype,
+   LLVMValueRef input,
+   LLVMValueRef input2)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef res;
+   struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
+   int i;
+   LLVMValueRef shuffles[2 * (LP_MAX_VECTOR_WIDTH/32)];
+   int len = bld_base->base.type.length * 2;
+   assert(len <= (2 * (LP_MAX_VECTOR_WIDTH/32)));
+
+   for (i = 0; i < bld_base->base.type.length * 2; i+=2) {
+      shuffles[i] = lp_build_const_int32(gallivm, i / 2);
+      shuffles[i + 1] = lp_build_const_int32(gallivm, i / 2 + bld_base->base.type.length);
+   }
+   res = LLVMBuildShuffleVector(builder, input, input2, LLVMConstVector(shuffles, len), "");
+
+   return LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
+}
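
Concretely, for a 4-wide float build the shuffle mask built above is {0, 4, 1, 5, 2, 6, 3, 7}: lane i of 'input' is interleaved with lane i of 'input2', and the final bitcast reinterprets each pair as one 64-bit lane (on a little-endian target, 'input' supplies the low 32 bits and 'input2' the high 32 bits). A sketch of the data movement, assuming type.length == 4:

   /* input  = { x0, x1, x2, x3 }   -- low halves
    * input2 = { y0, y1, y2, y3 }   -- high halves
    * shuffle -> { x0, y0, x1, y1, x2, y2, x3, y3 }
    * bitcast -> 4 x 64-bit lanes, lane i == (y_i << 32) | x_i
    */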
+
 static LLVMValueRef
 emit_fetch_immediate(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef res = NULL;
+   unsigned swizzle = swizzle_in & 0xffff;
 
-   if (reg->Register.Indirect) {
-      LLVMValueRef indirect_index;
-      LLVMValueRef index_vec;  /* index into the immediate register array */
+   if (bld->use_immediates_array || reg->Register.Indirect) {
       LLVMValueRef imms_array;
       LLVMTypeRef fptr_type;
 
-      indirect_index = get_indirect_index(bld,
-                                          reg->Register.File,
-                                          reg->Register.Index,
-                                          &reg->Indirect);
-      /*
-       * Unlike for other reg classes, adding pixel offsets is unnecessary -
-       * immediates are stored as full vectors (FIXME??? - might be better
-       * to store them the same as constants) but all elements are the same
-       * in any case.
-       */
-      index_vec = get_soa_array_offsets(&bld_base->uint_bld,
-                                        indirect_index,
-                                        swizzle,
-                                        FALSE);
-
       /* cast imms_array pointer to float* */
       fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
       imms_array = LLVMBuildBitCast(builder, bld->imms_array, fptr_type, "");
 
-      /* Gather values from the immediate register array */
-      res = build_gather(&bld_base->base, imms_array, index_vec);
+      if (reg->Register.Indirect) {
+         LLVMValueRef indirect_index;
+         LLVMValueRef index_vec;  /* index into the immediate register array */
+         LLVMValueRef index_vec2 = NULL;
+         indirect_index = get_indirect_index(bld,
+                                             reg->Register.File,
+                                             reg->Register.Index,
+                                             &reg->Indirect,
+                                             bld->bld_base.info->file_max[reg->Register.File]);
+         /*
+          * Unlike for other reg classes, adding pixel offsets is unnecessary -
+          * immediates are stored as full vectors (FIXME??? - might be better
+          * to store them the same as constants) but all elements are the same
+          * in any case.
+          */
+         index_vec = get_soa_array_offsets(&bld_base->uint_bld,
+                                           indirect_index,
+                                           swizzle,
+                                           FALSE);
+         if (tgsi_type_is_64bit(stype))
+            index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                              indirect_index,
+                                              swizzle_in >> 16,
+                                              FALSE);
+         /* Gather values from the immediate register array */
+         res = build_gather(bld_base, imms_array, index_vec, NULL, index_vec2);
+      } else {
+         LLVMValueRef gep[2];
+         gep[0] = lp_build_const_int32(gallivm, 0);
+         gep[1] = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle);
+         LLVMValueRef imms_ptr = LLVMBuildGEP(builder,
+                                              bld->imms_array, gep, 2, "");
+         res = LLVMBuildLoad(builder, imms_ptr, "");
+
+         if (tgsi_type_is_64bit(stype)) {
+            LLVMValueRef imms_ptr2;
+            LLVMValueRef res2;
+            gep[1] = lp_build_const_int32(gallivm,
+                                          reg->Register.Index * 4 + (swizzle_in >> 16));
+            imms_ptr2 = LLVMBuildGEP(builder,
+                                     bld->imms_array, gep, 2, "");
+            res2 = LLVMBuildLoad(builder, imms_ptr2, "");
+            res = emit_fetch_64bit(bld_base, stype, res, res2);
+         }
+      }
    }
    else {
       res = bld->immediates[reg->Register.Index][swizzle];
+      if (tgsi_type_is_64bit(stype))
+         res = emit_fetch_64bit(bld_base, stype, res, bld->immediates[reg->Register.Index][swizzle_in >> 16]);
    }
 
-   if (stype == TGSI_TYPE_UNSIGNED) {
-      res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
-   } else if (stype == TGSI_TYPE_SIGNED) {
-      res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || tgsi_type_is_64bit(stype)) {
+      struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
+      res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
    }
    return res;
 }
@@ -1052,54 +1035,76 @@ emit_fetch_input(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef res;
+   unsigned swizzle = swizzle_in & 0xffff;
 
    if (reg->Register.Indirect) {
       LLVMValueRef indirect_index;
       LLVMValueRef index_vec;  /* index into the input reg array */
+      LLVMValueRef index_vec2 = NULL;
       LLVMValueRef inputs_array;
       LLVMTypeRef fptr_type;
 
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
                                           reg->Register.Index,
-                                          &reg->Indirect);
+                                          &reg->Indirect,
+                                          bld->bld_base.info->file_max[reg->Register.File]);
 
       index_vec = get_soa_array_offsets(&bld_base->uint_bld,
                                         indirect_index,
                                         swizzle,
                                         TRUE);
-
+      if (tgsi_type_is_64bit(stype)) {
+         index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                           indirect_index,
+                                           swizzle_in >> 16,
+                                           TRUE);
+      }
       /* cast inputs_array pointer to float* */
       fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
       inputs_array = LLVMBuildBitCast(builder, bld->inputs_array, fptr_type, "");
 
       /* Gather values from the input register array */
-      res = build_gather(&bld_base->base, inputs_array, index_vec);
+      res = build_gather(bld_base, inputs_array, index_vec, NULL, index_vec2);
    } else {
       if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
          LLVMValueRef lindex = lp_build_const_int32(gallivm,
                                         reg->Register.Index * 4 + swizzle);
-         LLVMValueRef input_ptr =  LLVMBuildGEP(builder,
-                                                bld->inputs_array, &lindex, 1, "");
+         LLVMValueRef input_ptr = LLVMBuildGEP(builder,
+                                               bld->inputs_array, &lindex, 1, "");
+
          res = LLVMBuildLoad(builder, input_ptr, "");
+         if (tgsi_type_is_64bit(stype)) {
+            LLVMValueRef lindex1;
+            LLVMValueRef input_ptr2;
+            LLVMValueRef res2;
+
+            lindex1 = lp_build_const_int32(gallivm,
+                                           reg->Register.Index * 4 + (swizzle_in >> 16));
+            input_ptr2 = LLVMBuildGEP(builder,
+                                      bld->inputs_array, &lindex1, 1, "");
+            res2 = LLVMBuildLoad(builder, input_ptr2, "");
+            res = emit_fetch_64bit(bld_base, stype, res, res2);
+         }
       }
       else {
          res = bld->inputs[reg->Register.Index][swizzle];
+         if (tgsi_type_is_64bit(stype))
+            res = emit_fetch_64bit(bld_base, stype, res, bld->inputs[reg->Register.Index][swizzle_in >> 16]);
       }
    }
 
    assert(res);
 
-   if (stype == TGSI_TYPE_UNSIGNED) {
-      res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
-   } else if (stype == TGSI_TYPE_SIGNED) {
-      res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || tgsi_type_is_64bit(stype)) {
+      struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
+      res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
    }
 
    return res;
@@ -1111,35 +1116,62 @@ emit_fetch_gs_input(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+   const struct tgsi_shader_info *info = bld->bld_base.info;
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef attrib_index = NULL;
    LLVMValueRef vertex_index = NULL;
+   unsigned swizzle = swizzle_in & 0xffff;
    LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle);
    LLVMValueRef res;
 
+   if (info->input_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_PRIMID) {
+      /* This is really a system value not a regular input */
+      assert(!reg->Register.Indirect);
+      assert(!reg->Dimension.Indirect);
+      res = bld->system_values.prim_id;
+      if (stype != TGSI_TYPE_UNSIGNED && stype != TGSI_TYPE_SIGNED) {
+         res = LLVMBuildBitCast(builder, res, bld_base->base.vec_type, "");
+      }
+      return res;
+   }
+
    if (reg->Register.Indirect) {
+      /*
+       * XXX: this is possibly not quite the right value, since file_max may be
+       * larger than the max attrib index, due to it being the max of declared
+       * inputs AND the max vertices per prim (which is 6 for tri adj).
+       * It should however be safe to use (since we always allocate
+       * PIPE_MAX_SHADER_INPUTS (80) for it, which is overallocated quite a bit).
+       */
+      int index_limit = info->file_max[reg->Register.File];
       attrib_index = get_indirect_index(bld,
                                         reg->Register.File,
                                         reg->Register.Index,
-                                        &reg->Indirect);
+                                        &reg->Indirect,
+                                        index_limit);
    } else {
       attrib_index = lp_build_const_int32(gallivm, reg->Register.Index);
    }
-   
+
    if (reg->Dimension.Indirect) {
+      /*
+       * A fixed 6 should do as well (which is what we allocate).
+       */
+      int index_limit = u_vertices_per_prim(info->properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
       vertex_index = get_indirect_index(bld,
                                         reg->Register.File,
                                         reg->Dimension.Index,
-                                        &reg->DimIndirect);
+                                        &reg->DimIndirect,
+                                        index_limit);
    } else {
       vertex_index = lp_build_const_int32(gallivm, reg->Dimension.Index);
    }
 
-   res = bld->gs_iface->fetch_input(bld->gs_iface, bld_base,
+   res = bld->gs_iface->fetch_input(bld->gs_iface, &bld_base->base,
                                     reg->Dimension.Indirect,
                                     vertex_index,
                                     reg->Register.Indirect,
@@ -1147,8 +1179,18 @@ emit_fetch_gs_input(
                                     swizzle_index);
 
    assert(res);
-
-   if (stype == TGSI_TYPE_UNSIGNED) {
+   if (tgsi_type_is_64bit(stype)) {
+      LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle_in >> 16);
+      LLVMValueRef res2;
+      res2 = bld->gs_iface->fetch_input(bld->gs_iface, &bld_base->base,
+                                        reg->Dimension.Indirect,
+                                        vertex_index,
+                                        reg->Register.Indirect,
+                                        attrib_index,
+                                        swizzle_index);
+      assert(res2);
+      res = emit_fetch_64bit(bld_base, stype, res, res2);
+   } else if (stype == TGSI_TYPE_UNSIGNED) {
       res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
    } else if (stype == TGSI_TYPE_SIGNED) {
       res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
@@ -1162,43 +1204,63 @@ emit_fetch_temporary(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef res;
+   unsigned swizzle = swizzle_in & 0xffff;
 
    if (reg->Register.Indirect) {
       LLVMValueRef indirect_index;
-      LLVMValueRef index_vec;  /* index into the temp reg array */
+      LLVMValueRef index_vec, index_vec2 = NULL;  /* index into the temp reg array */
       LLVMValueRef temps_array;
       LLVMTypeRef fptr_type;
 
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
                                           reg->Register.Index,
-                                          &reg->Indirect);
+                                          &reg->Indirect,
+                                          bld->bld_base.info->file_max[reg->Register.File]);
 
       index_vec = get_soa_array_offsets(&bld_base->uint_bld,
                                         indirect_index,
                                         swizzle,
                                         TRUE);
+      if (tgsi_type_is_64bit(stype)) {
+         index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                            indirect_index,
+                                            swizzle_in >> 16,
+                                            TRUE);
+      }
 
       /* cast temps_array pointer to float* */
       fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
       temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, "");
 
       /* Gather values from the temporary register array */
-      res = build_gather(&bld_base->base, temps_array, index_vec);
+      res = build_gather(bld_base, temps_array, index_vec, NULL, index_vec2);
    }
    else {
       LLVMValueRef temp_ptr;
       temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
       res = LLVMBuildLoad(builder, temp_ptr, "");
+
+      if (tgsi_type_is_64bit(stype)) {
+         LLVMValueRef temp_ptr2, res2;
+
+         temp_ptr2 = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle_in >> 16);
+         res2 = LLVMBuildLoad(builder, temp_ptr2, "");
+         res = emit_fetch_64bit(bld_base, stype, res, res2);
+      }
    }
 
-   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+   if (stype == TGSI_TYPE_SIGNED ||
+       stype == TGSI_TYPE_UNSIGNED ||
+       stype == TGSI_TYPE_DOUBLE ||
+       stype == TGSI_TYPE_SIGNED64 ||
+       stype == TGSI_TYPE_UNSIGNED64) {
       struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
       res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
    }
@@ -1211,7 +1273,7 @@ emit_fetch_system_value(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_src_register * reg,
    enum tgsi_opcode_type stype,
-   unsigned swizzle)
+   unsigned swizzle_in)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
@@ -1219,6 +1281,7 @@ emit_fetch_system_value(
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef res;
    enum tgsi_opcode_type atype; // Actual type of the value
+   unsigned swizzle = swizzle_in & 0xffff;
 
    assert(!reg->Register.Indirect);
 
@@ -1233,11 +1296,55 @@ emit_fetch_system_value(
       atype = TGSI_TYPE_UNSIGNED;
       break;
 
+   case TGSI_SEMANTIC_VERTEXID_NOBASE:
+      res = bld->system_values.vertex_id_nobase;
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_BASEVERTEX:
+      res = bld->system_values.basevertex;
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_BASEINSTANCE:
+      res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.base_instance);
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
    case TGSI_SEMANTIC_PRIMID:
       res = bld->system_values.prim_id;
       atype = TGSI_TYPE_UNSIGNED;
       break;
 
+   case TGSI_SEMANTIC_INVOCATIONID:
+      res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.invocation_id);
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_HELPER_INVOCATION:
+      res = LLVMBuildNot(gallivm->builder, lp_build_mask_value(bld->mask), "");
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_THREAD_ID:
+      res = LLVMBuildExtractValue(gallivm->builder, bld->system_values.thread_id, swizzle, "");
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_BLOCK_ID:
+      res = lp_build_extract_broadcast(gallivm, lp_type_int_vec(32, 96),
+                                       bld_base->uint_bld.type,
+                                       bld->system_values.block_id,
+                                       lp_build_const_int32(gallivm, swizzle));
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_GRID_SIZE:
+      res = lp_build_extract_broadcast(gallivm, lp_type_int_vec(32, 96),
+                                       bld_base->uint_bld.type,
+                                       bld->system_values.grid_size,
+                                       lp_build_const_int32(gallivm, swizzle));
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_FACE:
+      res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.front_facing);
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
    default:
       assert(!"unexpected semantic in emit_fetch_system_value");
       res = bld_base->base.zero;
@@ -1269,87 +1376,61 @@ emit_fetch_deriv(
    LLVMValueRef *ddx,
    LLVMValueRef *ddy)
 {
-   if(res)
+   if (res)
       *res = src;
 
    /* TODO: use interpolation coeffs for inputs */
 
-   if(ddx)
+   if (ddx)
       *ddx = lp_build_ddx(&bld->bld_base.base, src);
 
-   if(ddy)
+   if (ddy)
       *ddy = lp_build_ddy(&bld->bld_base.base, src);
 }
 
-
 /**
- * Predicate.
+ * Store an array of vec-length 64-bit values into two arrays of vec_length floats,
+ * i.e.
+ * value is d0, d1, d2, d3 etc.
+ * each 64-bit value has low and high 32-bit halves x and y,
+ * which get stored into the separate channels as:
+ * chan_ptr  = d0.x, d1.x, d2.x, d3.x
+ * chan_ptr2 = d0.y, d1.y, d2.y, d3.y
  */
 static void
-emit_fetch_predicate(
-   struct lp_build_tgsi_soa_context *bld,
-   const struct tgsi_full_instruction *inst,
-   LLVMValueRef *pred)
+emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base,
+                      LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2,
+                      LLVMValueRef value)
 {
-   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   unsigned index;
-   unsigned char swizzles[4];
-   LLVMValueRef unswizzled[4] = {NULL, NULL, NULL, NULL};
-   LLVMValueRef value;
-   unsigned chan;
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *float_bld = &bld_base->base;
+   unsigned i;
+   LLVMValueRef temp, temp2;
+   LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH/32];
+   LLVMValueRef shuffles2[LP_MAX_VECTOR_WIDTH/32];
 
-   if (!inst->Instruction.Predicate) {
-      TGSI_FOR_EACH_CHANNEL( chan ) {
-         pred[chan] = NULL;
-      }
-      return;
+   for (i = 0; i < bld_base->base.type.length; i++) {
+      shuffles[i] = lp_build_const_int32(gallivm, i * 2);
+      shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1);
    }
 
-   swizzles[0] = inst->Predicate.SwizzleX;
-   swizzles[1] = inst->Predicate.SwizzleY;
-   swizzles[2] = inst->Predicate.SwizzleZ;
-   swizzles[3] = inst->Predicate.SwizzleW;
-
-   index = inst->Predicate.Index;
-   assert(index < LP_MAX_TGSI_PREDS);
-
-   TGSI_FOR_EACH_CHANNEL( chan ) {
-      unsigned swizzle = swizzles[chan];
-
-      /*
-       * Only fetch the predicate register channels that are actually listed
-       * in the swizzles
-       */
-      if (!unswizzled[swizzle]) {
-         value = LLVMBuildLoad(builder,
-                               bld->preds[index][swizzle], "");
-
-         /*
-          * Convert the value to an integer mask.
-          *
-          * TODO: Short-circuit this comparison -- a D3D setp_xx instructions
-          * is needlessly causing two comparisons due to storing the intermediate
-          * result as float vector instead of an integer mask vector.
-          */
-         value = lp_build_compare(bld->bld_base.base.gallivm,
-                                  bld->bld_base.base.type,
-                                  PIPE_FUNC_NOTEQUAL,
-                                  value,
-                                  bld->bld_base.base.zero);
-         if (inst->Predicate.Negate) {
-            value = LLVMBuildNot(builder, value, "");
-         }
-
-         unswizzled[swizzle] = value;
-      } else {
-         value = unswizzled[swizzle];
-      }
+   temp = LLVMBuildShuffleVector(builder, value,
+                                 LLVMGetUndef(LLVMTypeOf(value)),
+                                 LLVMConstVector(shuffles,
+                                                 bld_base->base.type.length),
+                                 "");
+   temp2 = LLVMBuildShuffleVector(builder, value,
+                                  LLVMGetUndef(LLVMTypeOf(value)),
+                                  LLVMConstVector(shuffles2,
+                                                  bld_base->base.type.length),
+                                  "");
 
-      pred[chan] = value;
-   }
+   lp_exec_mask_store(&bld->exec_mask, float_bld, temp, chan_ptr);
+   lp_exec_mask_store(&bld->exec_mask, float_bld, temp2, chan_ptr2);
 }
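/*
 * Worked example for a 4-wide SoA vector: after the bitcast in the caller,
 * "value" holds the 64-bit values as interleaved 32-bit lanes
 * (d0.x, d0.y, d1.x, d1.y, d2.x, d2.y, d3.x, d3.y), so the two shuffles
 * above pick lanes {0,2,4,6} and {1,3,5,7}:
 *
 *    temp  = d0.x d1.x d2.x d3.x   -> stored to chan_ptr
 *    temp2 = d0.y d1.y d2.y d3.y   -> stored to chan_ptr2
 */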
 
-
 /**
  * Register store.
  */
@@ -1359,7 +1440,6 @@ emit_store_chan(
    const struct tgsi_full_instruction *inst,
    unsigned index,
    unsigned chan_index,
-   LLVMValueRef pred,
    LLVMValueRef value)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
@@ -1369,49 +1449,40 @@ emit_store_chan(
    struct lp_build_context *float_bld = &bld_base->base;
    struct lp_build_context *int_bld = &bld_base->int_bld;
    LLVMValueRef indirect_index = NULL;
-   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
+   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode, index);
 
    /*
     * Apply saturation.
     *
     * It is always assumed to be float.
     */
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
+   if (inst->Instruction.Saturate) {
       assert(dtype == TGSI_TYPE_FLOAT ||
              dtype == TGSI_TYPE_UNTYPED);
       value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
       value = lp_build_clamp_zero_one_nanzero(float_bld, value);
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      assert(dtype == TGSI_TYPE_FLOAT ||
-             dtype == TGSI_TYPE_UNTYPED);
-      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
-      /* This will give -1.0 for NaN which is probably not what we want. */
-      value = lp_build_max_ext(float_bld, value,
-                               lp_build_const_vec(gallivm, float_bld->type, -1.0),
-                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
-      value = lp_build_min(float_bld, value, float_bld->one);
-      break;
-
-   default:
-      assert(0);
    }
 
    if (reg->Register.Indirect) {
+      /*
+       * Currently the mesa/st doesn't generate indirect stores
+       * to 64-bit values, it normally uses MOV to do indirect stores.
+       */
+      assert(!tgsi_type_is_64bit(dtype));
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
                                           reg->Register.Index,
-                                          &reg->Indirect);
+                                          &reg->Indirect,
+                                          bld->bld_base.info->file_max[reg->Register.File]);
    } else {
       assert(reg->Register.Index <=
                              bld_base->info->file_max[reg->Register.File]);
    }
 
+   if (DEBUG_EXECUTION) {
+      emit_dump_reg(gallivm, reg->Register.File, reg->Register.Index, chan_index, value);
+   }
+
    switch( reg->Register.File ) {
    case TGSI_FILE_OUTPUT:
       /* Outputs are always stored as floats */
@@ -1432,18 +1503,28 @@ emit_store_chan(
 
          /* Scatter store values into output registers */
          emit_mask_scatter(bld, outputs_array, index_vec, value,
-                           &bld->exec_mask, pred);
+                           &bld->exec_mask);
       }
       else {
          LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index,
                                                   chan_index);
-         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
+
+         if (tgsi_type_is_64bit(dtype)) {
+            LLVMValueRef out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index,
+                                                      chan_index + 1);
+            emit_store_64bit_chan(bld_base, out_ptr, out_ptr2,
+                                  value);
+         } else
+            lp_exec_mask_store(&bld->exec_mask, float_bld, value, out_ptr);
       }
       break;
 
    case TGSI_FILE_TEMPORARY:
       /* Temporaries are always stored as floats */
-      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+      if (!tgsi_type_is_64bit(dtype))
+         value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+      else
+         value = LLVMBuildBitCast(builder, value,
+                                  LLVMVectorType(LLVMFloatTypeInContext(gallivm->context),
+                                                 bld_base->base.type.length * 2), "");
 
       if (reg->Register.Indirect) {
          LLVMValueRef index_vec;  /* indexes into the temp registers */
@@ -1460,12 +1541,21 @@ emit_store_chan(
 
          /* Scatter store values into temp registers */
          emit_mask_scatter(bld, temps_array, index_vec, value,
-                           &bld->exec_mask, pred);
+                           &bld->exec_mask);
       }
       else {
          LLVMValueRef temp_ptr;
          temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index);
-         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
+
+         if (tgsi_type_is_64bit(dtype)) {
+            LLVMValueRef temp_ptr2 = lp_get_temp_ptr_soa(bld,
+                                                         reg->Register.Index,
+                                                         chan_index + 1);
+            emit_store_64bit_chan(bld_base, temp_ptr, temp_ptr2,
+                                  value);
+         }
+         else
+            lp_exec_mask_store(&bld->exec_mask, float_bld, value, temp_ptr);
       }
       break;
 
@@ -1473,17 +1563,10 @@ emit_store_chan(
       assert(dtype == TGSI_TYPE_SIGNED);
       assert(LLVMTypeOf(value) == int_bld->vec_type);
       value = LLVMBuildBitCast(builder, value, int_bld->vec_type, "");
-      lp_exec_mask_store(&bld->exec_mask, int_bld, pred, value,
+      lp_exec_mask_store(&bld->exec_mask, int_bld, value,
                          bld->addr[reg->Register.Index][chan_index]);
       break;
 
-   case TGSI_FILE_PREDICATE:
-      assert(LLVMTypeOf(value) == float_bld->vec_type);
-      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
-      lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value,
-                         bld->preds[reg->Register.Index][chan_index]);
-      break;
-
    default:
       assert( 0 );
    }
@@ -1491,28 +1574,59 @@ emit_store_chan(
    (void)dtype;
 }
 
+/*
+ * Called at the beginning of the translation of each TGSI instruction, to
+ * emit some debug code.
+ */
 static void
-emit_store(
+emit_debug(
    struct lp_build_tgsi_context * bld_base,
    const struct tgsi_full_instruction * inst,
-   const struct tgsi_opcode_info * info,
-   LLVMValueRef dst[4])
+   const struct tgsi_opcode_info * info)
 
 {
-   unsigned chan_index;
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
-   if(info->num_dst) {
-      LLVMValueRef pred[TGSI_NUM_CHANNELS];
+   if (DEBUG_EXECUTION) {
+      /*
+       * Dump the TGSI instruction.
+       */
 
-      emit_fetch_predicate( bld, inst, pred );
+      struct gallivm_state *gallivm = bld_base->base.gallivm;
+      char buf[512];
+      buf[0] = '$';
+      buf[1] = ' ';
+      tgsi_dump_instruction_str(inst, bld_base->pc, &buf[2], sizeof buf - 2);
+      lp_build_printf(gallivm, buf);
 
-      TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store_chan(bld_base, inst, 0, chan_index, pred[chan_index], dst[chan_index]);
+      /* Dump the execution mask.
+       */
+      if (bld->exec_mask.has_mask) {
+         lp_build_print_value(gallivm, "    mask = ", bld->exec_mask.exec_mask);
       }
    }
 }
 
+static void
+emit_store(
+   struct lp_build_tgsi_context * bld_base,
+   const struct tgsi_full_instruction * inst,
+   const struct tgsi_opcode_info * info,
+   unsigned index,
+   LLVMValueRef dst[4])
+
+{
+   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode, index);
+
+   unsigned writemask = inst->Dst[index].Register.WriteMask;
+   while (writemask) {
+      unsigned chan_index = u_bit_scan(&writemask);
+      if (tgsi_type_is_64bit(dtype) && (chan_index == 1 || chan_index == 3))
+          continue;
+      emit_store_chan(bld_base, inst, index, chan_index, dst[chan_index]);
+   }
+}
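/*
 * Note on the 64-bit case above: each 64-bit destination value occupies a
 * pair of 32-bit channels (xy or zw), so only channels 0 and 2 drive a store
 * and emit_store_chan() writes both halves (chan_index and chan_index + 1);
 * channels 1 and 3 are skipped to avoid storing the same value twice.
 */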
+
 static unsigned
 tgsi_to_pipe_tex_target(unsigned tgsi_target)
 {
@@ -1580,12 +1694,13 @@ lp_build_lod_property(
     * constant coords maybe).
     * There's at least hope for sample opcodes as well as size queries.
     */
-   if (reg->Register.File == TGSI_FILE_CONSTANT ||
+   if (inst->Instruction.Opcode == TGSI_OPCODE_TEX_LZ ||
+       reg->Register.File == TGSI_FILE_CONSTANT ||
        reg->Register.File == TGSI_FILE_IMMEDIATE) {
       lod_property = LP_SAMPLER_LOD_SCALAR;
    }
-   else if (bld_base->info->processor == TGSI_PROCESSOR_FRAGMENT) {
-      if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) {
+   else if (bld_base->info->processor == PIPE_SHADER_FRAGMENT) {
+      if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD) {
          lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
       }
       else {
@@ -1608,19 +1723,24 @@ static void
 emit_tex( struct lp_build_tgsi_soa_context *bld,
           const struct tgsi_full_instruction *inst,
           enum lp_build_tex_modifier modifier,
-          LLVMValueRef *texel)
+          LLVMValueRef *texel,
+          unsigned sampler_reg,
+          enum lp_sampler_op_type sampler_op)
 {
-   unsigned unit;
-   LLVMValueRef lod_bias, explicit_lod;
+   unsigned unit = inst->Src[sampler_reg].Register.Index;
    LLVMValueRef oow = NULL;
+   LLVMValueRef lod = NULL;
    LLVMValueRef coords[5];
    LLVMValueRef offsets[3] = { NULL };
    struct lp_derivatives derivs;
-   struct lp_derivatives *deriv_ptr = NULL;
+   struct lp_sampler_params params;
    enum lp_sampler_lod_property lod_property = LP_SAMPLER_LOD_SCALAR;
    unsigned num_derivs, num_offsets, i;
    unsigned shadow_coord = 0;
    unsigned layer_coord = 0;
+   unsigned sample_key = sampler_op << LP_SAMPLER_OP_TYPE_SHIFT;
+
+   memset(&params, 0, sizeof(params));
 
    if (!bld->sampler) {
       _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
@@ -1680,7 +1800,16 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       num_derivs = 3;
       break;
    case TGSI_TEXTURE_CUBE_ARRAY:
+      num_offsets = 2;
+      num_derivs = 3;
+      layer_coord = 3;
+      break;
    case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+      num_offsets = 2;
+      num_derivs = 3;
+      layer_coord = 3;
+      shadow_coord = 4; /* shadow coord is in a separate reg */
+      break;
    case TGSI_TEXTURE_2D_MSAA:
    case TGSI_TEXTURE_2D_ARRAY_MSAA:
    default:
@@ -1691,22 +1820,29 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
    /* Note lod and especially projected are illegal in a LOT of cases */
    if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS ||
        modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
-      LLVMValueRef lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
+      if (inst->Instruction.Opcode == TGSI_OPCODE_TEX_LZ) {
+         lod = bld->bld_base.base.zero;
+      } else if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
+                 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
+         /* note that shadow cube array with bias/explicit lod does not exist */
+         lod = lp_build_emit_fetch(&bld->bld_base, inst, 1, 0);
+      }
+      else {
+         lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
+      }
       if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
-         lod_bias = lod;
-         explicit_lod = NULL;
+         sample_key |= LP_SAMPLER_LOD_BIAS << LP_SAMPLER_LOD_CONTROL_SHIFT;
       }
       else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
-         lod_bias = NULL;
-         explicit_lod = lod;
+         sample_key |= LP_SAMPLER_LOD_EXPLICIT << LP_SAMPLER_LOD_CONTROL_SHIFT;
       }
       lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
    }
-   else {
-      lod_bias = NULL;
-      explicit_lod = NULL;
-   }
 
+   if (sampler_op == LP_SAMPLER_OP_GATHER) {
+      uint32_t comp_val = inst->Src[sampler_reg].Register.SwizzleX;
+      sample_key |= (comp_val << LP_SAMPLER_GATHER_COMP_SHIFT);
+   }
    if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) {
       oow = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
       oow = lp_build_rcp(&bld->bld_base.base, oow);
@@ -1723,31 +1859,42 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
 
    /* Layer coord always goes into 3rd slot, except for cube map arrays */
    if (layer_coord) {
-      coords[2] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+      if (layer_coord == 3) {
+         coords[3] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+      }
+      else {
+         coords[2] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+      }
       if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
          coords[2] = lp_build_mul(&bld->bld_base.base, coords[2], oow);
    }
    /* Shadow coord occupies always 5th slot. */
    if (shadow_coord) {
-      coords[4] = lp_build_emit_fetch(&bld->bld_base, inst, 0, shadow_coord);
+      sample_key |= LP_SAMPLER_SHADOW;
+      if (shadow_coord == 4) {
+         coords[4] = lp_build_emit_fetch(&bld->bld_base, inst, 1, 0);
+      }
+      else {
+         coords[4] = lp_build_emit_fetch(&bld->bld_base, inst, 0, shadow_coord);
+      }
       if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
          coords[4] = lp_build_mul(&bld->bld_base.base, coords[4], oow);
    }
 
    if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
       unsigned dim;
+      sample_key |= LP_SAMPLER_LOD_DERIVATIVES << LP_SAMPLER_LOD_CONTROL_SHIFT;
       for (dim = 0; dim < num_derivs; ++dim) {
          derivs.ddx[dim] = lp_build_emit_fetch(&bld->bld_base, inst, 1, dim);
          derivs.ddy[dim] = lp_build_emit_fetch(&bld->bld_base, inst, 2, dim);
       }
-      deriv_ptr = &derivs;
-      unit = inst->Src[3].Register.Index;
+      params.derivs = &derivs;
       /*
        * could also check all src regs if constant but I doubt such
        * cases exist in practice.
        */
-      if (bld->bld_base.info->processor == TGSI_PROCESSOR_FRAGMENT) {
-         if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) {
+      if (bld->bld_base.info->processor == PIPE_SHADER_FRAGMENT) {
+         if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD) {
             lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
          }
          else {
@@ -1757,28 +1904,32 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       else {
          lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
       }
-   } else {
-      unit = inst->Src[1].Register.Index;
    }
+   sample_key |= lod_property << LP_SAMPLER_LOD_PROPERTY_SHIFT;
 
-   /* some advanced gather instructions (txgo) would require 4 offsets */
+   /* we don't handle the 4 offset version of tg4 */
    if (inst->Texture.NumOffsets == 1) {
       unsigned dim;
+      sample_key |= LP_SAMPLER_OFFSETS;
       for (dim = 0; dim < num_offsets; dim++) {
          offsets[dim] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, dim);
       }
    }
 
-   bld->sampler->emit_fetch_texel(bld->sampler,
-                                  bld->bld_base.base.gallivm,
-                                  bld->bld_base.base.type,
-                                  FALSE,
-                                  unit, unit,
-                                  coords,
-                                  offsets,
-                                  deriv_ptr,
-                                  lod_bias, explicit_lod, lod_property,
-                                  texel);
+   params.type = bld->bld_base.base.type;
+   params.sample_key = sample_key;
+   params.texture_index = unit;
+   params.sampler_index = unit;
+   params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
+   params.coords = coords;
+   params.offsets = offsets;
+   params.lod = lod;
+   params.texel = texel;
+
+   bld->sampler->emit_tex_sample(bld->sampler,
+                                 bld->bld_base.base.gallivm,
+                                 &params);
 }
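/*
 * Rough sketch of how the sample_key above is assembled from the flags used
 * in this function (the authoritative bit layout lives with the other
 * LP_SAMPLER_* definitions):
 *
 *    key  = sampler_op   << LP_SAMPLER_OP_TYPE_SHIFT;       texture/fetch/gather/lodq
 *    key |= lod_control  << LP_SAMPLER_LOD_CONTROL_SHIFT;   bias/explicit/derivatives
 *    key |= lod_property << LP_SAMPLER_LOD_PROPERTY_SHIFT;  scalar/per-quad/per-element
 *    key |= LP_SAMPLER_SHADOW                 if a compare value is supplied
 *    key |= LP_SAMPLER_OFFSETS                if texel offsets are supplied
 *    key |= comp << LP_SAMPLER_GATHER_COMP_SHIFT   for TG4 component select
 */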
 
 static void
@@ -1786,19 +1937,23 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
             const struct tgsi_full_instruction *inst,
             enum lp_build_tex_modifier modifier,
             boolean compare,
+            enum lp_sampler_op_type sample_type,
             LLVMValueRef *texel)
 {
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
    unsigned texture_unit, sampler_unit;
-   LLVMValueRef lod_bias, explicit_lod;
+   LLVMValueRef lod = NULL;
    LLVMValueRef coords[5];
    LLVMValueRef offsets[3] = { NULL };
    struct lp_derivatives derivs;
-   struct lp_derivatives *deriv_ptr = NULL;
+   struct lp_sampler_params params;
    enum lp_sampler_lod_property lod_property = LP_SAMPLER_LOD_SCALAR;
 
    unsigned num_offsets, num_derivs, i;
    unsigned layer_coord = 0;
+   unsigned sample_key = sample_type << LP_SAMPLER_OP_TYPE_SHIFT;
+
+   memset(&params, 0, sizeof(params));
 
    if (!bld->sampler) {
       _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
@@ -1860,25 +2015,19 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
 
    if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS ||
        modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
-      LLVMValueRef lod = lp_build_emit_fetch(&bld->bld_base, inst, 3, 0);
+      lod = lp_build_emit_fetch(&bld->bld_base, inst, 3, 0);
       if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
-         lod_bias = lod;
-         explicit_lod = NULL;
+         sample_key |= LP_SAMPLER_LOD_BIAS << LP_SAMPLER_LOD_CONTROL_SHIFT;
       }
       else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
-         lod_bias = NULL;
-         explicit_lod = lod;
+         sample_key |= LP_SAMPLER_LOD_EXPLICIT << LP_SAMPLER_LOD_CONTROL_SHIFT;
       }
       lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
    }
    else if (modifier == LP_BLD_TEX_MODIFIER_LOD_ZERO) {
-      lod_bias = NULL;
       /* XXX might be better to explicitly pass the level zero information */
-      explicit_lod = lp_build_const_vec(gallivm, bld->bld_base.base.type, 0.0F);
-   }
-   else {
-      lod_bias = NULL;
-      explicit_lod = NULL;
+      sample_key |= LP_SAMPLER_LOD_EXPLICIT << LP_SAMPLER_LOD_CONTROL_SHIFT;
+      lod = lp_build_const_vec(gallivm, bld->bld_base.base.type, 0.0F);
    }
 
    for (i = 0; i < num_derivs; i++) {
@@ -1897,22 +2046,24 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
    }
    /* Shadow coord occupies always 5th slot. */
    if (compare) {
+      sample_key |= LP_SAMPLER_SHADOW;
       coords[4] = lp_build_emit_fetch(&bld->bld_base, inst, 3, 0);
    }
 
    if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
       unsigned dim;
+      sample_key |= LP_SAMPLER_LOD_DERIVATIVES << LP_SAMPLER_LOD_CONTROL_SHIFT;
       for (dim = 0; dim < num_derivs; ++dim) {
          derivs.ddx[dim] = lp_build_emit_fetch(&bld->bld_base, inst, 3, dim);
          derivs.ddy[dim] = lp_build_emit_fetch(&bld->bld_base, inst, 4, dim);
       }
-      deriv_ptr = &derivs;
+      params.derivs = &derivs;
       /*
        * could also check all src regs if constant but I doubt such
        * cases exist in practice.
        */
-      if (bld->bld_base.info->processor == TGSI_PROCESSOR_FRAGMENT) {
-         if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) {
+      if (bld->bld_base.info->processor == PIPE_SHADER_FRAGMENT) {
+         if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD) {
             lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
          }
          else {
@@ -1927,26 +2078,32 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
    /* some advanced gather instructions (txgo) would require 4 offsets */
    if (inst->Texture.NumOffsets == 1) {
       unsigned dim;
+      sample_key |= LP_SAMPLER_OFFSETS;
       for (dim = 0; dim < num_offsets; dim++) {
          offsets[dim] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, dim);
       }
    }
+   sample_key |= lod_property << LP_SAMPLER_LOD_PROPERTY_SHIFT;
+
+   params.type = bld->bld_base.base.type;
+   params.sample_key = sample_key;
+   params.texture_index = texture_unit;
+   params.sampler_index = sampler_unit;
+   params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
+   params.coords = coords;
+   params.offsets = offsets;
+   params.lod = lod;
+   params.texel = texel;
+
+   bld->sampler->emit_tex_sample(bld->sampler,
+                                 bld->bld_base.base.gallivm,
+                                 &params);
 
-   bld->sampler->emit_fetch_texel(bld->sampler,
-                                  bld->bld_base.base.gallivm,
-                                  bld->bld_base.base.type,
-                                  FALSE,
-                                  texture_unit, sampler_unit,
-                                  coords,
-                                  offsets,
-                                  deriv_ptr,
-                                  lod_bias, explicit_lod, lod_property,
-                                  texel);
-
-   if (inst->Src[1].Register.SwizzleX != PIPE_SWIZZLE_RED ||
-       inst->Src[1].Register.SwizzleY != PIPE_SWIZZLE_GREEN ||
-       inst->Src[1].Register.SwizzleZ != PIPE_SWIZZLE_BLUE ||
-       inst->Src[1].Register.SwizzleW != PIPE_SWIZZLE_ALPHA) {
+   if (inst->Src[1].Register.SwizzleX != PIPE_SWIZZLE_X ||
+       inst->Src[1].Register.SwizzleY != PIPE_SWIZZLE_Y ||
+       inst->Src[1].Register.SwizzleZ != PIPE_SWIZZLE_Z ||
+       inst->Src[1].Register.SwizzleW != PIPE_SWIZZLE_W) {
       unsigned char swizzles[4];
       swizzles[0] = inst->Src[1].Register.SwizzleX;
       swizzles[1] = inst->Src[1].Register.SwizzleY;
@@ -1966,11 +2123,15 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
    unsigned unit, target;
    LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type);
    LLVMValueRef explicit_lod = NULL;
-   LLVMValueRef coords[3];
+   LLVMValueRef coords[5];
    LLVMValueRef offsets[3] = { NULL };
+   struct lp_sampler_params params;
    enum lp_sampler_lod_property lod_property = LP_SAMPLER_LOD_SCALAR;
    unsigned dims, i;
    unsigned layer_coord = 0;
+   unsigned sample_key = LP_SAMPLER_OP_FETCH << LP_SAMPLER_OP_TYPE_SHIFT;
+
+   memset(&params, 0, sizeof(params));
 
    if (!bld->sampler) {
       _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
@@ -2000,9 +2161,11 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
       break;
    case TGSI_TEXTURE_2D:
    case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_2D_MSAA:
       dims = 2;
       break;
    case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
       layer_coord = 2;
       dims = 2;
       break;
@@ -2014,16 +2177,25 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
       return;
    }
 
-   /* always have lod except for buffers ? */
-   if (target != TGSI_TEXTURE_BUFFER) {
+   /* always have an explicit lod except for buffers, msaa targets and TXF_LZ */
+   if (target != TGSI_TEXTURE_BUFFER &&
+       target != TGSI_TEXTURE_2D_MSAA &&
+       target != TGSI_TEXTURE_2D_ARRAY_MSAA &&
+       inst->Instruction.Opcode != TGSI_OPCODE_TXF_LZ) {
+      sample_key |= LP_SAMPLER_LOD_EXPLICIT << LP_SAMPLER_LOD_CONTROL_SHIFT;
       explicit_lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
       lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
    }
+   /*
+    * XXX: for real msaa support, the w component (or src2.x for sample_i_ms)
+    * would be the sample index.
+    */
 
    for (i = 0; i < dims; i++) {
       coords[i] = lp_build_emit_fetch(&bld->bld_base, inst, 0, i);
    }
-   for (i = dims; i < 3; i++) {
+   /* never use more than 3 coords here but emit_fetch_texel copies all 5 anyway */
+   for (i = dims; i < 5; i++) {
       coords[i] = coord_undef;
    }
    if (layer_coord)
@@ -2031,27 +2203,39 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
 
    if (inst->Texture.NumOffsets == 1) {
       unsigned dim;
+      sample_key |= LP_SAMPLER_OFFSETS;
       for (dim = 0; dim < dims; dim++) {
          offsets[dim] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, dim);
       }
    }
+   sample_key |= lod_property << LP_SAMPLER_LOD_PROPERTY_SHIFT;
 
-   bld->sampler->emit_fetch_texel(bld->sampler,
-                                  bld->bld_base.base.gallivm,
-                                  bld->bld_base.base.type,
-                                  TRUE,
-                                  unit, unit,
-                                  coords,
-                                  offsets,
-                                  NULL,
-                                  NULL, explicit_lod, lod_property,
-                                  texel);
+   params.type = bld->bld_base.base.type;
+   params.sample_key = sample_key;
+   params.texture_index = unit;
+   /*
+    * sampler not actually used, set to 0 so it won't exceed PIPE_MAX_SAMPLERS
+    * and trigger some assertions with d3d10 where the sampler view number
+    * can exceed this.
+    */
+   params.sampler_index = 0;
+   params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
+   params.coords = coords;
+   params.offsets = offsets;
+   params.derivs = NULL;
+   params.lod = explicit_lod;
+   params.texel = texel;
+
+   bld->sampler->emit_tex_sample(bld->sampler,
+                                 bld->bld_base.base.gallivm,
+                                 &params);
 
    if (is_samplei &&
-       (inst->Src[1].Register.SwizzleX != PIPE_SWIZZLE_RED ||
-        inst->Src[1].Register.SwizzleY != PIPE_SWIZZLE_GREEN ||
-        inst->Src[1].Register.SwizzleZ != PIPE_SWIZZLE_BLUE ||
-        inst->Src[1].Register.SwizzleW != PIPE_SWIZZLE_ALPHA)) {
+       (inst->Src[1].Register.SwizzleX != PIPE_SWIZZLE_X ||
+        inst->Src[1].Register.SwizzleY != PIPE_SWIZZLE_Y ||
+        inst->Src[1].Register.SwizzleZ != PIPE_SWIZZLE_Z ||
+        inst->Src[1].Register.SwizzleW != PIPE_SWIZZLE_W)) {
       unsigned char swizzles[4];
       swizzles[0] = inst->Src[1].Register.SwizzleX;
       swizzles[1] = inst->Src[1].Register.SwizzleY;
@@ -2074,6 +2258,7 @@ emit_size_query( struct lp_build_tgsi_soa_context *bld,
    unsigned i;
    unsigned unit = inst->Src[1].Register.Index;
    unsigned target, pipe_target;
+   struct lp_sampler_size_query_params params;
 
    if (is_sviewinfo) {
       target = bld->sv[unit].Resource;
@@ -2111,47 +2296,62 @@ emit_size_query( struct lp_build_tgsi_soa_context *bld,
 
    pipe_target = tgsi_to_pipe_tex_target(target);
 
+   params.int_type = bld->bld_base.int_bld.type;
+   params.texture_unit = unit;
+   params.target = pipe_target;
+   params.context_ptr = bld->context_ptr;
+   params.is_sviewinfo = TRUE;
+   params.lod_property = lod_property;
+   params.explicit_lod = explicit_lod;
+   params.sizes_out = sizes_out;
+
    bld->sampler->emit_size_query(bld->sampler,
                                  bld->bld_base.base.gallivm,
-                                 bld->bld_base.int_bld.type,
-                                 unit, pipe_target,
-                                 is_sviewinfo,
-                                 lod_property,
-                                 explicit_lod,
-                                 sizes_out);
+                                 &params);
 }
 
 static boolean
 near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
-                  int pc)
+                   int pc)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < 5; i++) {
-      unsigned opcode;
+      enum tgsi_opcode opcode;
 
       if (pc + i >= bld->bld_base.info->num_instructions)
-        return TRUE;
+         return TRUE;
 
       opcode = bld->bld_base.instructions[pc + i].Instruction.Opcode;
 
       if (opcode == TGSI_OPCODE_END)
-        return TRUE;
+         return TRUE;
 
       if (opcode == TGSI_OPCODE_TEX ||
-         opcode == TGSI_OPCODE_TXP ||
-         opcode == TGSI_OPCODE_TXD ||
-         opcode == TGSI_OPCODE_TXB ||
-         opcode == TGSI_OPCODE_TXL ||
-         opcode == TGSI_OPCODE_TXF ||
-         opcode == TGSI_OPCODE_TXQ ||
-         opcode == TGSI_OPCODE_CAL ||
-         opcode == TGSI_OPCODE_CALLNZ ||
-         opcode == TGSI_OPCODE_IF ||
-          opcode == TGSI_OPCODE_UIF ||
-         opcode == TGSI_OPCODE_BGNLOOP ||
-         opcode == TGSI_OPCODE_SWITCH)
-        return FALSE;
+         opcode == TGSI_OPCODE_TXP ||
+         opcode == TGSI_OPCODE_TXD ||
+         opcode == TGSI_OPCODE_TXB ||
+         opcode == TGSI_OPCODE_TXL ||
+         opcode == TGSI_OPCODE_TXF ||
+         opcode == TGSI_OPCODE_TXQ ||
+         opcode == TGSI_OPCODE_TEX2 ||
+         opcode == TGSI_OPCODE_TXB2 ||
+         opcode == TGSI_OPCODE_TXL2 ||
+         opcode == TGSI_OPCODE_SAMPLE ||
+         opcode == TGSI_OPCODE_SAMPLE_B ||
+         opcode == TGSI_OPCODE_SAMPLE_C ||
+         opcode == TGSI_OPCODE_SAMPLE_C_LZ ||
+         opcode == TGSI_OPCODE_SAMPLE_D ||
+         opcode == TGSI_OPCODE_SAMPLE_I ||
+         opcode == TGSI_OPCODE_SAMPLE_I_MS ||
+         opcode == TGSI_OPCODE_SAMPLE_L ||
+         opcode == TGSI_OPCODE_SVIEWINFO ||
+         opcode == TGSI_OPCODE_CAL ||
+         opcode == TGSI_OPCODE_IF ||
+         opcode == TGSI_OPCODE_UIF ||
+         opcode == TGSI_OPCODE_BGNLOOP ||
+         opcode == TGSI_OPCODE_SWITCH)
+         return FALSE;
    }
 
    return TRUE;
@@ -2206,12 +2406,15 @@ emit_kill_if(
       }
    }
 
-   if(mask) {
-      lp_build_mask_update(bld->mask, mask);
-
-      if (!near_end_of_shader(bld, pc))
-        lp_build_mask_check(bld->mask);
+   if (bld->exec_mask.has_mask) {
+      LLVMValueRef invmask;
+      invmask = LLVMBuildNot(builder, bld->exec_mask.exec_mask, "kilp");
+      mask = LLVMBuildOr(builder, mask, invmask, "");
    }
+
+   lp_build_mask_update(bld->mask, mask);
+   if (!near_end_of_shader(bld, pc))
+      lp_build_mask_check(bld->mask);
 }
 
 
@@ -2250,42 +2453,79 @@ emit_kill(struct lp_build_tgsi_soa_context *bld,
  * to stdout.
  */
 static void
-emit_dump_temps(struct lp_build_tgsi_soa_context *bld)
+emit_dump_file(struct lp_build_tgsi_soa_context *bld,
+               unsigned file)
 {
+   const struct tgsi_shader_info *info = bld->bld_base.info;
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef temp_ptr;
-   LLVMValueRef i0 = lp_build_const_int32(gallivm, 0);
-   LLVMValueRef i1 = lp_build_const_int32(gallivm, 1);
-   LLVMValueRef i2 = lp_build_const_int32(gallivm, 2);
-   LLVMValueRef i3 = lp_build_const_int32(gallivm, 3);
+   LLVMValueRef reg_ptr;
    int index;
-   int n = bld->bld_base.info->file_max[TGSI_FILE_TEMPORARY];
+   int max_index = info->file_max[file];
 
-   for (index = 0; index < n; index++) {
-      LLVMValueRef idx = lp_build_const_int32(gallivm, index);
-      LLVMValueRef v[4][4], res;
+   /*
+    * Some register files, particularly constants, can be very large,
+    * and dumping everything could make this unusably slow.
+    */
+   max_index = MIN2(max_index, 32);
+
+   for (index = 0; index <= max_index; index++) {
+      LLVMValueRef res;
+      unsigned mask;
       int chan;
 
-      lp_build_printf(gallivm, "TEMP[%d]:\n", idx);
+      if (index < 8 * sizeof(unsigned) &&
+          (info->file_mask[file] & (1u << index)) == 0) {
+         /* This was not declared. */
+         continue;
+      }
+
+      if (file == TGSI_FILE_INPUT) {
+         mask = info->input_usage_mask[index];
+      } else {
+         mask = TGSI_WRITEMASK_XYZW;
+      }
 
       for (chan = 0; chan < 4; chan++) {
-         temp_ptr = lp_get_temp_ptr_soa(bld, index, chan);
-         res = LLVMBuildLoad(builder, temp_ptr, "");
-         v[chan][0] = LLVMBuildExtractElement(builder, res, i0, "");
-         v[chan][1] = LLVMBuildExtractElement(builder, res, i1, "");
-         v[chan][2] = LLVMBuildExtractElement(builder, res, i2, "");
-         v[chan][3] = LLVMBuildExtractElement(builder, res, i3, "");
-      }
+         if ((mask & (1 << chan)) == 0) {
+            /* This channel is not used. */
+            continue;
+         }
+
+         if (file == TGSI_FILE_CONSTANT) {
+            struct tgsi_full_src_register reg;
+            memset(&reg, 0, sizeof reg);
+            reg.Register.File = file;
+            reg.Register.Index = index;
+            reg.Register.SwizzleX = 0;
+            reg.Register.SwizzleY = 1;
+            reg.Register.SwizzleZ = 2;
+            reg.Register.SwizzleW = 3;
+
+            res = bld->bld_base.emit_fetch_funcs[file](&bld->bld_base, &reg, TGSI_TYPE_FLOAT, chan);
+            if (!res) {
+               continue;
+            }
+         } else if (file == TGSI_FILE_INPUT) {
+            res = bld->inputs[index][chan];
+            if (!res) {
+               continue;
+            }
+         } else if (file == TGSI_FILE_TEMPORARY) {
+            reg_ptr = lp_get_temp_ptr_soa(bld, index, chan);
+            assert(reg_ptr);
+            res = LLVMBuildLoad(builder, reg_ptr, "");
+         } else if (file == TGSI_FILE_OUTPUT) {
+            reg_ptr = lp_get_output_ptr(bld, index, chan);
+            assert(reg_ptr);
+            res = LLVMBuildLoad(builder, reg_ptr, "");
+         } else {
+            assert(0);
+            continue;
+         }
 
-      lp_build_printf(gallivm, "  X: %f %f %f %f\n",
-                      v[0][0], v[0][1], v[0][2], v[0][3]);
-      lp_build_printf(gallivm, "  Y: %f %f %f %f\n",
-                      v[1][0], v[1][1], v[1][2], v[1][3]);
-      lp_build_printf(gallivm, "  Z: %f %f %f %f\n",
-                      v[2][0], v[2][1], v[2][2], v[2][3]);
-      lp_build_printf(gallivm, "  W: %f %f %f %f\n",
-                      v[3][0], v[3][1], v[3][2], v[3][3]);
+         emit_dump_reg(gallivm, file, index, chan, res);
+      }
    }
 }
 
@@ -2303,56 +2543,90 @@ lp_emit_declaration_soa(
    const unsigned last = decl->Range.Last;
    unsigned idx, i;
 
-   for (idx = first; idx <= last; ++idx) {
-      assert(last <= bld->bld_base.info->file_max[decl->Declaration.File]);
-      switch (decl->Declaration.File) {
-      case TGSI_FILE_TEMPORARY:
-         assert(idx < LP_MAX_TGSI_TEMPS);
-         if (!(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY))) {
+   assert(last <= bld->bld_base.info->file_max[decl->Declaration.File]);
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_TEMPORARY:
+      if (!(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY))) {
+         assert(last < LP_MAX_INLINED_TEMPS);
+         for (idx = first; idx <= last; ++idx) {
             for (i = 0; i < TGSI_NUM_CHANNELS; i++)
                bld->temps[idx][i] = lp_build_alloca(gallivm, vec_type, "temp");
          }
-         break;
+      }
+      break;
 
-      case TGSI_FILE_OUTPUT:
-         if (!(bld->indirect_files & (1 << TGSI_FILE_OUTPUT))) {
+   case TGSI_FILE_OUTPUT:
+      if (!(bld->indirect_files & (1 << TGSI_FILE_OUTPUT))) {
+         for (idx = first; idx <= last; ++idx) {
             for (i = 0; i < TGSI_NUM_CHANNELS; i++)
                bld->outputs[idx][i] = lp_build_alloca(gallivm,
                                                       vec_type, "output");
          }
-         break;
+      }
+      break;
 
-      case TGSI_FILE_ADDRESS:
-        /* ADDR registers are only allocated with an integer LLVM IR type,
-         * as they are guaranteed to always have integers.
-         * XXX: Not sure if this exception is worthwhile (or the whole idea of
-         * an ADDR register for that matter).
-         */
+   case TGSI_FILE_ADDRESS:
+      /* ADDR registers are only allocated with an integer LLVM IR type,
+       * as they are guaranteed to always have integers.
+       * XXX: Not sure if this exception is worthwhile (or the whole idea of
+       * an ADDR register for that matter).
+       */
+      assert(last < LP_MAX_TGSI_ADDRS);
+      for (idx = first; idx <= last; ++idx) {
          assert(idx < LP_MAX_TGSI_ADDRS);
          for (i = 0; i < TGSI_NUM_CHANNELS; i++)
             bld->addr[idx][i] = lp_build_alloca(gallivm, bld_base->base.int_vec_type, "addr");
-         break;
-
-      case TGSI_FILE_PREDICATE:
-         assert(idx < LP_MAX_TGSI_PREDS);
-         for (i = 0; i < TGSI_NUM_CHANNELS; i++)
-            bld->preds[idx][i] = lp_build_alloca(gallivm, vec_type,
-                                                 "predicate");
-         break;
+      }
+      break;
 
-      case TGSI_FILE_SAMPLER_VIEW:
-         /*
-          * The target stored here MUST match whatever there actually
-          * is in the set sampler views (what about return type?).
-          */
-         assert(idx < PIPE_MAX_SHADER_SAMPLER_VIEWS);
+   case TGSI_FILE_SAMPLER_VIEW:
+      /*
+       * The target stored here MUST match whatever there actually
+       * is in the set sampler views (what about return type?).
+       */
+      assert(last < PIPE_MAX_SHADER_SAMPLER_VIEWS);
+      for (idx = first; idx <= last; ++idx) {
          bld->sv[idx] = decl->SamplerView;
-         break;
-
-      default:
-         /* don't need to declare other vars */
-         break;
       }
+      break;
+
+   case TGSI_FILE_CONSTANT:
+   {
+      /*
+       * We could trivially fetch the per-buffer pointer when fetching the
+       * constant, relying on llvm to figure out it's always the same pointer
+       * anyway. However, doing so results in a huge (more than factor of 10)
+       * slowdown in llvm compilation times for some (but not all) shaders
+       * (more specifically, the IR optimization spends way more time in
+       * DominatorTree::dominates). At least with llvm versions 3.1, 3.3.
+       */
+      unsigned idx2D = decl->Dim.Index2D;
+      LLVMValueRef index2D = lp_build_const_int32(gallivm, idx2D);
+      assert(idx2D < LP_MAX_TGSI_CONST_BUFFERS);
+      bld->consts[idx2D] =
+         lp_build_array_get(gallivm, bld->consts_ptr, index2D);
+      bld->consts_sizes[idx2D] =
+         lp_build_array_get(gallivm, bld->const_sizes_ptr, index2D);
+   }
+   break;
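   /*
    * I.e. the per-buffer pointer and size are looked up once here at
    * declaration time and cached in bld->consts[] / bld->consts_sizes[],
    * rather than re-doing the lp_build_array_get() on every constant fetch.
    */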
+   case TGSI_FILE_BUFFER:
+   {
+      unsigned idx = decl->Range.First;
+      LLVMValueRef index = lp_build_const_int32(gallivm, idx);
+      assert(idx < LP_MAX_TGSI_SHADER_BUFFERS);
+      bld->ssbos[idx] =
+         lp_build_array_get(gallivm, bld->ssbo_ptr, index);
+      bld->ssbo_sizes[idx] =
+         lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, index);
+
+   }
+   break;
+   case TGSI_FILE_MEMORY:
+      break;
+   default:
+      /* don't need to declare other vars */
+      break;
    }
 }
 
@@ -2363,51 +2637,75 @@ void lp_emit_immediate_soa(
 {
    struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
    struct gallivm_state * gallivm = bld_base->base.gallivm;
-
-   /* simply copy the immediate values into the next immediates[] slot */
+   LLVMValueRef imms[4];
    unsigned i;
    const uint size = imm->Immediate.NrTokens - 1;
    assert(size <= 4);
-   assert(bld->num_immediates < LP_MAX_TGSI_IMMEDIATES);
    switch (imm->Immediate.DataType) {
    case TGSI_IMM_FLOAT32:
       for( i = 0; i < size; ++i )
-         bld->immediates[bld->num_immediates][i] =
-            lp_build_const_vec(gallivm, bld_base->base.type, imm->u[i].Float);
+         imms[i] =
+               lp_build_const_vec(gallivm, bld_base->base.type, imm->u[i].Float);
 
       break;
+   case TGSI_IMM_FLOAT64:
+   case TGSI_IMM_UINT64:
+   case TGSI_IMM_INT64:
    case TGSI_IMM_UINT32:
       for( i = 0; i < size; ++i ) {
          LLVMValueRef tmp = lp_build_const_vec(gallivm, bld_base->uint_bld.type, imm->u[i].Uint);
-         bld->immediates[bld->num_immediates][i] =
-            LLVMConstBitCast(tmp, bld_base->base.vec_type);
+         imms[i] = LLVMConstBitCast(tmp, bld_base->base.vec_type);
       }
 
       break;
    case TGSI_IMM_INT32:
       for( i = 0; i < size; ++i ) {
          LLVMValueRef tmp = lp_build_const_vec(gallivm, bld_base->int_bld.type, imm->u[i].Int);
-         bld->immediates[bld->num_immediates][i] =
-            LLVMConstBitCast(tmp, bld_base->base.vec_type);
+         imms[i] = LLVMConstBitCast(tmp, bld_base->base.vec_type);
       }
-            
+
       break;
    }
    for( i = size; i < 4; ++i )
-      bld->immediates[bld->num_immediates][i] = bld_base->base.undef;
+      imms[i] = bld_base->base.undef;
 
-   if (bld->indirect_files & (1 << TGSI_FILE_IMMEDIATE)) {
+   if (bld->use_immediates_array) {
       unsigned index = bld->num_immediates;
       struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
       LLVMBuilderRef builder = gallivm->builder;
+      LLVMValueRef gep[2];
+      gep[0] = lp_build_const_int32(gallivm, 0);
+
+      assert(bld->indirect_files & (1 << TGSI_FILE_IMMEDIATE));
       for (i = 0; i < 4; ++i ) {
-         LLVMValueRef lindex = lp_build_const_int32(
-            bld->bld_base.base.gallivm, index * 4 + i);
+         gep[1] = lp_build_const_int32(gallivm, index * 4 + i);
          LLVMValueRef imm_ptr = LLVMBuildGEP(builder,
-                                             bld->imms_array, &lindex, 1, "");
-         LLVMBuildStore(builder, 
-                        bld->immediates[index][i],
-                        imm_ptr);
+                                             bld->imms_array, gep, 2, "");
+         LLVMBuildStore(builder, imms[i], imm_ptr);
+      }
+   } else {
+      /* simply copy the immediate values into the next immediates[] slot */
+      unsigned i;
+      assert(imm->Immediate.NrTokens - 1 <= 4);
+      assert(bld->num_immediates < LP_MAX_INLINED_IMMEDIATES);
+
+      for(i = 0; i < 4; ++i )
+         bld->immediates[bld->num_immediates][i] = imms[i];
+
+      if (bld->indirect_files & (1 << TGSI_FILE_IMMEDIATE)) {
+         unsigned index = bld->num_immediates;
+         struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+         LLVMBuilderRef builder = gallivm->builder;
+         LLVMValueRef gep[2];
+         gep[0] = lp_build_const_int32(gallivm, 0);
+         for (i = 0; i < 4; ++i ) {
+            gep[1] = lp_build_const_int32(gallivm, index * 4 + i);
+            LLVMValueRef imm_ptr = LLVMBuildGEP(builder,
+                                                bld->imms_array, gep, 2, "");
+            LLVMBuildStore(builder,
+                           bld->immediates[index][i],
+                           imm_ptr);
+         }
       }
    }
 
@@ -2468,7 +2766,20 @@ tex_emit(
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
-   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE, emit_data->output);
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+            emit_data->output, 1, LP_SAMPLER_OP_TEXTURE);
+}
+
+static void
+tex2_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+            emit_data->output, 2, LP_SAMPLER_OP_TEXTURE);
 }
 
 static void
@@ -2480,7 +2791,19 @@ txb_emit(
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
    emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
-            emit_data->output);
+            emit_data->output, 1, LP_SAMPLER_OP_TEXTURE);
+}
+
+static void
+txb2_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
+            emit_data->output, 2, LP_SAMPLER_OP_TEXTURE);
 }
 
 static void
@@ -2492,7 +2815,7 @@ txd_emit(
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
    emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV,
-            emit_data->output);
+            emit_data->output, 3, LP_SAMPLER_OP_TEXTURE);
 }
 
 static void
@@ -2504,149 +2827,738 @@ txl_emit(
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
    emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
-            emit_data->output);
+            emit_data->output, 1, LP_SAMPLER_OP_TEXTURE);
+}
+
+static void
+txl2_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
+            emit_data->output, 2, LP_SAMPLER_OP_TEXTURE);
+}
+
+static void
+txp_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_PROJECTED,
+            emit_data->output, 1, LP_SAMPLER_OP_TEXTURE);
+}
+
+static void
+tg4_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+            emit_data->output, 2, LP_SAMPLER_OP_GATHER);
+}
+
+static void
+lodq_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+            emit_data->output, 1, LP_SAMPLER_OP_LODQ);
+}
+
+static void
+txq_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_size_query(bld, emit_data->inst, emit_data->output, FALSE);
+}
+
+static void
+txf_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_fetch_texels(bld, emit_data->inst, emit_data->output, FALSE);
+}
+
+static void
+sample_i_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_fetch_texels(bld, emit_data->inst, emit_data->output, TRUE);
+}
+
+static void
+sample_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+               FALSE, LP_SAMPLER_OP_TEXTURE, emit_data->output);
+}
+
+static void
+sample_b_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
+               FALSE, LP_SAMPLER_OP_TEXTURE, emit_data->output);
+}
+
+static void
+sample_c_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+               TRUE, LP_SAMPLER_OP_TEXTURE, emit_data->output);
+}
+
+static void
+sample_c_lz_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_ZERO,
+               TRUE, LP_SAMPLER_OP_TEXTURE, emit_data->output);
+}
+
+static void
+sample_d_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV,
+               FALSE, LP_SAMPLER_OP_TEXTURE, emit_data->output);
+}
+
+static void
+sample_l_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
+               FALSE, LP_SAMPLER_OP_TEXTURE, emit_data->output);
+}
+
+static void
+gather4_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+               FALSE, LP_SAMPLER_OP_GATHER, emit_data->output);
+}
+
+static void
+sviewinfo_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_size_query(bld, emit_data->inst, emit_data->output, TRUE);
+}
+
+static void
+lod_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+               FALSE, LP_SAMPLER_OP_LODQ, emit_data->output);
+}
+
+static void target_to_dims_layer(unsigned target,
+                                 unsigned *dims,
+                                 unsigned *layer_coord)
+{
+   *layer_coord = 0;
+   switch (target) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_BUFFER:
+      *dims = 1;
+      break;
+   case TGSI_TEXTURE_1D_ARRAY:
+      *layer_coord = 1;
+      *dims = 1;
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+      *dims = 2;
+      break;
+   case TGSI_TEXTURE_2D_ARRAY:
+      *layer_coord = 2;
+      *dims = 2;
+      break;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      *dims = 3;
+      break;
+   default:
+      assert(0);
+      return;
+   }
 }
 
 static void
-txp_emit(
+img_load_emit(
    const struct lp_build_tgsi_action * action,
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct lp_img_params params;
+   LLVMValueRef coords[5];
+   LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type);
+   unsigned dims;
+   unsigned target = emit_data->inst->Memory.Texture;
+   unsigned layer_coord;
 
-   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_PROJECTED,
-            emit_data->output);
+   target_to_dims_layer(target, &dims, &layer_coord);
+
+   for (unsigned i = 0; i < dims; i++) {
+      coords[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, i);
+   }
+   for (unsigned i = dims; i < 5; i++) {
+      coords[i] = coord_undef;
+   }
+   if (layer_coord)
+      coords[2] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, layer_coord);
+
+   memset(&params, 0, sizeof(params));
+
+   params.type = bld->bld_base.base.type;
+   params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
+   params.coords = coords;
+   params.outdata = emit_data->output;
+   params.target = tgsi_to_pipe_tex_target(target);
+   params.image_index = emit_data->inst->Src[0].Register.Index;
+   params.img_op = LP_IMG_LOAD;
+   bld->image->emit_op(bld->image,
+                       bld->bld_base.base.gallivm,
+                       &params);
 }
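Putting target_to_dims_layer() and the loop above together: for a 2D array image the first two source components become the x/y texel coordinates, the layer is always forced into coords[2], and the remaining slots stay undefined. A minimal scalar model of that packing for one lane (the COORD_UNDEF placeholder standing in for LLVMGetUndef is an assumption of this sketch):

   #include <stdint.h>
   #include <stdio.h>

   #define COORD_UNDEF 0xdeadbeefu   /* stands in for LLVMGetUndef() */

   /* Packs coords[5] for a 2D array image op the way img_load_emit() does:
    * target_to_dims_layer() gives dims = 2 and layer_coord = 2 here. */
   static void pack_coords_2d_array(uint32_t x, uint32_t y, uint32_t layer,
                                    uint32_t coords[5])
   {
      const unsigned dims = 2, layer_coord = 2;

      coords[0] = x;
      coords[1] = y;
      for (unsigned i = dims; i < 5; i++)
         coords[i] = COORD_UNDEF;
      if (layer_coord)
         coords[2] = layer;   /* the layer always lands in coords[2] */
   }

   int main(void)
   {
      uint32_t coords[5];
      pack_coords_2d_array(3, 7, 1, coords);
      for (unsigned i = 0; i < 5; i++)
         printf("coords[%u] = 0x%x\n", i, coords[i]);
      return 0;
   }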
 
 static void
-txq_emit(
+load_emit(
    const struct lp_build_tgsi_action * action,
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct gallivm_state * gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   const struct tgsi_full_src_register *bufreg = &emit_data->inst->Src[0];
+   unsigned buf = bufreg->Register.Index;
+   assert(bufreg->Register.File == TGSI_FILE_BUFFER ||
+          bufreg->Register.File == TGSI_FILE_IMAGE ||
+          bufreg->Register.File == TGSI_FILE_MEMORY ||
+          bufreg->Register.File == TGSI_FILE_CONSTBUF);
+   bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
 
-   emit_size_query(bld, emit_data->inst, emit_data->output, FALSE);
-}
+   if (bufreg->Register.File == TGSI_FILE_IMAGE) {
+      img_load_emit(action, bld_base, emit_data);
+   } else if (bufreg->Register.File == TGSI_FILE_CONSTBUF) {
+      LLVMValueRef consts_ptr = bld->consts[buf];
+      LLVMValueRef num_consts = bld->consts_sizes[buf];
 
-static void
-txf_emit(
-   const struct lp_build_tgsi_action * action,
-   struct lp_build_tgsi_context * bld_base,
-   struct lp_build_emit_data * emit_data)
-{
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+      LLVMValueRef indirect_index;
+      LLVMValueRef overflow_mask;
 
-   emit_fetch_texels(bld, emit_data->inst, emit_data->output, FALSE);
-}
+      indirect_index = lp_build_emit_fetch(bld_base, emit_data->inst, 1, 0);
+      indirect_index = lp_build_shr_imm(uint_bld, indirect_index, 4);
 
-static void
-sample_i_emit(
-   const struct lp_build_tgsi_action * action,
-   struct lp_build_tgsi_context * bld_base,
-   struct lp_build_emit_data * emit_data)
-{
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+      /* All fetches are from the same constant buffer, so
+       * we need to propagate the size to a vector to do a
+       * vector comparison */
+      num_consts = lp_build_broadcast_scalar(uint_bld, num_consts);
 
-   emit_fetch_texels(bld, emit_data->inst, emit_data->output, TRUE);
+      /* Gather values from the constant buffer */
+      unsigned chan_index;
+      TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) {
+         /* Construct a boolean vector telling us which channels
+          * overflow the bound constant buffer */
+         overflow_mask = lp_build_compare(gallivm, uint_bld->type, PIPE_FUNC_GEQUAL,
+                                          indirect_index, num_consts);
+
+         /* index_vec = indirect_index * 4 */
+         LLVMValueRef index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec = lp_build_add(uint_bld, index_vec,
+                                  lp_build_const_int_vec(gallivm, uint_bld->type, chan_index));
+
+         emit_data->output[chan_index] = build_gather(bld_base, consts_ptr, index_vec, overflow_mask, NULL);
+      }
+   } else if (0) {
+      /* for indirect support with ARB_gpu_shader5 */
+   } else {
+      LLVMValueRef index;
+      LLVMValueRef scalar, scalar_ptr;
+      unsigned chan_index;
+
+      index = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, 0);
+      index = lp_build_shr_imm(uint_bld, index, 2);
+
+      scalar_ptr = is_shared ? bld->shared_ptr : bld->ssbos[buf];
+
+      LLVMValueRef ssbo_limit;
+
+      if (!is_shared) {
+         ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
+         ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      }
+
+      TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) {
+         LLVMValueRef loop_index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, chan_index));
+
+         LLVMValueRef exec_mask = mask_vec(bld_base);
+         if (!is_shared) {
+            LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit);
+            exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+         }
+
+         LLVMValueRef result = lp_build_alloca(gallivm, uint_bld->vec_type, "");
+         struct lp_build_loop_state loop_state;
+         lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+
+         struct lp_build_if_state ifthen;
+         LLVMValueRef cond, temp_res;
+
+         loop_index = LLVMBuildExtractElement(gallivm->builder, loop_index,
+                                              loop_state.counter, "");
+
+         cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
+         cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+
+         lp_build_if(&ifthen, gallivm, cond);
+         scalar = lp_build_pointer_get(builder, scalar_ptr, loop_index);
+
+         temp_res = LLVMBuildLoad(builder, result, "");
+         temp_res = LLVMBuildInsertElement(builder, temp_res, scalar, loop_state.counter, "");
+         LLVMBuildStore(builder, temp_res, result);
+         lp_build_else(&ifthen);
+         temp_res = LLVMBuildLoad(builder, result, "");
+         temp_res = LLVMBuildInsertElement(builder, temp_res, lp_build_const_int32(gallivm, 0), loop_state.counter, "");
+         LLVMBuildStore(builder, temp_res, result);
+         lp_build_endif(&ifthen);
+         lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length),
+                                NULL, LLVMIntUGE);
+         emit_data->output[chan_index] = LLVMBuildLoad(gallivm->builder, result, "");
+      }
+   }
 }
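Viewed per SIMD lane, the non-image LOAD path above is a bounds-checked, execution-masked gather: the byte offset fetched from Src[1] is shifted down to a dword index, each enabled destination channel adds its channel number, and lanes that are inactive or past the SSBO size produce 0 (shared memory skips the bounds check). A small standalone scalar model of the SSBO case, with an assumed 4-lane width and illustrative names:

   #include <stdint.h>
   #include <stdio.h>

   #define NUM_LANES 4   /* stands in for uint_bld->type.length */

   /* Scalar model of one enabled destination channel of LOAD from an SSBO. */
   static void masked_ssbo_load(const uint32_t *ssbo, uint32_t ssbo_size_bytes,
                                const uint32_t index[NUM_LANES],   /* byte offset >> 2 */
                                const uint32_t exec_mask[NUM_LANES],
                                unsigned chan_index, uint32_t out[NUM_LANES])
   {
      uint32_t limit = ssbo_size_bytes >> 2;        /* dwords, like the AShr by 2 */
      for (unsigned lane = 0; lane < NUM_LANES; lane++) {
         uint32_t idx = index[lane] + chan_index;   /* per-channel dword offset */
         if (exec_mask[lane] && idx < limit)        /* exec mask AND bounds check */
            out[lane] = ssbo[idx];
         else
            out[lane] = 0;                          /* inactive/OOB lanes read 0 */
      }
   }

   int main(void)
   {
      uint32_t buf[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
      uint32_t index[NUM_LANES] = { 0, 4, 100, 4 };      /* lane 2 is out of bounds */
      uint32_t mask[NUM_LANES]  = { ~0u, ~0u, ~0u, 0 };  /* lane 3 is inactive */
      uint32_t out[NUM_LANES];

      masked_ssbo_load(buf, sizeof(buf), index, mask, 1, out);
      for (unsigned lane = 0; lane < NUM_LANES; lane++)
         printf("lane %u: %u\n", lane, out[lane]);        /* 11 15 0 0 */
      return 0;
   }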
 
 static void
-sample_emit(
+img_store_emit(
    const struct lp_build_tgsi_action * action,
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct lp_img_params params;
+   LLVMValueRef coords[5];
+   LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type);
+   unsigned dims;
+   unsigned target = emit_data->inst->Memory.Texture;
+   unsigned layer_coord;
 
-   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
-               FALSE, emit_data->output);
+   target_to_dims_layer(target, &dims, &layer_coord);
+   for (unsigned i = 0; i < dims; i++) {
+      coords[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, i);
+   }
+   for (unsigned i = dims; i < 5; i++) {
+      coords[i] = coord_undef;
+   }
+   if (layer_coord)
+      coords[2] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, layer_coord);
+   memset(&params, 0, sizeof(params));
+
+   params.type = bld->bld_base.base.type;
+   params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
+   params.coords = coords;
+   params.outdata = NULL;
+   params.exec_mask = mask_vec(bld_base);
+   params.target = tgsi_to_pipe_tex_target(target);
+   params.image_index = emit_data->inst->Dst[0].Register.Index;
+   params.img_op = LP_IMG_STORE;
+   for (unsigned i = 0; i < 4; i++)
+      params.indata[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, i);
+
+   bld->image->emit_op(bld->image,
+                       bld->bld_base.base.gallivm,
+                       &params);
 }
 
 static void
-sample_b_emit(
+store_emit(
    const struct lp_build_tgsi_action * action,
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct gallivm_state * gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   const struct tgsi_full_dst_register *bufreg = &emit_data->inst->Dst[0];
+   unsigned buf = bufreg->Register.Index;
+   assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE || bufreg->Register.File == TGSI_FILE_MEMORY);
+   bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY;
 
-   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
-               FALSE, emit_data->output);
+   if (bufreg->Register.File == TGSI_FILE_IMAGE) {
+      img_store_emit(action, bld_base, emit_data);
+   } else if (0) {
+
+   } else {
+      LLVMValueRef index;  /* index into the ssbo/shared memory buffer */
+      LLVMValueRef scalar_ptr;
+      LLVMValueRef value;
+      unsigned chan_index;
+
+      index = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, 0);
+      index = lp_build_shr_imm(uint_bld, index, 2);
+
+      scalar_ptr = is_shared ? bld->shared_ptr : bld->ssbos[buf];
+
+      LLVMValueRef ssbo_limit;
+
+      if (!is_shared) {
+         ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
+         ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      }
+
+      TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(emit_data->inst, chan_index) {
+         LLVMValueRef loop_index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, chan_index));
+
+         value = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, chan_index);
+
+         LLVMValueRef exec_mask = mask_vec(bld_base);
+         if (!is_shared) {
+            LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit);
+            exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+         }
+
+         struct lp_build_loop_state loop_state;
+         lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+
+         LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, value,
+                                                          loop_state.counter, "");
+         value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, uint_bld->elem_type, "");
+
+         struct lp_build_if_state ifthen;
+         LLVMValueRef cond;
+
+         loop_index = LLVMBuildExtractElement(gallivm->builder, loop_index,
+                                              loop_state.counter, "");
+
+         cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
+         cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+         lp_build_if(&ifthen, gallivm, cond);
+
+         lp_build_pointer_set(builder, scalar_ptr, loop_index, value_ptr);
+
+         lp_build_endif(&ifthen);
+         lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length),
+                                NULL, LLVMIntUGE);
+      }
+   }
 }
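STORE mirrors the load path: per enabled channel, each lane's value is bit-cast to a dword and written only where the exec mask is set and, for SSBOs, the dword index is in bounds; masked or out-of-bounds lanes write nothing. A scalar model analogous to the load sketch above, under the same assumptions:

   #include <stdint.h>

   #define NUM_LANES 4

   /* Scalar model of one enabled destination channel of STORE to an SSBO. */
   static void masked_ssbo_store(uint32_t *ssbo, uint32_t ssbo_size_bytes,
                                 const uint32_t index[NUM_LANES],   /* byte offset >> 2 */
                                 const uint32_t value[NUM_LANES],   /* already bit-cast to dwords */
                                 const uint32_t exec_mask[NUM_LANES],
                                 unsigned chan_index)
   {
      uint32_t limit = ssbo_size_bytes >> 2;
      for (unsigned lane = 0; lane < NUM_LANES; lane++) {
         uint32_t idx = index[lane] + chan_index;
         if (exec_mask[lane] && idx < limit)   /* disabled/OOB lanes are dropped */
            ssbo[idx] = value[lane];
      }
   }

   int main(void)
   {
      uint32_t buf[8] = { 0 };
      uint32_t index[NUM_LANES] = { 0, 2, 100, 2 };
      uint32_t val[NUM_LANES]   = { 7, 8, 9, 10 };
      uint32_t mask[NUM_LANES]  = { ~0u, ~0u, ~0u, 0 };

      masked_ssbo_store(buf, sizeof(buf), index, val, mask, 1);
      return 0;   /* buf[1] == 7, buf[3] == 8; lanes 2 and 3 stored nothing */
   }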
 
 static void
-sample_c_emit(
+resq_emit(
    const struct lp_build_tgsi_action * action,
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   const struct tgsi_full_src_register *bufreg = &emit_data->inst->Src[0];
 
-   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
-               TRUE, emit_data->output);
+   unsigned buf = bufreg->Register.Index;
+   assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE);
+
+   if (bufreg->Register.File == TGSI_FILE_IMAGE) {
+      unsigned target = emit_data->inst->Memory.Texture;
+      struct lp_sampler_size_query_params params = { 0 };
+      params.int_type = bld->bld_base.int_bld.type;
+      params.texture_unit = buf;
+      params.target = tgsi_to_pipe_tex_target(target);
+      params.context_ptr = bld->context_ptr;
+      params.sizes_out = emit_data->output;
+
+      bld->image->emit_size_query(bld->image,
+                                  bld->bld_base.base.gallivm,
+                                  &params);
+   } else {
+      LLVMValueRef num_ssbo = bld->ssbo_sizes[buf];
+
+      emit_data->output[emit_data->chan] = lp_build_broadcast_scalar(uint_bld, num_ssbo);
+   }
 }
 
 static void
-sample_c_lz_emit(
+img_atomic_emit(
    const struct lp_build_tgsi_action * action,
    struct lp_build_tgsi_context * bld_base,
-   struct lp_build_emit_data * emit_data)
+   struct lp_build_emit_data * emit_data,
+   LLVMAtomicRMWBinOp op)
 {
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct lp_img_params params;
+   LLVMValueRef coords[5];
+   LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type);
+   unsigned dims;
+   unsigned layer_coord;
+   unsigned target = emit_data->inst->Memory.Texture;
 
-   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_ZERO,
-               TRUE, emit_data->output);
+   target_to_dims_layer(target, &dims, &layer_coord);
+
+   for (unsigned i = 0; i < dims; i++) {
+      coords[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, i);
+   }
+   for (unsigned i = dims; i < 5; i++) {
+      coords[i] = coord_undef;
+   }
+   if (layer_coord)
+      coords[2] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, layer_coord);
+   memset(&params, 0, sizeof(params));
+
+   params.type = bld->bld_base.base.type;
+   params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
+   params.exec_mask = mask_vec(bld_base);
+   params.image_index = emit_data->inst->Src[0].Register.Index;
+   params.coords = coords;
+   params.target = tgsi_to_pipe_tex_target(target);
+   params.op = op;
+   params.outdata = emit_data->output;
+   params.img_op = (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) ? LP_IMG_ATOMIC_CAS : LP_IMG_ATOMIC;
+
+   for (unsigned i = 0; i < 4; i++)
+      params.indata[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 2, i);
+   if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+      for (unsigned i = 0; i < 4; i++)
+         params.indata2[i] = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 3, i);
+   }
+   bld->image->emit_op(bld->image,
+                       bld->bld_base.base.gallivm,
+                       &params);
 }
 
 static void
-sample_d_emit(
+atomic_emit(
    const struct lp_build_tgsi_action * action,
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct gallivm_state * gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   const struct tgsi_full_src_register *bufreg = &emit_data->inst->Src[0];
 
-   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV,
-               FALSE, emit_data->output);
+   assert(bufreg->Register.File == TGSI_FILE_BUFFER || bufreg->Register.File == TGSI_FILE_IMAGE || bufreg->Register.File == TGSI_FILE_MEMORY);
+   unsigned buf = bufreg->Register.Index;
+   bool is_shared = bufreg->Register.File == TGSI_FILE_MEMORY;
+
+   LLVMAtomicRMWBinOp op;
+   switch (emit_data->inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ATOMUADD:
+      op = LLVMAtomicRMWBinOpAdd;
+      break;
+   case TGSI_OPCODE_ATOMXCHG:
+      op = LLVMAtomicRMWBinOpXchg;
+      break;
+   case TGSI_OPCODE_ATOMAND:
+      op = LLVMAtomicRMWBinOpAnd;
+      break;
+   case TGSI_OPCODE_ATOMOR:
+      op = LLVMAtomicRMWBinOpOr;
+      break;
+   case TGSI_OPCODE_ATOMXOR:
+      op = LLVMAtomicRMWBinOpXor;
+      break;
+   case TGSI_OPCODE_ATOMUMIN:
+      op = LLVMAtomicRMWBinOpUMin;
+      break;
+   case TGSI_OPCODE_ATOMUMAX:
+      op = LLVMAtomicRMWBinOpUMax;
+      break;
+   case TGSI_OPCODE_ATOMIMIN:
+      op = LLVMAtomicRMWBinOpMin;
+      break;
+   case TGSI_OPCODE_ATOMIMAX:
+      op = LLVMAtomicRMWBinOpMax;
+      break;
+   case TGSI_OPCODE_ATOMCAS:
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   if (bufreg->Register.File == TGSI_FILE_IMAGE) {
+      img_atomic_emit(action, bld_base, emit_data, op);
+   } else if (0) {
+   } else {
+      LLVMValueRef index;  /* index into the ssbo/shared memory buffer */
+      LLVMValueRef scalar, scalar_ptr;
+      LLVMValueRef value;
+
+      index = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 1, 0);
+      value = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 2, 0);
+
+      index = lp_build_shr_imm(uint_bld, index, 2);
+
+      if (!is_shared) {
+         index = lp_build_add(uint_bld, index, lp_build_const_int_vec(gallivm, uint_bld->type, emit_data->chan));
+         scalar_ptr = bld->ssbos[buf];
+      } else
+         scalar_ptr = bld->shared_ptr;
+
+      LLVMValueRef atom_res = lp_build_alloca(gallivm,
+                                              uint_bld->vec_type, "");
+
+      LLVMValueRef ssbo_limit;
+      if (!is_shared) {
+         ssbo_limit = LLVMBuildAShr(gallivm->builder, bld->ssbo_sizes[buf], lp_build_const_int32(gallivm, 2), "");
+         ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
+      }
+
+      LLVMValueRef exec_mask = mask_vec(bld_base);
+
+      if (!is_shared) {
+         LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, index, ssbo_limit);
+         exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+      }
+
+      struct lp_build_loop_state loop_state;
+      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+
+      LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, value,
+                                                       loop_state.counter, "");
+      value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, uint_bld->elem_type, "");
+
+      index = LLVMBuildExtractElement(gallivm->builder, index,
+                                      loop_state.counter, "");
+
+      scalar_ptr = LLVMBuildGEP(builder, scalar_ptr,
+                                &index, 1, "");
+
+      struct lp_build_if_state ifthen;
+      LLVMValueRef cond, temp_res;
+
+      cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
+      cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+      lp_build_if(&ifthen, gallivm, cond);
+
+      if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+         LLVMValueRef cas_src = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 3, 0);
+         LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, cas_src,
+                                                            loop_state.counter, "");
+         cas_src_ptr = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, uint_bld->elem_type, "");
+         scalar = LLVMBuildAtomicCmpXchg(builder, scalar_ptr, value_ptr,
+                                         cas_src_ptr,
+                                         LLVMAtomicOrderingSequentiallyConsistent,
+                                         LLVMAtomicOrderingSequentiallyConsistent,
+                                         false);
+         scalar = LLVMBuildExtractValue(gallivm->builder, scalar, 0, "");
+      } else {
+         scalar = LLVMBuildAtomicRMW(builder, op,
+                                     scalar_ptr, value_ptr,
+                                     LLVMAtomicOrderingSequentiallyConsistent,
+                                     false);
+      }
+      temp_res = LLVMBuildLoad(builder, atom_res, "");
+      temp_res = LLVMBuildInsertElement(builder, temp_res, scalar, loop_state.counter, "");
+      LLVMBuildStore(builder, temp_res, atom_res);
+      lp_build_else(&ifthen);
+      temp_res = LLVMBuildLoad(builder, atom_res, "");
+      temp_res = LLVMBuildInsertElement(builder, temp_res, lp_build_const_int32(gallivm, 0), loop_state.counter, "");
+      LLVMBuildStore(builder, temp_res, atom_res);
+      lp_build_endif(&ifthen);
+
+      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length),
+                             NULL, LLVMIntUGE);
+      emit_data->output[emit_data->chan] = LLVMBuildLoad(gallivm->builder, atom_res, "");
+   }
 }
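Each buffer atomic opcode is first mapped to an LLVMAtomicRMWBinOp (ATOMCAS being the exception, which goes through cmpxchg), and the operation is then issued one lane at a time, guarded by the exec mask and, for SSBOs, the size check. A per-lane C11 model of the ATOMUADD and ATOMCAS cases (lane count and names are illustrative assumptions; the per-channel offset added for SSBOs is omitted):

   #include <stdatomic.h>
   #include <stdint.h>
   #include <stdio.h>

   #define NUM_LANES 4

   /* ATOMUADD per lane: returns the previous value, like LLVMAtomicRMWBinOpAdd. */
   static void masked_atomic_add(_Atomic uint32_t *ssbo, uint32_t limit_dwords,
                                 const uint32_t index[NUM_LANES],
                                 const uint32_t src[NUM_LANES],
                                 const uint32_t exec_mask[NUM_LANES],
                                 uint32_t result[NUM_LANES])
   {
      for (unsigned lane = 0; lane < NUM_LANES; lane++) {
         if (exec_mask[lane] && index[lane] < limit_dwords)
            result[lane] = atomic_fetch_add(&ssbo[index[lane]], src[lane]);
         else
            result[lane] = 0;   /* inactive lanes produce 0, as in the else branch */
      }
   }

   /* ATOMCAS per lane: Src[2] is the compare value, Src[3] the replacement;
    * the result is the original memory value, matching ExtractValue(..., 0). */
   static void masked_atomic_cas(_Atomic uint32_t *ssbo, uint32_t limit_dwords,
                                 const uint32_t index[NUM_LANES],
                                 const uint32_t cmp[NUM_LANES],
                                 const uint32_t swap[NUM_LANES],
                                 const uint32_t exec_mask[NUM_LANES],
                                 uint32_t result[NUM_LANES])
   {
      for (unsigned lane = 0; lane < NUM_LANES; lane++) {
         if (exec_mask[lane] && index[lane] < limit_dwords) {
            uint32_t expected = cmp[lane];
            atomic_compare_exchange_strong(&ssbo[index[lane]], &expected, swap[lane]);
            result[lane] = expected;
         } else {
            result[lane] = 0;
         }
      }
   }

   int main(void)
   {
      _Atomic uint32_t buf[4] = { 1, 2, 3, 4 };
      uint32_t idx[NUM_LANES]  = { 0, 1, 2, 3 };
      uint32_t src[NUM_LANES]  = { 10, 10, 10, 10 };
      uint32_t mask[NUM_LANES] = { ~0u, 0, ~0u, ~0u };
      uint32_t res[NUM_LANES];

      masked_atomic_add(buf, 4, idx, src, mask, res);
      printf("old values: %u %u %u %u\n", res[0], res[1], res[2], res[3]); /* 1 0 3 4 */

      uint32_t cmp[NUM_LANES] = { 11, 2, 13, 14 };
      uint32_t swp[NUM_LANES] = { 0, 0, 0, 0 };
      masked_atomic_cas(buf, 4, idx, cmp, swp, mask, res);
      printf("old values: %u %u %u %u\n", res[0], res[1], res[2], res[3]); /* 11 0 13 14 */
      return 0;
   }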
 
 static void
-sample_l_emit(
+barrier_emit(
    const struct lp_build_tgsi_action * action,
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct gallivm_state * gallivm = bld_base->base.gallivm;
 
-   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
-               FALSE, emit_data->output);
+   LLVMBasicBlockRef resume = lp_build_insert_new_block(gallivm, "resume");
+
+   lp_build_coro_suspend_switch(gallivm, bld->coro, resume, false);
+   LLVMPositionBuilderAtEnd(gallivm->builder, resume);
 }
 
 static void
-sviewinfo_emit(
+membar_emit(
    const struct lp_build_tgsi_action * action,
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
-
-   emit_size_query(bld, emit_data->inst, emit_data->output, TRUE);
-}
-
-static LLVMValueRef
-mask_vec(struct lp_build_tgsi_context *bld_base)
-{
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
-   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   struct lp_exec_mask *exec_mask = &bld->exec_mask;
-
-   if (!exec_mask->has_mask) {
-      return lp_build_mask_value(bld->mask);
-   }
-   return LLVMBuildAnd(builder, lp_build_mask_value(bld->mask),
-                       exec_mask->exec_mask, "");
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMBuildFence(builder, LLVMAtomicOrderingSequentiallyConsistent, false, "");
 }
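MEMBAR is lowered to a single sequentially consistent, non-single-threaded fence. In C11 terms the generated instruction corresponds to:

   #include <stdatomic.h>

   /* C11 equivalent of LLVMBuildFence(..., SequentiallyConsistent, false, "") above. */
   static void membar(void)
   {
      atomic_thread_fence(memory_order_seq_cst);
   }

   int main(void)
   {
      membar();
      return 0;
   }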
 
 static void
@@ -2702,15 +3614,18 @@ emit_vertex(
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
 
    if (bld->gs_iface->emit_vertex) {
+      uint32_t imms_idx = emit_data->inst->Src[0].Register.SwizzleX;
+      LLVMValueRef stream_id = bld->immediates[0][imms_idx];
       LLVMValueRef mask = mask_vec(bld_base);
       LLVMValueRef total_emitted_vertices_vec =
          LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, "");
       mask = clamp_mask_to_max_output_vertices(bld, mask,
                                                total_emitted_vertices_vec);
       gather_outputs(bld);
-      bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base,
+      bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base.base,
                                  bld->outputs,
-                                 total_emitted_vertices_vec);
+                                 total_emitted_vertices_vec,
+                                 stream_id);
       increment_vec_ptr_by_mask(bld_base, bld->emitted_vertices_vec_ptr,
                                 mask);
       increment_vec_ptr_by_mask(bld_base, bld->total_emitted_vertices_vec_ptr,
@@ -2740,7 +3655,8 @@ end_primitive_masked(struct lp_build_tgsi_context * bld_base,
          LLVMBuildLoad(builder, bld->emitted_vertices_vec_ptr, "");
       LLVMValueRef emitted_prims_vec =
          LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, "");
-
+      LLVMValueRef total_emitted_vertices_vec =
+         LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, "");
       LLVMValueRef emitted_mask = lp_build_cmp(uint_bld, PIPE_FUNC_NOTEQUAL,
                                                emitted_vertices_vec,
                                                uint_bld->zero);
@@ -2750,9 +3666,11 @@ end_primitive_masked(struct lp_build_tgsi_context * bld_base,
          executes only on the paths that have unflushed vertices */
       mask = LLVMBuildAnd(builder, mask, emitted_mask, "");
 
-      bld->gs_iface->end_primitive(bld->gs_iface, &bld->bld_base,
+      bld->gs_iface->end_primitive(bld->gs_iface, &bld->bld_base.base,
+                                   total_emitted_vertices_vec,
                                    emitted_vertices_vec,
-                                   emitted_prims_vec);
+                                   emitted_prims_vec,
+                                   mask_vec(bld_base));
 
 #if DUMP_GS_EMITS
       lp_build_print_value(bld->bld_base.base.gallivm,
@@ -2825,25 +3743,7 @@ brk_emit(
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
-   lp_exec_break(&bld->exec_mask, bld_base);
-}
-
-static void
-breakc_emit(
-   const struct lp_build_tgsi_action * action,
-   struct lp_build_tgsi_context * bld_base,
-   struct lp_build_emit_data * emit_data)
-{
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
-   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
-   struct lp_build_context *uint_bld = &bld_base->uint_bld;
-   LLVMValueRef unsigned_cond = 
-      LLVMBuildBitCast(builder, emit_data->args[0], uint_bld->vec_type, "");
-   LLVMValueRef cond = lp_build_cmp(uint_bld, PIPE_FUNC_NOTEQUAL,
-                                    unsigned_cond,
-                                    uint_bld->zero);
-
-   lp_exec_break_condition(&bld->exec_mask, cond);
+   lp_exec_tgsi_break(&bld->exec_mask, bld_base);
 }
 
 static void
@@ -2927,7 +3827,7 @@ bgnloop_emit(
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
-   lp_exec_bgnloop(&bld->exec_mask);
+   lp_exec_bgnloop(&bld->exec_mask, true);
 }
 
 static void
@@ -2996,111 +3896,16 @@ cont_emit(
    lp_exec_continue(&bld->exec_mask);
 }
 
-/* XXX: Refactor and move it to lp_bld_tgsi_action.c
- *
- * XXX: What do the comments about xmm registers mean?  Maybe they are left over
- * from old code, but there is no garauntee that LLVM will use those registers
- * for this code.
- *
- * XXX: There should be no calls to lp_build_emit_fetch in this function.  This
- * should be handled by the emit_data->fetch_args function. */
-static void
-nrm_emit(
-   const struct lp_build_tgsi_action * action,
-   struct lp_build_tgsi_context * bld_base,
-   struct lp_build_emit_data * emit_data)
-{
-   LLVMValueRef tmp0, tmp1;
-   LLVMValueRef tmp4 = NULL;
-   LLVMValueRef tmp5 = NULL;
-   LLVMValueRef tmp6 = NULL;
-   LLVMValueRef tmp7 = NULL;
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
-
-   uint dims = (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
-
-  if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X) ||
-      TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Y) ||
-      TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Z) ||
-      (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_W) && dims == 4)) {
-
-      /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
-
-      /* xmm4 = src.x */
-      /* xmm0 = src.x * src.x */
-      tmp0 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_X);
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X)) {
-         tmp4 = tmp0;
-      }
-      tmp0 = lp_build_mul( &bld->bld_base.base, tmp0, tmp0);
-
-      /* xmm5 = src.y */
-      /* xmm0 = xmm0 + src.y * src.y */
-      tmp1 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_Y);
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Y)) {
-         tmp5 = tmp1;
-      }
-      tmp1 = lp_build_mul( &bld->bld_base.base, tmp1, tmp1);
-      tmp0 = lp_build_add( &bld->bld_base.base, tmp0, tmp1);
-
-      /* xmm6 = src.z */
-      /* xmm0 = xmm0 + src.z * src.z */
-      tmp1 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_Z);
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Z)) {
-         tmp6 = tmp1;
-      }
-      tmp1 = lp_build_mul( &bld->bld_base.base, tmp1, tmp1);
-      tmp0 = lp_build_add( &bld->bld_base.base, tmp0, tmp1);
-
-      if (dims == 4) {
-         /* xmm7 = src.w */
-         /* xmm0 = xmm0 + src.w * src.w */
-         tmp1 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_W);
-         if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_W)) {
-            tmp7 = tmp1;
-         }
-         tmp1 = lp_build_mul( &bld->bld_base.base, tmp1, tmp1);
-         tmp0 = lp_build_add( &bld->bld_base.base, tmp0, tmp1);
-      }
-      /* xmm1 = 1 / sqrt(xmm0) */
-      tmp1 = lp_build_rsqrt( &bld->bld_base.base, tmp0);
-       /* dst.x = xmm1 * src.x */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X)) {
-         emit_data->output[TGSI_CHAN_X] = lp_build_mul( &bld->bld_base.base, tmp4, tmp1);
-      }
-      /* dst.y = xmm1 * src.y */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Y)) {
-         emit_data->output[TGSI_CHAN_Y] = lp_build_mul( &bld->bld_base.base, tmp5, tmp1);
-      }
-
-      /* dst.z = xmm1 * src.z */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Z)) {
-         emit_data->output[TGSI_CHAN_Z] = lp_build_mul( &bld->bld_base.base, tmp6, tmp1);
-      }
-      /* dst.w = xmm1 * src.w */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X) && dims == 4) {
-         emit_data->output[TGSI_CHAN_W] = lp_build_mul( &bld->bld_base.base, tmp7, tmp1);
-      }
-   }
-
-   /* dst.w = 1.0 */
-   if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_W) && dims == 3) {
-       emit_data->output[TGSI_CHAN_W] = bld->bld_base.base.one;
-   }
-}
-
 static void emit_prologue(struct lp_build_tgsi_context * bld_base)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    struct gallivm_state * gallivm = bld_base->base.gallivm;
 
    if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
-      LLVMValueRef array_size =
-         lp_build_const_int32(gallivm,
-                         bld_base->info->file_max[TGSI_FILE_TEMPORARY] * 4 + 4);
-      bld->temps_array = lp_build_array_alloca(gallivm,
-                                              bld_base->base.vec_type, array_size,
-                                              "temp_array");
+      unsigned array_size = bld_base->info->file_max[TGSI_FILE_TEMPORARY] * 4 + 4;
+      bld->temps_array = lp_build_alloca_undef(gallivm,
+                                               LLVMArrayType(bld_base->base.vec_type, array_size),
+                                               "temp_array");
    }
 
    if (bld->indirect_files & (1 << TGSI_FILE_OUTPUT)) {
@@ -3113,11 +3918,9 @@ static void emit_prologue(struct lp_build_tgsi_context * bld_base)
    }
 
    if (bld->indirect_files & (1 << TGSI_FILE_IMMEDIATE)) {
-      LLVMValueRef array_size =
-         lp_build_const_int32(gallivm,
-                         bld_base->info->file_max[TGSI_FILE_IMMEDIATE] * 4 + 4);
-      bld->imms_array = lp_build_array_alloca(gallivm,
-                                              bld_base->base.vec_type, array_size,
+      unsigned array_size = bld_base->info->file_max[TGSI_FILE_IMMEDIATE] * 4 + 4;
+      bld->imms_array = lp_build_alloca_undef(gallivm,
+                                              LLVMArrayType(bld_base->base.vec_type, array_size),
                                               "imms_array");
    }
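Both spill arrays are now created as one alloca of an LLVM array type instead of lp_build_array_alloca with a runtime element count, so the size is a compile-time constant baked into the type. A stripped-down LLVM-C program contrasting the two forms (the 8 x float element type and the size of 16 are just examples; lp_build_alloca_undef presumably also handles entry-block placement and the undef initializer, which this sketch skips):

   #include <llvm-c/Core.h>

   int main(void)
   {
      LLVMContextRef ctx = LLVMContextCreate();
      LLVMModuleRef mod = LLVMModuleCreateWithNameInContext("alloca_demo", ctx);
      LLVMTypeRef fn_type = LLVMFunctionType(LLVMVoidTypeInContext(ctx), NULL, 0, 0);
      LLVMValueRef fn = LLVMAddFunction(mod, "demo", fn_type);
      LLVMBasicBlockRef entry = LLVMAppendBasicBlockInContext(ctx, fn, "entry");
      LLVMBuilderRef b = LLVMCreateBuilderInContext(ctx);
      LLVMPositionBuilderAtEnd(b, entry);

      LLVMTypeRef vec = LLVMVectorType(LLVMFloatTypeInContext(ctx), 8);
      unsigned array_size = 16;   /* stands in for file_max[...] * 4 + 4 */

      /* Old style: alloca with a runtime count -> "alloca <8 x float>, i32 16". */
      LLVMValueRef count = LLVMConstInt(LLVMInt32TypeInContext(ctx), array_size, 0);
      LLVMBuildArrayAlloca(b, vec, count, "temp_array_old");

      /* New style: alloca of an array type -> "alloca [16 x <8 x float>]". */
      LLVMBuildAlloca(b, LLVMArrayType(vec, array_size), "temp_array_new");

      LLVMBuildRetVoid(b);
      LLVMDumpModule(mod);

      LLVMDisposeBuilder(b);
      LLVMDisposeModule(mod);
      LLVMContextDispose(ctx);
      return 0;
   }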
 
@@ -3171,6 +3974,13 @@ static void emit_prologue(struct lp_build_tgsi_context * bld_base)
       LLVMBuildStore(gallivm->builder, uint_bld->zero,
                      bld->total_emitted_vertices_vec_ptr);
    }
+
+   if (DEBUG_EXECUTION) {
+      lp_build_printf(gallivm, "\n");
+      emit_dump_file(bld, TGSI_FILE_CONSTANT);
+      if (!bld->gs_iface)
+         emit_dump_file(bld, TGSI_FILE_INPUT);
+   }
 }
 
 static void emit_epilogue(struct lp_build_tgsi_context * bld_base)
@@ -3178,9 +3988,13 @@ static void emit_epilogue(struct lp_build_tgsi_context * bld_base)
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    LLVMBuilderRef builder = bld_base->base.gallivm->builder;
 
-   if (0) {
+   if (DEBUG_EXECUTION) {
       /* for debugging */
-      emit_dump_temps(bld);
+      if (0) {
+         emit_dump_file(bld, TGSI_FILE_TEMPORARY);
+      }
+      emit_dump_file(bld, TGSI_FILE_OUTPUT);
+      lp_build_printf(bld_base->base.gallivm, "\n");
    }
 
    /* If we have indirect addressing in outputs we need to copy our alloca array
@@ -3199,7 +4013,6 @@ static void emit_epilogue(struct lp_build_tgsi_context * bld_base)
          LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, "");
 
       bld->gs_iface->gs_epilogue(bld->gs_iface,
-                                 &bld->bld_base,
                                  total_emitted_vertices_vec,
                                  emitted_prims_vec);
    } else {
@@ -3210,18 +4023,11 @@ static void emit_epilogue(struct lp_build_tgsi_context * bld_base)
 void
 lp_build_tgsi_soa(struct gallivm_state *gallivm,
                   const struct tgsi_token *tokens,
-                  struct lp_type type,
-                  struct lp_build_mask_context *mask,
-                  LLVMValueRef consts_ptr,
-                  const struct lp_bld_tgsi_system_values *system_values,
-                  const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS],
-                  LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
-                  struct lp_build_sampler_soa *sampler,
-                  const struct tgsi_shader_info *info,
-                  const struct lp_build_tgsi_gs_iface *gs_iface)
+                  const struct lp_build_tgsi_params *params,
+                  LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS])
 {
    struct lp_build_tgsi_soa_context bld;
-
+   struct lp_type type = params->type;
    struct lp_type res_type;
 
    assert(type.length <= LP_MAX_VECTOR_LENGTH);
@@ -3236,15 +4042,62 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    lp_build_context_init(&bld.bld_base.uint_bld, gallivm, lp_uint_type(type));
    lp_build_context_init(&bld.bld_base.int_bld, gallivm, lp_int_type(type));
    lp_build_context_init(&bld.elem_bld, gallivm, lp_elem_type(type));
-   bld.mask = mask;
-   bld.inputs = inputs;
+   {
+      struct lp_type dbl_type;
+      dbl_type = type;
+      dbl_type.width *= 2;
+      lp_build_context_init(&bld.bld_base.dbl_bld, gallivm, dbl_type);
+   }
+   {
+      struct lp_type uint64_type;
+      uint64_type = lp_uint_type(type);
+      uint64_type.width *= 2;
+      lp_build_context_init(&bld.bld_base.uint64_bld, gallivm, uint64_type);
+   }
+   {
+      struct lp_type int64_type;
+      int64_type = lp_int_type(type);
+      int64_type.width *= 2;
+      lp_build_context_init(&bld.bld_base.int64_bld, gallivm, int64_type);
+   }
+   bld.mask = params->mask;
+   bld.inputs = params->inputs;
    bld.outputs = outputs;
-   bld.consts_ptr = consts_ptr;
-   bld.sampler = sampler;
-   bld.bld_base.info = info;
-   bld.indirect_files = info->indirect_files;
+   bld.consts_ptr = params->consts_ptr;
+   bld.const_sizes_ptr = params->const_sizes_ptr;
+   bld.ssbo_ptr = params->ssbo_ptr;
+   bld.ssbo_sizes_ptr = params->ssbo_sizes_ptr;
+   bld.sampler = params->sampler;
+   bld.bld_base.info = params->info;
+   bld.indirect_files = params->info->indirect_files;
+   bld.context_ptr = params->context_ptr;
+   bld.thread_data_ptr = params->thread_data_ptr;
+   bld.image = params->image;
+   bld.shared_ptr = params->shared_ptr;
+   bld.coro = params->coro;
+
+   /*
+    * If the number of temporaries exceeds LP_MAX_INLINED_TEMPS we allocate
+    * them as an array right from the start and treat them like indirect
+    * temporaries.
+    */
+   if (params->info->file_max[TGSI_FILE_TEMPORARY] >= LP_MAX_INLINED_TEMPS) {
+      bld.indirect_files |= (1 << TGSI_FILE_TEMPORARY);
+   }
+   /*
+    * For performance reasons immediates are normally backed by a static
+    * array, but if there are more than LP_MAX_INLINED_IMMEDIATES we have
+    * to fall back to a dynamically allocated array.
+    */
+   bld.use_immediates_array =
+         (params->info->file_max[TGSI_FILE_IMMEDIATE] >= LP_MAX_INLINED_IMMEDIATES);
+   if (bld.use_immediates_array) {
+      bld.indirect_files |= (1 << TGSI_FILE_IMMEDIATE);
+   }
+
 
    bld.bld_base.soa = TRUE;
+   bld.bld_base.emit_debug = emit_debug;
    bld.bld_base.emit_fetch_funcs[TGSI_FILE_CONSTANT] = emit_fetch_constant;
    bld.bld_base.emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch_immediate;
    bld.bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_input;
@@ -3264,7 +4117,6 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.op_actions[TGSI_OPCODE_BGNLOOP].emit = bgnloop_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_BGNSUB].emit = bgnsub_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_BRK].emit = brk_emit;
-   bld.bld_base.op_actions[TGSI_OPCODE_BREAKC].emit = breakc_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_CAL].emit = cal_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_CASE].emit = case_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_CONT].emit = cont_emit;
@@ -3280,17 +4132,22 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.op_actions[TGSI_OPCODE_UIF].emit = uif_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_KILL_IF].emit = kill_if_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_KILL].emit = kill_emit;
-   bld.bld_base.op_actions[TGSI_OPCODE_NRM].emit = nrm_emit;
-   bld.bld_base.op_actions[TGSI_OPCODE_NRM4].emit = nrm_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_RET].emit = ret_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_SWITCH].emit = switch_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TEX].emit = tex_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXB].emit = txb_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXD].emit = txd_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXL].emit = txl_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TEX_LZ].emit = txl_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXP].emit = txp_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXF].emit = txf_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TXF_LZ].emit = txf_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TEX2].emit = tex2_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TXB2].emit = txb2_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TXL2].emit = txl2_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TG4].emit = tg4_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_LODQ].emit = lodq_emit;
    /* DX10 sampling ops */
    bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE].emit = sample_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_B].emit = sample_b_emit;
@@ -3298,30 +4155,50 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_C_LZ].emit = sample_c_lz_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_D].emit = sample_d_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_I].emit = sample_i_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_I_MS].emit = sample_i_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_L].emit = sample_l_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_GATHER4].emit = gather4_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_SVIEWINFO].emit = sviewinfo_emit;
-
-   if (gs_iface) {
+   bld.bld_base.op_actions[TGSI_OPCODE_LOD].emit = lod_emit;
+
+   bld.bld_base.op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_STORE].emit = store_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
+
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMUADD].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMXCHG].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMCAS].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMAND].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMOR].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMXOR].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMUMIN].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMUMAX].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMIMIN].emit = atomic_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ATOMIMAX].emit = atomic_emit;
+
+   bld.bld_base.op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_BARRIER].emit = barrier_emit;
+
+   if (params->gs_iface) {
       /* There's no specific value for this because it should always
        * be set, but apps using ext_geometry_shader4 quite often
        * were forgetting so we're using MAX_VERTEX_VARYING from
        * that spec even though we could debug_assert if it's not
        * set, but that's a lot uglier. */
-      uint max_output_vertices = 32;
-      uint i = 0;
+      uint max_output_vertices;
+
       /* inputs are always indirect with gs */
       bld.indirect_files |= (1 << TGSI_FILE_INPUT);
-      bld.gs_iface = gs_iface;
+      bld.gs_iface = params->gs_iface;
       bld.bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_gs_input;
       bld.bld_base.op_actions[TGSI_OPCODE_EMIT].emit = emit_vertex;
       bld.bld_base.op_actions[TGSI_OPCODE_ENDPRIM].emit = end_primitive;
 
-      for (i = 0; i < info->num_properties; ++i) {
-         if (info->properties[i].name ==
-             TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
-            max_output_vertices = info->properties[i].data[0];
-         }
-      }
+      max_output_vertices =
+         params->info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
+      if (!max_output_vertices)
+         max_output_vertices = 32;
+
       bld.max_output_vertices_vec =
          lp_build_const_int_vec(gallivm, bld.bld_base.int_bld.type,
                                 max_output_vertices);
@@ -3329,7 +4206,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
 
    lp_exec_mask_init(&bld.exec_mask, &bld.bld_base.int_bld);
 
-   bld.system_values = *system_values;
+   bld.system_values = *params->system_values;
 
    lp_build_tgsi_llvm(&bld.bld_base, tokens);
 
@@ -3348,4 +4225,5 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
       LLVMDumpModule(module);
 
    }
+   lp_exec_mask_fini(&bld.exec_mask);
 }
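With this change lp_build_tgsi_soa() takes a single lp_build_tgsi_params struct plus the outputs array instead of a long positional argument list. A hedged sketch of how a caller using the old signature (removed above) might migrate; only fields whose types are visible in this diff are set, and the new fields (context_ptr, thread_data_ptr, const_sizes_ptr, ssbo_ptr, ssbo_sizes_ptr, image, shared_ptr, coro) are left zeroed by the memset and would be filled in from lp_bld_tgsi.h by shaders that need them:

   #include <string.h>
   #include "lp_bld_tgsi.h"

   /* Sketch only: the parameter list mirrors the old positional signature. */
   static void
   build_tgsi_body(struct gallivm_state *gallivm,
                   const struct tgsi_token *tokens,
                   struct lp_type type,
                   struct lp_build_mask_context *mask,
                   LLVMValueRef consts_ptr,
                   const struct lp_bld_tgsi_system_values *system_values,
                   const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS],
                   LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
                   struct lp_build_sampler_soa *sampler,
                   const struct tgsi_shader_info *info,
                   const struct lp_build_tgsi_gs_iface *gs_iface)
   {
      struct lp_build_tgsi_params params;
      memset(&params, 0, sizeof(params));

      params.type = type;
      params.mask = mask;
      params.consts_ptr = consts_ptr;
      params.system_values = system_values;
      params.inputs = inputs;
      params.sampler = sampler;
      params.info = info;
      params.gs_iface = gs_iface;

      lp_build_tgsi_soa(gallivm, tokens, &params, outputs);
   }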