freedreno/ir3: fix register usage calculations

[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_tgsi_soa.c
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c

index 2afdd3027e03d638d86fa7500a51a9a529660f51..3b2097f8521bac5cf1c88edfeb5e3c9e1d879884 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1,7 +1,7 @@
  /**************************************************************************
   * 
   * Copyright 2009 VMware, Inc.
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2007-2008 VMware, Inc.
   * All Rights Reserved.
   * 
   * Permission is hereby granted, free of charge, to any person obtaining a
@@ -19,7 +19,7 @@
   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
   * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -47,6 +47,7 @@
  #include "tgsi/tgsi_parse.h"
  #include "tgsi/tgsi_util.h"
  #include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_strings.h"
  #include "lp_bld_tgsi_action.h"
  #include "lp_bld_type.h"
  #include "lp_bld_const.h"
@@ -63,36 +64,171 @@
  #include "lp_bld_debug.h"
  #include "lp_bld_printf.h"
  #include "lp_bld_sample.h"
+#include "lp_bld_struct.h"
  
+/* SM 4.0 says that subroutines can nest 32 deep and 
+ * we need one more for our main function */
+#define LP_MAX_NUM_FUNCS 33
  
-static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
+#define DUMP_GS_EMITS 0
+
+/*
+ * If non-zero, the generated LLVM IR will print intermediate results on every TGSI
+ * instruction.
+ *
+ * TODO:
+ * - take execution masks in consideration
+ * - debug control-flow instructions
+ */
+#define DEBUG_EXECUTION 0
+
+
+/*
+ * Emit code to print a register value.
+ */
+static void
+emit_dump_reg(struct gallivm_state *gallivm,
+              unsigned file,
+              unsigned index,
+              unsigned chan,
+              LLVMValueRef value)
  {
-   LLVMTypeRef int_type = LLVMInt32TypeInContext(bld->gallivm->context);
-   LLVMBuilderRef builder = bld->gallivm->builder;
+   char buf[32];
+
+   util_snprintf(buf, sizeof buf, "    %s[%u].%c = ",
+                 tgsi_file_name(file),
+                 index, "xyzw"[chan]);
+
+   lp_build_print_value(gallivm, buf, value);
+}
+
+/*
+ * Return the context for the current function.
+ * (always 'main', if shader doesn't do any function calls)
+ */
+static INLINE struct function_ctx *
+func_ctx(struct lp_exec_mask *mask)
+{
+   assert(mask->function_stack_size > 0);
+   assert(mask->function_stack_size <= LP_MAX_NUM_FUNCS);
+   return &mask->function_stack[mask->function_stack_size - 1];
+}
+
+/*
+ * Returns true if we're in a loop.
+ * It's global, meaning that it returns true even if there's
+ * no loop inside the current function, but we were inside
+ * a loop inside another function, from which this one was called.
+ */
+static INLINE boolean
+mask_has_loop(struct lp_exec_mask *mask)
+{
+   int i;
+   for (i = mask->function_stack_size - 1; i >= 0; --i) {
+      const struct function_ctx *ctx = &mask->function_stack[i];
+      if (ctx->loop_stack_size > 0)
+         return TRUE;
+   }
+   return FALSE;
+}
+
+/*
+ * Returns true if we're inside a switch statement.
+ * It's global, meaning that it returns true even if there's
+ * no switch in the current function, but we were inside
+ * a switch inside another function, from which this one was called.
+ */
+static INLINE boolean
+mask_has_switch(struct lp_exec_mask *mask)
+{
+   int i;
+   for (i = mask->function_stack_size - 1; i >= 0; --i) {
+      const struct function_ctx *ctx = &mask->function_stack[i];
+      if (ctx->switch_stack_size > 0)
+         return TRUE;
+   }
+   return FALSE;
+}
+
+/*
+ * Returns true if we're inside a conditional.
+ * It's global, meaning that it returns true even if there's
+ * no conditional in the current function, but we were inside
+ * a conditional inside another function, from which this one was called.
+ */
+static INLINE boolean
+mask_has_cond(struct lp_exec_mask *mask)
+{
+   int i;
+   for (i = mask->function_stack_size - 1; i >= 0; --i) {
+      const struct function_ctx *ctx = &mask->function_stack[i];
+      if (ctx->cond_stack_size > 0)
+         return TRUE;
+   }
+   return FALSE;
+}
+
+
+/*
+ * Initialize a function context at the specified index.
+ */
+static void
+lp_exec_mask_function_init(struct lp_exec_mask *mask, int function_idx)
+{
+   LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context);
+   LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx =  &mask->function_stack[function_idx];
+
+   ctx->cond_stack_size = 0;
+   ctx->loop_stack_size = 0;
+   ctx->switch_stack_size = 0;
+
+   if (function_idx == 0) {
+      ctx->ret_mask = mask->ret_mask;
+   }
+
+   ctx->loop_limiter = lp_build_alloca(mask->bld->gallivm,
+                                       int_type, "looplimiter");
+   LLVMBuildStore(
+      builder,
+      LLVMConstInt(int_type, LP_MAX_TGSI_LOOP_ITERATIONS, false),
+      ctx->loop_limiter);
+}
  
+static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
+{
     mask->bld = bld;
     mask->has_mask = FALSE;
-   mask->cond_stack_size = 0;
-   mask->loop_stack_size = 0;
-   mask->call_stack_size = 0;
+   mask->ret_in_main = FALSE;
+   /* For the main function */
+   mask->function_stack_size = 1;
  
     mask->int_vec_type = lp_build_int_vec_type(bld->gallivm, mask->bld->type);
-   mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask = mask->cond_mask =
+   mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask =
+         mask->cond_mask = mask->switch_mask =
           LLVMConstAllOnes(mask->int_vec_type);
  
-   mask->loop_limiter = lp_build_alloca(bld->gallivm, int_type, "looplimiter");
+   mask->function_stack = CALLOC(LP_MAX_NUM_FUNCS,
+                                 sizeof(mask->function_stack[0]));
+   lp_exec_mask_function_init(mask, 0);
+}
  
-   LLVMBuildStore(
-      builder,
-      LLVMConstInt(int_type, LP_MAX_TGSI_LOOP_ITERATIONS, false),
-      mask->loop_limiter);
+static void
+lp_exec_mask_fini(struct lp_exec_mask *mask)
+{
+   FREE(mask->function_stack);
  }
  
  static void lp_exec_mask_update(struct lp_exec_mask *mask)
  {
     LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   boolean has_loop_mask = mask_has_loop(mask);
+   boolean has_cond_mask = mask_has_cond(mask);
+   boolean has_switch_mask = mask_has_switch(mask);
+   boolean has_ret_mask = mask->function_stack_size > 1 ||
+         mask->ret_in_main;
  
-   if (mask->loop_stack_size) {
+   if (has_loop_mask) {
        /*for loops we need to update the entire mask at runtime */
        LLVMValueRef tmp;
        assert(mask->break_mask);
@@ -107,28 +243,40 @@ static void lp_exec_mask_update(struct lp_exec_mask *mask)
     } else
        mask->exec_mask = mask->cond_mask;
  
-   if (mask->call_stack_size) {
+   if (has_switch_mask) {
+      mask->exec_mask = LLVMBuildAnd(builder,
+                                     mask->exec_mask,
+                                     mask->switch_mask,
+                                     "switchmask");
+   }
+
+   if (has_ret_mask) {
        mask->exec_mask = LLVMBuildAnd(builder,
                                       mask->exec_mask,
                                       mask->ret_mask,
                                       "callmask");
     }
  
-   mask->has_mask = (mask->cond_stack_size > 0 ||
-                     mask->loop_stack_size > 0 ||
-                     mask->call_stack_size > 0);
+   mask->has_mask = (has_cond_mask ||
+                     has_loop_mask ||
+                     has_switch_mask ||
+                     has_ret_mask);
  }
  
  static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
                                     LLVMValueRef val)
  {
     LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
  
-   assert(mask->cond_stack_size < LP_MAX_TGSI_NESTING);
-   if (mask->cond_stack_size == 0) {
+   if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING) {
+      ctx->cond_stack_size++;
+      return;
+   }
+   if (ctx->cond_stack_size == 0 && mask->function_stack_size == 1) {
        assert(mask->cond_mask == LLVMConstAllOnes(mask->int_vec_type));
     }
-   mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask;
+   ctx->cond_stack[ctx->cond_stack_size++] = mask->cond_mask;
     assert(LLVMTypeOf(val) == mask->int_vec_type);
     mask->cond_mask = LLVMBuildAnd(builder,
                                    mask->cond_mask,
@@ -140,12 +288,15 @@ static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
  static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask)
  {
     LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
     LLVMValueRef prev_mask;
     LLVMValueRef inv_mask;
  
-   assert(mask->cond_stack_size);
-   prev_mask = mask->cond_stack[mask->cond_stack_size - 1];
-   if (mask->cond_stack_size == 1) {
+   assert(ctx->cond_stack_size);
+   if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING)
+      return;
+   prev_mask = ctx->cond_stack[ctx->cond_stack_size - 1];
+   if (ctx->cond_stack_size == 1 && mask->function_stack_size == 1) {
        assert(prev_mask == LLVMConstAllOnes(mask->int_vec_type));
     }
  
@@ -159,53 +310,118 @@ static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask)
  
  static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
  {
-   assert(mask->cond_stack_size);
-   mask->cond_mask = mask->cond_stack[--mask->cond_stack_size];
+   struct function_ctx *ctx = func_ctx(mask);
+   assert(ctx->cond_stack_size);
+   --ctx->cond_stack_size;
+   if (ctx->cond_stack_size >= LP_MAX_TGSI_NESTING)
+      return;
+   mask->cond_mask = ctx->cond_stack[ctx->cond_stack_size];
     lp_exec_mask_update(mask);
  }
  
  static void lp_exec_bgnloop(struct lp_exec_mask *mask)
  {
     LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
  
-   if (mask->loop_stack_size == 0) {
-      assert(mask->loop_block == NULL);
-      assert(mask->cont_mask == LLVMConstAllOnes(mask->int_vec_type));
-      assert(mask->break_mask == LLVMConstAllOnes(mask->int_vec_type));
-      assert(mask->break_var == NULL);
+   if (ctx->loop_stack_size >= LP_MAX_TGSI_NESTING) {
+      ++ctx->loop_stack_size;
+      return;
     }
  
-   assert(mask->loop_stack_size < LP_MAX_TGSI_NESTING);
+   ctx->break_type_stack[ctx->loop_stack_size + ctx->switch_stack_size] =
+      ctx->break_type;
+   ctx->break_type = LP_EXEC_MASK_BREAK_TYPE_LOOP;
  
-   mask->loop_stack[mask->loop_stack_size].loop_block = mask->loop_block;
-   mask->loop_stack[mask->loop_stack_size].cont_mask = mask->cont_mask;
-   mask->loop_stack[mask->loop_stack_size].break_mask = mask->break_mask;
-   mask->loop_stack[mask->loop_stack_size].break_var = mask->break_var;
-   ++mask->loop_stack_size;
+   ctx->loop_stack[ctx->loop_stack_size].loop_block = ctx->loop_block;
+   ctx->loop_stack[ctx->loop_stack_size].cont_mask = mask->cont_mask;
+   ctx->loop_stack[ctx->loop_stack_size].break_mask = mask->break_mask;
+   ctx->loop_stack[ctx->loop_stack_size].break_var = ctx->break_var;
+   ++ctx->loop_stack_size;
  
-   mask->break_var = lp_build_alloca(mask->bld->gallivm, mask->int_vec_type, "");
-   LLVMBuildStore(builder, mask->break_mask, mask->break_var);
+   ctx->break_var = lp_build_alloca(mask->bld->gallivm, mask->int_vec_type, "");
+   LLVMBuildStore(builder, mask->break_mask, ctx->break_var);
  
-   mask->loop_block = lp_build_insert_new_block(mask->bld->gallivm, "bgnloop");
+   ctx->loop_block = lp_build_insert_new_block(mask->bld->gallivm, "bgnloop");
  
-   LLVMBuildBr(builder, mask->loop_block);
-   LLVMPositionBuilderAtEnd(builder, mask->loop_block);
+   LLVMBuildBr(builder, ctx->loop_block);
+   LLVMPositionBuilderAtEnd(builder, ctx->loop_block);
  
-   mask->break_mask = LLVMBuildLoad(builder, mask->break_var, "");
+   mask->break_mask = LLVMBuildLoad(builder, ctx->break_var, "");
  
     lp_exec_mask_update(mask);
  }
  
-static void lp_exec_break(struct lp_exec_mask *mask)
+static void lp_exec_break(struct lp_exec_mask *mask,
+                          struct lp_build_tgsi_context * bld_base)
  {
     LLVMBuilderRef builder = mask->bld->gallivm->builder;
-   LLVMValueRef exec_mask = LLVMBuildNot(builder,
+   struct function_ctx *ctx = func_ctx(mask);
+
+   if (ctx->break_type == LP_EXEC_MASK_BREAK_TYPE_LOOP) {
+      LLVMValueRef exec_mask = LLVMBuildNot(builder,
+                                            mask->exec_mask,
+                                            "break");
+
+      mask->break_mask = LLVMBuildAnd(builder,
+                                      mask->break_mask,
+                                      exec_mask, "break_full");
+   }
+   else {
+      unsigned opcode = bld_base->instructions[bld_base->pc + 1].Instruction.Opcode;
+      boolean break_always = (opcode == TGSI_OPCODE_ENDSWITCH ||
+                              opcode == TGSI_OPCODE_CASE);
+
+
+      if (ctx->switch_in_default) {
+         /*
+          * stop default execution but only if this is an unconditional switch.
+          * (The condition here is not perfect since dead code after break is
+          * allowed but should be sufficient since false negatives are just
+          * unoptimized - so we don't have to pre-evaluate that).
+          */
+         if(break_always && ctx->switch_pc) {
+            bld_base->pc = ctx->switch_pc;
+            return;
+         }
+      }
+
+      if (break_always) {
+         mask->switch_mask = LLVMConstNull(mask->bld->int_vec_type);
+      }
+      else {
+         LLVMValueRef exec_mask = LLVMBuildNot(builder,
+                                               mask->exec_mask,
+                                               "break");
+         mask->switch_mask = LLVMBuildAnd(builder,
+                                          mask->switch_mask,
+                                          exec_mask, "break_switch");
+      }
+   }
+
+   lp_exec_mask_update(mask);
+}
+
+static void lp_exec_break_condition(struct lp_exec_mask *mask,
+                                    LLVMValueRef cond)
+{
+   LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
+   LLVMValueRef cond_mask = LLVMBuildAnd(builder,
                                           mask->exec_mask,
-                                         "break");
+                                         cond, "cond_mask");
+   cond_mask = LLVMBuildNot(builder, cond_mask, "break_cond");
  
-   mask->break_mask = LLVMBuildAnd(builder,
-                                   mask->break_mask,
-                                   exec_mask, "break_full");
+   if (ctx->break_type == LP_EXEC_MASK_BREAK_TYPE_LOOP) {
+      mask->break_mask = LLVMBuildAnd(builder,
+                                      mask->break_mask,
+                                      cond_mask, "breakc_full");
+   }
+   else {
+      mask->switch_mask = LLVMBuildAnd(builder,
+                                       mask->switch_mask,
+                                       cond_mask, "breakc_switch");
+   }
  
     lp_exec_mask_update(mask);
  }
@@ -229,6 +445,7 @@ static void lp_exec_endloop(struct gallivm_state *gallivm,
                              struct lp_exec_mask *mask)
  {
     LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
     LLVMBasicBlockRef endloop;
     LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context);
     LLVMTypeRef reg_type = LLVMIntTypeInContext(gallivm->context,
@@ -238,21 +455,27 @@ static void lp_exec_endloop(struct gallivm_state *gallivm,
  
     assert(mask->break_mask);
  
+   
+   assert(ctx->loop_stack_size);
+   if (ctx->loop_stack_size > LP_MAX_TGSI_NESTING) {
+      --ctx->loop_stack_size;
+      return;
+   }
+
     /*
      * Restore the cont_mask, but don't pop
      */
-   assert(mask->loop_stack_size);
-   mask->cont_mask = mask->loop_stack[mask->loop_stack_size - 1].cont_mask;
+   mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size - 1].cont_mask;
     lp_exec_mask_update(mask);
  
     /*
      * Unlike the continue mask, the break_mask must be preserved across loop
      * iterations
      */
-   LLVMBuildStore(builder, mask->break_mask, mask->break_var);
+   LLVMBuildStore(builder, mask->break_mask, ctx->break_var);
  
     /* Decrement the loop limiter */
-   limiter = LLVMBuildLoad(builder, mask->loop_limiter, "");
+   limiter = LLVMBuildLoad(builder, ctx->loop_limiter, "");
  
     limiter = LLVMBuildSub(
        builder,
@@ -260,21 +483,21 @@ static void lp_exec_endloop(struct gallivm_state *gallivm,
        LLVMConstInt(int_type, 1, false),
        "");
  
-   LLVMBuildStore(builder, limiter, mask->loop_limiter);
+   LLVMBuildStore(builder, limiter, ctx->loop_limiter);
  
     /* i1cond = (mask != 0) */
     i1cond = LLVMBuildICmp(
        builder,
        LLVMIntNE,
        LLVMBuildBitCast(builder, mask->exec_mask, reg_type, ""),
-      LLVMConstNull(reg_type), "");
+      LLVMConstNull(reg_type), "i1cond");
  
     /* i2cond = (looplimiter > 0) */
     i2cond = LLVMBuildICmp(
        builder,
        LLVMIntSGT,
        limiter,
-      LLVMConstNull(int_type), "");
+      LLVMConstNull(int_type), "i2cond");
  
     /* if( i1cond && i2cond ) */
     icond = LLVMBuildAnd(builder, i1cond, i2cond, "");
@@ -282,21 +505,248 @@ static void lp_exec_endloop(struct gallivm_state *gallivm,
     endloop = lp_build_insert_new_block(mask->bld->gallivm, "endloop");
  
     LLVMBuildCondBr(builder,
-                   icond, mask->loop_block, endloop);
+                   icond, ctx->loop_block, endloop);
  
     LLVMPositionBuilderAtEnd(builder, endloop);
  
-   assert(mask->loop_stack_size);
-   --mask->loop_stack_size;
-   mask->loop_block = mask->loop_stack[mask->loop_stack_size].loop_block;
-   mask->cont_mask = mask->loop_stack[mask->loop_stack_size].cont_mask;
-   mask->break_mask = mask->loop_stack[mask->loop_stack_size].break_mask;
-   mask->break_var = mask->loop_stack[mask->loop_stack_size].break_var;
+   assert(ctx->loop_stack_size);
+   --ctx->loop_stack_size;
+   mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size].cont_mask;
+   mask->break_mask = ctx->loop_stack[ctx->loop_stack_size].break_mask;
+   ctx->loop_block = ctx->loop_stack[ctx->loop_stack_size].loop_block;
+   ctx->break_var = ctx->loop_stack[ctx->loop_stack_size].break_var;
+   ctx->break_type = ctx->break_type_stack[ctx->loop_stack_size +
+         ctx->switch_stack_size];
+
+   lp_exec_mask_update(mask);
+}
+
+static void lp_exec_switch(struct lp_exec_mask *mask,
+                           LLVMValueRef switchval)
+{
+   struct function_ctx *ctx = func_ctx(mask);
+
+   if (ctx->switch_stack_size >= LP_MAX_TGSI_NESTING ||
+       ctx->loop_stack_size > LP_MAX_TGSI_NESTING) {
+      ctx->switch_stack_size++;
+      return;
+   }
+
+   ctx->break_type_stack[ctx->loop_stack_size + ctx->switch_stack_size] =
+      ctx->break_type;
+   ctx->break_type = LP_EXEC_MASK_BREAK_TYPE_SWITCH;
+
+   ctx->switch_stack[ctx->switch_stack_size].switch_mask = mask->switch_mask;
+   ctx->switch_stack[ctx->switch_stack_size].switch_val = ctx->switch_val;
+   ctx->switch_stack[ctx->switch_stack_size].switch_mask_default = ctx->switch_mask_default;
+   ctx->switch_stack[ctx->switch_stack_size].switch_in_default = ctx->switch_in_default;
+   ctx->switch_stack[ctx->switch_stack_size].switch_pc = ctx->switch_pc;
+   ctx->switch_stack_size++;
+
+   mask->switch_mask = LLVMConstNull(mask->int_vec_type);
+   ctx->switch_val = switchval;
+   ctx->switch_mask_default = LLVMConstNull(mask->int_vec_type);
+   ctx->switch_in_default = false;
+   ctx->switch_pc = 0;
+
+   lp_exec_mask_update(mask);
+}
+
+static void lp_exec_endswitch(struct lp_exec_mask *mask,
+                              struct lp_build_tgsi_context * bld_base)
+{
+   LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
+
+   if (ctx->switch_stack_size > LP_MAX_TGSI_NESTING) {
+      ctx->switch_stack_size--;
+      return;
+   }
+
+   /* check if there's deferred default if so do it now */
+   if (ctx->switch_pc && !ctx->switch_in_default) {
+      LLVMValueRef prevmask, defaultmask;
+      unsigned tmp_pc;
+      prevmask = ctx->switch_stack[ctx->switch_stack_size - 1].switch_mask;
+      defaultmask = LLVMBuildNot(builder, ctx->switch_mask_default, "sw_default_mask");
+      mask->switch_mask = LLVMBuildAnd(builder, prevmask, defaultmask, "sw_mask");
+      ctx->switch_in_default = true;
+
+      lp_exec_mask_update(mask);
+
+      assert(bld_base->instructions[ctx->switch_pc - 1].Instruction.Opcode ==
+             TGSI_OPCODE_DEFAULT);
+
+      tmp_pc = bld_base->pc;
+      bld_base->pc = ctx->switch_pc;
+      /*
+       * re-purpose switch_pc to point to here again, since we stop execution of
+       * the deferred default after next break.
+       */
+      ctx->switch_pc = tmp_pc - 1;
+
+      return;
+   }
+
+   else if (ctx->switch_pc && ctx->switch_in_default) {
+      assert(bld_base->pc == ctx->switch_pc + 1);
+   }
+
+   ctx->switch_stack_size--;
+   mask->switch_mask = ctx->switch_stack[ctx->switch_stack_size].switch_mask;
+   ctx->switch_val = ctx->switch_stack[ctx->switch_stack_size].switch_val;
+   ctx->switch_mask_default = ctx->switch_stack[ctx->switch_stack_size].switch_mask_default;
+   ctx->switch_in_default = ctx->switch_stack[ctx->switch_stack_size].switch_in_default;
+   ctx->switch_pc = ctx->switch_stack[ctx->switch_stack_size].switch_pc;
+
+   ctx->break_type = ctx->break_type_stack[ctx->loop_stack_size + ctx->switch_stack_size];
  
     lp_exec_mask_update(mask);
  }
  
-/* stores val into an address pointed to by dst.
+static void lp_exec_case(struct lp_exec_mask *mask,
+                         LLVMValueRef caseval)
+{
+   LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
+
+   LLVMValueRef casemask, prevmask;
+
+   if (ctx->switch_stack_size > LP_MAX_TGSI_NESTING) {
+      return;
+   }
+
+   /* skipping case mask evaluation here is NOT optional (not in all cases anyway). */
+   if (!ctx->switch_in_default) {
+      prevmask = ctx->switch_stack[ctx->switch_stack_size - 1].switch_mask;
+      casemask = lp_build_cmp(mask->bld, PIPE_FUNC_EQUAL, caseval, ctx->switch_val);
+      ctx->switch_mask_default = LLVMBuildOr(builder, casemask,
+                                             ctx->switch_mask_default, "sw_default_mask");
+      casemask = LLVMBuildOr(builder, casemask, mask->switch_mask, "");
+      mask->switch_mask = LLVMBuildAnd(builder, casemask, prevmask, "sw_mask");
+
+      lp_exec_mask_update(mask);
+   }
+}
+
+/*
+ * Analyse default statement in a switch.
+ * \return true if default is last statement, false otherwise
+ * \param default_pc_start contains pc of instruction to jump to
+ *                         if default wasn't last but there's no
+ *                         fallthrough into default.
+ */
+static boolean default_analyse_is_last(struct lp_exec_mask *mask,
+                                       struct lp_build_tgsi_context * bld_base,
+                                       int *default_pc_start)
+{
+   unsigned pc = bld_base->pc;
+   struct function_ctx *ctx = func_ctx(mask);
+   unsigned curr_switch_stack = ctx->switch_stack_size;
+
+   if (ctx->switch_stack_size > LP_MAX_TGSI_NESTING) {
+      return false;
+   }
+
+   /* skip over case statements which are together with default */
+   while (bld_base->instructions[pc].Instruction.Opcode == TGSI_OPCODE_CASE) {
+      pc++;
+   }
+
+   while (pc != -1 && pc < bld_base->num_instructions) {
+      unsigned opcode = bld_base->instructions[pc].Instruction.Opcode;
+      switch (opcode) {
+      case TGSI_OPCODE_CASE:
+         if (curr_switch_stack == ctx->switch_stack_size) {
+            *default_pc_start = pc - 1;
+            return false;
+         }
+         break;
+      case TGSI_OPCODE_SWITCH:
+         curr_switch_stack++;
+         break;
+      case TGSI_OPCODE_ENDSWITCH:
+         if (curr_switch_stack == ctx->switch_stack_size) {
+            *default_pc_start = pc - 1;
+            return true;
+         }
+         curr_switch_stack--;
+         break;
+      }
+      pc++;
+   }
+   /* should never arrive here */
+   assert(0);
+   return true;
+}
+
+static void lp_exec_default(struct lp_exec_mask *mask,
+                            struct lp_build_tgsi_context * bld_base)
+{
+   LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
+
+   int default_exec_pc;
+   boolean default_is_last;
+
+   if (ctx->switch_stack_size > LP_MAX_TGSI_NESTING) {
+      return;
+   }
+
+   /*
+    * This is a messy opcode, because it may not be always at the end and
+    * there can be fallthrough in and out of it.
+    */
+
+   default_is_last = default_analyse_is_last(mask, bld_base, &default_exec_pc);
+   /*
+    * If it is last statement in switch (note that case statements appearing
+    * "at the same time" as default don't change that) everything is just fine,
+    * update switch mask and go on. This means we can handle default with
+    * fallthrough INTO it without overhead, if it is last.
+    */
+   if (default_is_last) {
+      LLVMValueRef prevmask, defaultmask;
+      prevmask = ctx->switch_stack[ctx->switch_stack_size - 1].switch_mask;
+      defaultmask = LLVMBuildNot(builder, ctx->switch_mask_default, "sw_default_mask");
+      defaultmask = LLVMBuildOr(builder, defaultmask, mask->switch_mask, "");
+      mask->switch_mask = LLVMBuildAnd(builder, prevmask, defaultmask, "sw_mask");
+      ctx->switch_in_default = true;
+
+      lp_exec_mask_update(mask);
+   }
+   else {
+      /*
+       * Technically, "case" immediately before default isn't really a
+       * fallthrough, however we still have to count them as such as we
+       * already have updated the masks.
+       * If that happens in practice could add a switch optimizer pass
+       * which just gets rid of all case statements appearing together with
+       * default (or could do switch analysis at switch start time instead).
+       */
+      unsigned opcode = bld_base->instructions[bld_base->pc - 1].Instruction.Opcode;
+      boolean ft_into = (opcode != TGSI_OPCODE_BRK &&
+                         opcode != TGSI_OPCODE_SWITCH);
+      /*
+       * If it is not last statement and there was no fallthrough into it,
+       * we record the PC and continue execution at next case (again, those
+       * case encountered at the same time don't count). At endswitch
+       * time, we update switchmask, and go back executing the code we skipped
+       * until the next break (possibly re-executing some code with changed mask
+       * if there was a fallthrough out of default).
+       * Finally, if it is not last statement and there was a fallthrough into it,
+       * do the same as with the former case, except instead of skipping the code
+       * just execute it without updating the mask, then go back and re-execute.
+       */
+      ctx->switch_pc = bld_base->pc;
+      if (!ft_into) {
+         bld_base->pc = default_exec_pc;
+      }
+   }
+}
+
+
+/* stores val into an address pointed to by dst_ptr.
   * mask->exec_mask is used to figure out which bits of val
   * should be stored into the address
   * (0 means don't store this bit, 1 means do store).
@@ -305,10 +755,14 @@ static void lp_exec_mask_store(struct lp_exec_mask *mask,
                                 struct lp_build_context *bld_store,
                                 LLVMValueRef pred,
                                 LLVMValueRef val,
-                               LLVMValueRef dst)
+                               LLVMValueRef dst_ptr)
  {
     LLVMBuilderRef builder = mask->bld->gallivm->builder;
  
+   assert(lp_check_value(bld_store->type, val));
+   assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind);
+   assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val));
+
     /* Mix the predicate and execution mask */
     if (mask->has_mask) {
        if (pred) {
@@ -319,39 +773,54 @@ static void lp_exec_mask_store(struct lp_exec_mask *mask,
     }
  
     if (pred) {
-      LLVMValueRef real_val, dst_val;
+      LLVMValueRef res, dst;
  
-      dst_val = LLVMBuildLoad(builder, dst, "");
-      real_val = lp_build_select(bld_store,
-                                 pred,
-                                 val, dst_val);
-
-      LLVMBuildStore(builder, real_val, dst);
+      dst = LLVMBuildLoad(builder, dst_ptr, "");
+      res = lp_build_select(bld_store, pred, val, dst);
+      LLVMBuildStore(builder, res, dst_ptr);
     } else
-      LLVMBuildStore(builder, val, dst);
+      LLVMBuildStore(builder, val, dst_ptr);
  }
  
  static void lp_exec_mask_call(struct lp_exec_mask *mask,
                                int func,
                                int *pc)
  {
-   assert(mask->call_stack_size < LP_MAX_TGSI_NESTING);
-   mask->call_stack[mask->call_stack_size].pc = *pc;
-   mask->call_stack[mask->call_stack_size].ret_mask = mask->ret_mask;
-   mask->call_stack_size++;
+   if (mask->function_stack_size >= LP_MAX_NUM_FUNCS) {
+      return;
+   }
+
+   lp_exec_mask_function_init(mask, mask->function_stack_size);
+   mask->function_stack[mask->function_stack_size].pc = *pc;
+   mask->function_stack[mask->function_stack_size].ret_mask = mask->ret_mask;
+   mask->function_stack_size++;
     *pc = func;
  }
  
  static void lp_exec_mask_ret(struct lp_exec_mask *mask, int *pc)
  {
     LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   struct function_ctx *ctx = func_ctx(mask);
     LLVMValueRef exec_mask;
  
-   if (mask->call_stack_size == 0) {
+   if (ctx->cond_stack_size == 0 &&
+       ctx->loop_stack_size == 0 &&
+       ctx->switch_stack_size == 0 &&
+       mask->function_stack_size == 1) {
        /* returning from main() */
        *pc = -1;
        return;
     }
+
+   if (mask->function_stack_size == 1) {
+      /*
+       * This requires special handling since we need to ensure
+       * we don't drop the mask even if we have no call stack
+       * (e.g. after a ret in a if clause after the endif)
+       */
+      mask->ret_in_main = TRUE;
+   }
+
     exec_mask = LLVMBuildNot(builder,
                              mask->exec_mask,
                              "ret");
@@ -369,14 +838,58 @@ static void lp_exec_mask_bgnsub(struct lp_exec_mask *mask)
  
  static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc)
  {
-   assert(mask->call_stack_size);
-   mask->call_stack_size--;
-   *pc = mask->call_stack[mask->call_stack_size].pc;
-   mask->ret_mask = mask->call_stack[mask->call_stack_size].ret_mask;
+   struct function_ctx *ctx;
+
+   assert(mask->function_stack_size > 1);
+   assert(mask->function_stack_size <= LP_MAX_NUM_FUNCS);
+
+   ctx = func_ctx(mask);
+   mask->function_stack_size--;
+
+   *pc = ctx->pc;
+   mask->ret_mask = ctx->ret_mask;
+
     lp_exec_mask_update(mask);
  }
  
  
+static LLVMValueRef
+get_file_ptr(struct lp_build_tgsi_soa_context *bld,
+             unsigned file,
+             unsigned index,
+             unsigned chan)
+{
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   LLVMValueRef (*array_of_vars)[TGSI_NUM_CHANNELS];
+   LLVMValueRef var_of_array;
+
+   switch (file) {
+   case TGSI_FILE_TEMPORARY:
+      array_of_vars = bld->temps;
+      var_of_array = bld->temps_array;
+      break;
+   case TGSI_FILE_OUTPUT:
+      array_of_vars = bld->outputs;
+      var_of_array = bld->outputs_array;
+      break;
+   default:
+      assert(0);
+      return NULL;
+   }
+
+   assert(chan < 4);
+
+   if (bld->indirect_files & (1 << file)) {
+      LLVMValueRef lindex = lp_build_const_int32(bld->bld_base.base.gallivm, index * 4 + chan);
+      return LLVMBuildGEP(builder, var_of_array, &lindex, 1, "");
+   }
+   else {
+      assert(index <= bld->bld_base.info->file_max[file]);
+      return array_of_vars[index][chan];
+   }
+}
+
+
  /**
   * Return pointer to a temporary register channel (src or dest).
   * Note that indirect addressing cannot be handled here.
@@ -388,15 +901,7 @@ lp_get_temp_ptr_soa(struct lp_build_tgsi_soa_context *bld,
               unsigned index,
               unsigned chan)
  {
-   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   assert(chan < 4);
-   if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
-      LLVMValueRef lindex = lp_build_const_int32(bld->bld_base.base.gallivm, index * 4 + chan);
-      return LLVMBuildGEP(builder, bld->temps_array, &lindex, 1, "");
-   }
-   else {
-      return bld->temps[index][chan];
-   }
+   return get_file_ptr(bld, TGSI_FILE_TEMPORARY, index, chan);
  }
  
  /**
@@ -410,15 +915,26 @@ lp_get_output_ptr(struct lp_build_tgsi_soa_context *bld,
                 unsigned index,
                 unsigned chan)
  {
-   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   assert(chan < 4);
-   if (bld->indirect_files & (1 << TGSI_FILE_OUTPUT)) {
-      LLVMValueRef lindex = lp_build_const_int32(bld->bld_base.base.gallivm,
-                                                 index * 4 + chan);
-      return LLVMBuildGEP(builder, bld->outputs_array, &lindex, 1, "");
-   }
-   else {
-      return bld->outputs[index][chan];
+   return get_file_ptr(bld, TGSI_FILE_OUTPUT, index, chan);
+}
+
+/*
+ * If we have indirect addressing in outputs copy our alloca array
+ * to the outputs slots specified by the caller to make sure
+ * our outputs are delivered consistently via the same interface.
+ */
+static void
+gather_outputs(struct lp_build_tgsi_soa_context * bld)
+{
+   if ((bld->indirect_files & (1 << TGSI_FILE_OUTPUT))) {
+      unsigned index, chan;
+      assert(bld->bld_base.info->num_outputs <=
+             bld->bld_base.info->file_max[TGSI_FILE_OUTPUT] + 1);
+      for (index = 0; index < bld->bld_base.info->num_outputs; ++index) {
+         for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+            bld->outputs[index][chan] = lp_get_output_ptr(bld, index, chan);
+         }
+      }
     }
  }
  
@@ -430,11 +946,19 @@ lp_get_output_ptr(struct lp_build_tgsi_soa_context *bld,
  static LLVMValueRef
  build_gather(struct lp_build_context *bld,
               LLVMValueRef base_ptr,
-             LLVMValueRef indexes)
+             LLVMValueRef indexes,
+             LLVMValueRef *overflow_mask)
  {
     LLVMBuilderRef builder = bld->gallivm->builder;
     LLVMValueRef res = bld->undef;
     unsigned i;
+   LLVMValueRef temp_ptr = NULL;
+
+   if (overflow_mask) {
+      temp_ptr = lp_build_alloca(
+         bld->gallivm,
+         lp_build_vec_type(bld->gallivm, bld->type), "");
+   }
  
     /*
      * Loop over elements of index_vec, load scalar value, insert it into 'res'.
@@ -443,11 +967,54 @@ build_gather(struct lp_build_context *bld,
        LLVMValueRef ii = lp_build_const_int32(bld->gallivm, i);
        LLVMValueRef index = LLVMBuildExtractElement(builder,
                                                     indexes, ii, "");
-      LLVMValueRef scalar_ptr = LLVMBuildGEP(builder, base_ptr,
-                                             &index, 1, "gather_ptr");
-      LLVMValueRef scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+      LLVMValueRef scalar_ptr, scalar;
+      LLVMValueRef overflow;
+      struct lp_build_if_state if_ctx;
  
-      res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
+      /*
+       * overflow_mask is a boolean vector telling us which channels
+       * in the vector overflowed. We use the overflow behavior for
+       * constant buffers which is defined as:
+       * Out of bounds access to constant buffer returns 0 in all
+       * componenets. Out of bounds behavior is always with respect
+       * to the size of the buffer bound at that slot.
+       */
+      if (overflow_mask) {
+         overflow = LLVMBuildExtractElement(builder, *overflow_mask,
+                                            ii, "");
+         lp_build_if(&if_ctx, bld->gallivm, overflow);
+         {
+            LLVMValueRef val = LLVMBuildLoad(builder, temp_ptr, "");
+            val = LLVMBuildInsertElement(
+               builder, val,
+               LLVMConstNull(LLVMFloatTypeInContext(bld->gallivm->context)),
+               ii, "");
+            LLVMBuildStore(builder, val, temp_ptr);
+         }
+         lp_build_else(&if_ctx);
+         {
+            LLVMValueRef val = LLVMBuildLoad(builder, temp_ptr, "");
+
+            scalar_ptr = LLVMBuildGEP(builder, base_ptr,
+                                      &index, 1, "gather_ptr");
+            scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+
+            val = LLVMBuildInsertElement(builder, val, scalar, ii, "");
+
+            LLVMBuildStore(builder, val, temp_ptr);
+         }
+         lp_build_endif(&if_ctx);
+      } else {
+         scalar_ptr = LLVMBuildGEP(builder, base_ptr,
+                                   &index, 1, "gather_ptr");
+         scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+
+         res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
+      }
+   }
+
+   if (overflow_mask) {
+      res = LLVMBuildLoad(builder, temp_ptr, "gather_val");
     }
  
     return res;
@@ -516,12 +1083,12 @@ emit_mask_scatter(struct lp_build_tgsi_soa_context *bld,
  static LLVMValueRef
  get_indirect_index(struct lp_build_tgsi_soa_context *bld,
                     unsigned reg_file, unsigned reg_index,
-                   const struct tgsi_src_register *indirect_reg)
+                   const struct tgsi_ind_register *indirect_reg)
  {
     LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
     struct lp_build_context *uint_bld = &bld->bld_base.uint_bld;
     /* always use X component of address register */
-   unsigned swizzle = indirect_reg->SwizzleX;
+   unsigned swizzle = indirect_reg->Swizzle;
     LLVMValueRef base;
     LLVMValueRef rel;
     LLVMValueRef max_index;
@@ -532,18 +1099,44 @@ get_indirect_index(struct lp_build_tgsi_soa_context *bld,
     base = lp_build_const_int_vec(bld->bld_base.base.gallivm, uint_bld->type, reg_index);
  
     assert(swizzle < 4);
-   rel = LLVMBuildLoad(builder,
-                        bld->addr[indirect_reg->Index][swizzle],
-                        "load addr reg");
-
-   index = lp_build_add(uint_bld, base, rel);
-
-   max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm,
-                                      uint_bld->type,
-                                      bld->bld_base.info->file_max[reg_file]);
+   switch (indirect_reg->File) {
+   case TGSI_FILE_ADDRESS:
+      rel = LLVMBuildLoad(builder,
+                          bld->addr[indirect_reg->Index][swizzle],
+                          "load addr reg");
+      /* ADDR LLVM values already have LLVM integer type. */
+      break;
+   case TGSI_FILE_TEMPORARY:
+      rel = lp_get_temp_ptr_soa(bld, indirect_reg->Index, swizzle);
+      rel = LLVMBuildLoad(builder, rel, "load temp reg");
+      /* TEMP LLVM values always have LLVM float type, but for indirection, the
+       * value actually stored is expected to be an integer */
+      rel = LLVMBuildBitCast(builder, rel, uint_bld->vec_type, "");
+      break;
+   default:
+      assert(0);
+      rel = uint_bld->zero;
+   }
+
+   index = lp_build_add(uint_bld, base, rel);
+
+   /*
+    * emit_fetch_constant handles constant buffer overflow so this code
+    * is pointless for them.
+    * Furthermore the D3D10 spec in section 6.5 says:
+    * If the constant buffer bound to a slot is larger than the size
+    * declared in the shader for that slot, implementations are allowed
+    * to return incorrect data (not necessarily 0) for indices that are
+    * larger than the declared size but smaller than the buffer size.
+    */
+   if (reg_file != TGSI_FILE_CONSTANT) {
+      max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm,
+                                         uint_bld->type,
+                                         bld->bld_base.info->file_max[reg_file]);
  
-   assert(!uint_bld->type.sign);
-   index = lp_build_min(uint_bld, index, max_index);
+      assert(!uint_bld->type.sign);
+      index = lp_build_min(uint_bld, index, max_index);
+   }
  
     return index;
  }
@@ -575,6 +1168,39 @@ stype_to_fetch(struct lp_build_tgsi_context * bld_base,
     return bld_fetch;
  }
  
+static LLVMValueRef
+get_soa_array_offsets(struct lp_build_context *uint_bld,
+                      LLVMValueRef indirect_index,
+                      unsigned chan_index,
+                      boolean need_perelement_offset)
+{
+   struct gallivm_state *gallivm = uint_bld->gallivm;
+   LLVMValueRef chan_vec =
+      lp_build_const_int_vec(uint_bld->gallivm, uint_bld->type, chan_index);
+   LLVMValueRef length_vec =
+      lp_build_const_int_vec(gallivm, uint_bld->type, uint_bld->type.length);
+   LLVMValueRef index_vec;
+
+   /* index_vec = (indirect_index * 4 + chan_index) * length + offsets */
+   index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+   index_vec = lp_build_add(uint_bld, index_vec, chan_vec);
+   index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
+
+   if (need_perelement_offset) {
+      LLVMValueRef pixel_offsets;
+      int i;
+     /* build pixel offset vector: {0, 1, 2, 3, ...} */
+      pixel_offsets = uint_bld->undef;
+      for (i = 0; i < uint_bld->type.length; i++) {
+         LLVMValueRef ii = lp_build_const_int32(gallivm, i);
+         pixel_offsets = LLVMBuildInsertElement(gallivm->builder, pixel_offsets,
+                                                ii, ii, "");
+      }
+      index_vec = lp_build_add(uint_bld, index_vec, pixel_offsets);
+   }
+   return index_vec;
+}
+
  static LLVMValueRef
  emit_fetch_constant(
     struct lp_build_tgsi_context * bld_base,
@@ -586,50 +1212,71 @@ emit_fetch_constant(
     struct gallivm_state *gallivm = bld_base->base.gallivm;
     LLVMBuilderRef builder = gallivm->builder;
     struct lp_build_context *uint_bld = &bld_base->uint_bld;
-   LLVMValueRef indirect_index = NULL;
-   struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
-   
+   unsigned dimension = 0;
+   LLVMValueRef consts_ptr;
+   LLVMValueRef num_consts;
+   LLVMValueRef res;
+
     /* XXX: Handle fetching xyzw components as a vector */
     assert(swizzle != ~0);
  
+   if (reg->Register.Dimension) {
+      assert(!reg->Dimension.Indirect);
+      dimension = reg->Dimension.Index;
+      assert(dimension < LP_MAX_TGSI_CONST_BUFFERS);
+   }
+
+   consts_ptr = bld->consts[dimension];
+   num_consts = bld->consts_sizes[dimension];
+
     if (reg->Register.Indirect) {
+      LLVMValueRef indirect_index;
+      LLVMValueRef swizzle_vec =
+         lp_build_const_int_vec(gallivm, uint_bld->type, swizzle);
+      LLVMValueRef index_vec;  /* index into the const buffer */
+      LLVMValueRef overflow_mask;
+
        indirect_index = get_indirect_index(bld,
                                            reg->Register.File,
                                            reg->Register.Index,
                                            &reg->Indirect);
-   }
  
-   if (reg->Register.Indirect) {
-      LLVMValueRef swizzle_vec =
-         lp_build_const_int_vec(bld->bld_base.base.gallivm, uint_bld->type, swizzle);
-      LLVMValueRef index_vec;  /* index into the const buffer */
+      /* All fetches are from the same constant buffer, so
+       * we need to propagate the size to a vector to do a
+       * vector comparison */
+      num_consts = lp_build_broadcast_scalar(uint_bld, num_consts);
+      /* Construct a boolean vector telling us which channels
+       * overflow the bound constant buffer */
+      overflow_mask = LLVMBuildICmp(builder, LLVMIntUGE,
+                                    indirect_index,
+                                    num_consts, "");
  
        /* index_vec = indirect_index * 4 + swizzle */
        index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
        index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
  
        /* Gather values from the constant buffer */
-      return build_gather(bld_fetch, bld->consts_ptr, index_vec);
+      res = build_gather(&bld_base->base, consts_ptr, index_vec,
+                         &overflow_mask);
     }
     else {
        LLVMValueRef index;  /* index into the const buffer */
        LLVMValueRef scalar, scalar_ptr;
  
-      index = lp_build_const_int32(gallivm, reg->Register.Index*4 + swizzle);
-
-      scalar_ptr = LLVMBuildGEP(builder, bld->consts_ptr,
-                                   &index, 1, "");
+      index = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle);
  
-      if (stype != TGSI_TYPE_FLOAT && stype != TGSI_TYPE_UNTYPED) {
-         LLVMTypeRef ivtype = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0);
-         LLVMValueRef temp_ptr;
-         temp_ptr = LLVMBuildBitCast(builder, scalar_ptr, ivtype, "");
-         scalar = LLVMBuildLoad(builder, temp_ptr, "");
-      } else
-         scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+      scalar_ptr = LLVMBuildGEP(builder, consts_ptr,
+                                &index, 1, "");
+      scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+      res = lp_build_broadcast_scalar(&bld_base->base, scalar);
+   }
  
-      return lp_build_broadcast_scalar(bld_fetch, scalar);
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+      struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
+      res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
     }
+
+   return res;
  }
  
  static LLVMValueRef
@@ -640,13 +1287,55 @@ emit_fetch_immediate(
     unsigned swizzle)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
-   LLVMValueRef res = bld->immediates[reg->Register.Index][swizzle];
-   assert(res);
+   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef res = NULL;
+
+   if (bld->use_immediates_array || reg->Register.Indirect) {
+      LLVMValueRef imms_array;
+      LLVMTypeRef fptr_type;
+
+      /* cast imms_array pointer to float* */
+      fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
+      imms_array = LLVMBuildBitCast(builder, bld->imms_array, fptr_type, "");
+
+      if (reg->Register.Indirect) {
+         LLVMValueRef indirect_index;
+         LLVMValueRef index_vec;  /* index into the immediate register array */
+
+         indirect_index = get_indirect_index(bld,
+                                             reg->Register.File,
+                                             reg->Register.Index,
+                                             &reg->Indirect);
+         /*
+          * Unlike for other reg classes, adding pixel offsets is unnecessary -
+          * immediates are stored as full vectors (FIXME??? - might be better
+          * to store them the same as constants) but all elements are the same
+          * in any case.
+          */
+         index_vec = get_soa_array_offsets(&bld_base->uint_bld,
+                                           indirect_index,
+                                           swizzle,
+                                           FALSE);
+
+         /* Gather values from the immediate register array */
+         res = build_gather(&bld_base->base, imms_array, index_vec, NULL);
+      } else {
+         LLVMValueRef lindex = lp_build_const_int32(gallivm,
+                                        reg->Register.Index * 4 + swizzle);
+         LLVMValueRef imms_ptr =  LLVMBuildGEP(builder,
+                                                bld->imms_array, &lindex, 1, "");
+         res = LLVMBuildLoad(builder, imms_ptr, "");
+      }
+   }
+   else {
+      res = bld->immediates[reg->Register.Index][swizzle];
+   }
  
     if (stype == TGSI_TYPE_UNSIGNED) {
-      res = LLVMConstBitCast(res, bld_base->uint_bld.vec_type);
+      res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
     } else if (stype == TGSI_TYPE_SIGNED) {
-      res = LLVMConstBitCast(res, bld_base->int_bld.vec_type);
+      res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
     }
     return res;
  }
@@ -661,38 +1350,30 @@ emit_fetch_input(
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
     struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
     LLVMBuilderRef builder = gallivm->builder;
-   struct lp_build_context *uint_bld = &bld_base->uint_bld;
-   LLVMValueRef indirect_index = NULL;
     LLVMValueRef res;
  
     if (reg->Register.Indirect) {
+      LLVMValueRef indirect_index;
+      LLVMValueRef index_vec;  /* index into the input reg array */
+      LLVMValueRef inputs_array;
+      LLVMTypeRef fptr_type;
+
        indirect_index = get_indirect_index(bld,
                                            reg->Register.File,
                                            reg->Register.Index,
                                            &reg->Indirect);
-   }
  
-   if (reg->Register.Indirect) {
-      LLVMValueRef swizzle_vec =
-         lp_build_const_int_vec(gallivm, uint_bld->type, swizzle);
-      LLVMValueRef length_vec =
-         lp_build_const_int_vec(gallivm, uint_bld->type, bld->bld_base.base.type.length);
-      LLVMValueRef index_vec;  /* index into the const buffer */
-      LLVMValueRef inputs_array;
-      LLVMTypeRef float4_ptr_type;
-
-      /* index_vec = (indirect_index * 4 + swizzle) * length */
-      index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
-      index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
-      index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
+      index_vec = get_soa_array_offsets(&bld_base->uint_bld,
+                                        indirect_index,
+                                        swizzle,
+                                        TRUE);
  
        /* cast inputs_array pointer to float* */
-      float4_ptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
-      inputs_array = LLVMBuildBitCast(builder, bld->inputs_array,
-                                         float4_ptr_type, "");
+      fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
+      inputs_array = LLVMBuildBitCast(builder, bld->inputs_array, fptr_type, "");
  
-      /* Gather values from the temporary register array */
-      res = build_gather(&bld_base->base, inputs_array, index_vec);
+      /* Gather values from the input register array */
+      res = build_gather(&bld_base->base, inputs_array, index_vec, NULL);
     } else {
        if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
           LLVMValueRef lindex = lp_build_const_int32(gallivm,
@@ -717,6 +1398,70 @@ emit_fetch_input(
     return res;
  }
  
+
+static LLVMValueRef
+emit_fetch_gs_input(
+   struct lp_build_tgsi_context * bld_base,
+   const struct tgsi_full_src_register * reg,
+   enum tgsi_opcode_type stype,
+   unsigned swizzle)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+   const struct tgsi_shader_info *info = bld->bld_base.info;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef attrib_index = NULL;
+   LLVMValueRef vertex_index = NULL;
+   LLVMValueRef swizzle_index = lp_build_const_int32(gallivm, swizzle);
+   LLVMValueRef res;
+
+   if (info->input_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_PRIMID) {
+      /* This is really a system value not a regular input */
+      assert(!reg->Register.Indirect);
+      assert(!reg->Dimension.Indirect);
+      res = bld->system_values.prim_id;
+      if (stype != TGSI_TYPE_UNSIGNED && stype != TGSI_TYPE_SIGNED) {
+         res = LLVMBuildBitCast(builder, res, bld_base->base.vec_type, "");
+      }
+      return res;
+   }
+
+   if (reg->Register.Indirect) {
+      attrib_index = get_indirect_index(bld,
+                                        reg->Register.File,
+                                        reg->Register.Index,
+                                        &reg->Indirect);
+   } else {
+      attrib_index = lp_build_const_int32(gallivm, reg->Register.Index);
+   }
+   
+   if (reg->Dimension.Indirect) {
+      vertex_index = get_indirect_index(bld,
+                                        reg->Register.File,
+                                        reg->Dimension.Index,
+                                        &reg->DimIndirect);
+   } else {
+      vertex_index = lp_build_const_int32(gallivm, reg->Dimension.Index);
+   }
+
+   res = bld->gs_iface->fetch_input(bld->gs_iface, bld_base,
+                                    reg->Dimension.Indirect,
+                                    vertex_index,
+                                    reg->Register.Indirect,
+                                    attrib_index,
+                                    swizzle_index);
+
+   assert(res);
+
+   if (stype == TGSI_TYPE_UNSIGNED) {
+      res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
+   } else if (stype == TGSI_TYPE_SIGNED) {
+      res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   }
+
+   return res;
+}
+
  static LLVMValueRef
  emit_fetch_temporary(
     struct lp_build_tgsi_context * bld_base,
@@ -727,52 +1472,40 @@ emit_fetch_temporary(
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
     struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
     LLVMBuilderRef builder = gallivm->builder;
-   struct lp_build_context *uint_bld = &bld_base->uint_bld;
-   LLVMValueRef indirect_index = NULL;
     LLVMValueRef res;
  
     if (reg->Register.Indirect) {
+      LLVMValueRef indirect_index;
+      LLVMValueRef index_vec;  /* index into the temp reg array */
+      LLVMValueRef temps_array;
+      LLVMTypeRef fptr_type;
+
        indirect_index = get_indirect_index(bld,
                                            reg->Register.File,
                                            reg->Register.Index,
                                            &reg->Indirect);
-   }
-
-   if (reg->Register.Indirect) {
-      LLVMValueRef swizzle_vec =
-         lp_build_const_int_vec(bld->bld_base.base.gallivm, uint_bld->type, swizzle);
-      LLVMValueRef length_vec =
-         lp_build_const_int_vec(bld->bld_base.base.gallivm, uint_bld->type,
-                                bld->bld_base.base.type.length);
-      LLVMValueRef index_vec;  /* index into the const buffer */
-      LLVMValueRef temps_array;
-      LLVMTypeRef float4_ptr_type;
  
-      /* index_vec = (indirect_index * 4 + swizzle) * length */
-      index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
-      index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
-      index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
+      index_vec = get_soa_array_offsets(&bld_base->uint_bld,
+                                        indirect_index,
+                                        swizzle,
+                                        TRUE);
  
        /* cast temps_array pointer to float* */
-      float4_ptr_type = LLVMPointerType(LLVMFloatTypeInContext(bld->bld_base.base.gallivm->context), 0);
-      temps_array = LLVMBuildBitCast(builder, bld->temps_array,
-                                     float4_ptr_type, "");
+      fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
+      temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, "");
  
        /* Gather values from the temporary register array */
-      res = build_gather(&bld_base->base, temps_array, index_vec);
+      res = build_gather(&bld_base->base, temps_array, index_vec, NULL);
     }
     else {
        LLVMValueRef temp_ptr;
-      if (stype != TGSI_TYPE_FLOAT && stype != TGSI_TYPE_UNTYPED) {
-         LLVMTypeRef itype = LLVMPointerType(bld->bld_base.int_bld.vec_type, 0);
-         LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
-                                                     swizzle);
-         temp_ptr = LLVMBuildBitCast(builder, tint_ptr, itype, "");
-      } else
-         temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
+      temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
        res = LLVMBuildLoad(builder, temp_ptr, "");
-      if (!res)
-         return bld->bld_base.base.undef;
+   }
+
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+      struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
+      res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
     }
  
     return res;
@@ -805,6 +1538,21 @@ emit_fetch_system_value(
        atype = TGSI_TYPE_UNSIGNED;
        break;
  
+   case TGSI_SEMANTIC_VERTEXID_NOBASE:
+      res = bld->system_values.vertex_id_nobase;
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_BASEVERTEX:
+      res = bld->system_values.basevertex;
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
+   case TGSI_SEMANTIC_PRIMID:
+      res = bld->system_values.prim_id;
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
     default:
        assert(!"unexpected semantic in emit_fetch_system_value");
        res = bld_base->base.zero;
@@ -916,6 +1664,7 @@ emit_fetch_predicate(
     }
  }
  
+
  /**
   * Register store.
   */
@@ -929,45 +1678,39 @@ emit_store_chan(
     LLVMValueRef value)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
-   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
     LLVMBuilderRef builder = gallivm->builder;
     const struct tgsi_full_dst_register *reg = &inst->Dst[index];
-   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   struct lp_build_context *float_bld = &bld_base->base;
+   struct lp_build_context *int_bld = &bld_base->int_bld;
     LLVMValueRef indirect_index = NULL;
-   struct lp_build_context *bld_store;
     enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
  
-   switch (dtype) {
-   default:
-   case TGSI_TYPE_FLOAT:
-   case TGSI_TYPE_UNTYPED:
-      bld_store = &bld_base->base;
-      break;
-   case TGSI_TYPE_UNSIGNED:
-      bld_store = &bld_base->uint_bld;
-      break;
-   case TGSI_TYPE_SIGNED:
-      bld_store = &bld_base->int_bld;
-      break;
-   case TGSI_TYPE_DOUBLE:
-   case TGSI_TYPE_VOID:
-      assert(0);
-      bld_store = NULL;
-      break;
-   }
-
+   /*
+    * Apply saturation.
+    *
+    * It is always assumed to be float.
+    */
     switch( inst->Instruction.Saturate ) {
     case TGSI_SAT_NONE:
        break;
  
     case TGSI_SAT_ZERO_ONE:
-      value = lp_build_max(&bld->bld_base.base, value, bld->bld_base.base.zero);
-      value = lp_build_min(&bld->bld_base.base, value, bld->bld_base.base.one);
+      assert(dtype == TGSI_TYPE_FLOAT ||
+             dtype == TGSI_TYPE_UNTYPED);
+      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+      value = lp_build_clamp_zero_one_nanzero(float_bld, value);
        break;
  
     case TGSI_SAT_MINUS_PLUS_ONE:
-      value = lp_build_max(&bld->bld_base.base, value, lp_build_const_vec(bld->bld_base.base.gallivm, bld->bld_base.base.type, -1.0));
-      value = lp_build_min(&bld->bld_base.base, value, bld->bld_base.base.one);
+      assert(dtype == TGSI_TYPE_FLOAT ||
+             dtype == TGSI_TYPE_UNTYPED);
+      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+      /* This will give -1.0 for NaN which is probably not what we want. */
+      value = lp_build_max_ext(float_bld, value,
+                               lp_build_const_vec(gallivm, float_bld->type, -1.0),
+                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+      value = lp_build_min(float_bld, value, float_bld->one);
        break;
  
     default:
@@ -981,83 +1724,58 @@ emit_store_chan(
                                            &reg->Indirect);
     } else {
        assert(reg->Register.Index <=
-                             bld->bld_base.info->file_max[reg->Register.File]);
+                             bld_base->info->file_max[reg->Register.File]);
+   }
+
+   if (DEBUG_EXECUTION) {
+      emit_dump_reg(gallivm, reg->Register.File, reg->Register.Index, chan_index, value);
     }
  
     switch( reg->Register.File ) {
     case TGSI_FILE_OUTPUT:
+      /* Outputs are always stored as floats */
+      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+
        if (reg->Register.Indirect) {
-         LLVMValueRef chan_vec =
-            lp_build_const_int_vec(gallivm, uint_bld->type, chan_index);
-         LLVMValueRef length_vec =
-            lp_build_const_int_vec(gallivm, uint_bld->type, bld->bld_base.base.type.length);
-         LLVMValueRef index_vec;  /* indexes into the temp registers */
+         LLVMValueRef index_vec;  /* indexes into the output registers */
           LLVMValueRef outputs_array;
-         LLVMValueRef pixel_offsets;
-         LLVMTypeRef float_ptr_type;
-         int i;
-
-         /* build pixel offset vector: {0, 1, 2, 3, ...} */
-         pixel_offsets = uint_bld->undef;
-         for (i = 0; i < bld->bld_base.base.type.length; i++) {
-            LLVMValueRef ii = lp_build_const_int32(gallivm, i);
-            pixel_offsets = LLVMBuildInsertElement(builder, pixel_offsets,
-                                                   ii, ii, "");
-         }
+         LLVMTypeRef fptr_type;
  
-         /* index_vec = (indirect_index * 4 + chan_index) * length + offsets */
-         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
-         index_vec = lp_build_add(uint_bld, index_vec, chan_vec);
-         index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
-         index_vec = lp_build_add(uint_bld, index_vec, pixel_offsets);
+         index_vec = get_soa_array_offsets(&bld_base->uint_bld,
+                                           indirect_index,
+                                           chan_index,
+                                           TRUE);
  
-         float_ptr_type =
-            LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
-         outputs_array = LLVMBuildBitCast(builder, bld->outputs_array,
-                                          float_ptr_type, "");
+         fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
+         outputs_array = LLVMBuildBitCast(builder, bld->outputs_array, fptr_type, "");
  
-         /* Scatter store values into temp registers */
+         /* Scatter store values into output registers */
           emit_mask_scatter(bld, outputs_array, index_vec, value,
                             &bld->exec_mask, pred);
        }
        else {
           LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index,
-                                               chan_index);
-         lp_exec_mask_store(&bld->exec_mask, bld_store, pred, value, out_ptr);
+                                                  chan_index);
+         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
        }
        break;
  
     case TGSI_FILE_TEMPORARY:
+      /* Temporaries are always stored as floats */
+      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+
        if (reg->Register.Indirect) {
-         LLVMValueRef chan_vec =
-            lp_build_const_int_vec(gallivm, uint_bld->type, chan_index);
-         LLVMValueRef length_vec =
-            lp_build_const_int_vec(gallivm, uint_bld->type,
-                                   bld->bld_base.base.type.length);
           LLVMValueRef index_vec;  /* indexes into the temp registers */
           LLVMValueRef temps_array;
-         LLVMValueRef pixel_offsets;
-         LLVMTypeRef float_ptr_type;
-         int i;
-
-         /* build pixel offset vector: {0, 1, 2, 3, ...} */
-         pixel_offsets = uint_bld->undef; 
-         for (i = 0; i < bld->bld_base.base.type.length; i++) {
-            LLVMValueRef ii = lp_build_const_int32(gallivm, i);
-            pixel_offsets = LLVMBuildInsertElement(builder, pixel_offsets,
-                                                   ii, ii, "");
-         }
+         LLVMTypeRef fptr_type;
  
-         /* index_vec = (indirect_index * 4 + chan_index) * length + offsets */
-         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
-         index_vec = lp_build_add(uint_bld, index_vec, chan_vec);
-         index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
-         index_vec = lp_build_add(uint_bld, index_vec, pixel_offsets);
+         index_vec = get_soa_array_offsets(&bld_base->uint_bld,
+                                           indirect_index,
+                                           chan_index,
+                                           TRUE);
  
-         float_ptr_type =
-            LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
-         temps_array = LLVMBuildBitCast(builder, bld->temps_array,
-                                        float_ptr_type, "");
+         fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
+         temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, "");
  
           /* Scatter store values into temp registers */
           emit_mask_scatter(bld, temps_array, index_vec, value,
@@ -1065,48 +1783,64 @@ emit_store_chan(
        }
        else {
           LLVMValueRef temp_ptr;
-
-         switch (dtype) {
-         case TGSI_TYPE_UNSIGNED:
-         case TGSI_TYPE_SIGNED: {
-            LLVMTypeRef itype = bld_base->int_bld.vec_type;
-            LLVMTypeRef ivtype = LLVMPointerType(itype, 0);
-            LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
-                                                        chan_index);
-            LLVMValueRef temp_value_ptr;
-
-            temp_ptr = LLVMBuildBitCast(builder, tint_ptr, ivtype, "");
-            temp_value_ptr = LLVMBuildBitCast(builder, value, itype, "");
-            value = temp_value_ptr;
-            break;
-         }
-         default:
-         case TGSI_TYPE_FLOAT:
-         case TGSI_TYPE_UNTYPED:
-            temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
-                                           chan_index);
-            break;
-         }
-
-         lp_exec_mask_store(&bld->exec_mask, bld_store, pred, value, temp_ptr);
+         temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index);
+         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
        }
        break;
  
     case TGSI_FILE_ADDRESS:
        assert(dtype == TGSI_TYPE_SIGNED);
-      assert(LLVMTypeOf(value) == bld_base->base.int_vec_type);
-      lp_exec_mask_store(&bld->exec_mask, bld_store, pred, value,
+      assert(LLVMTypeOf(value) == int_bld->vec_type);
+      value = LLVMBuildBitCast(builder, value, int_bld->vec_type, "");
+      lp_exec_mask_store(&bld->exec_mask, int_bld, pred, value,
                           bld->addr[reg->Register.Index][chan_index]);
        break;
  
     case TGSI_FILE_PREDICATE:
-      lp_exec_mask_store(&bld->exec_mask, bld_store, pred, value,
+      assert(LLVMTypeOf(value) == float_bld->vec_type);
+      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+      lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value,
                           bld->preds[reg->Register.Index][chan_index]);
        break;
  
     default:
        assert( 0 );
     }
+
+   (void)dtype;
+}
+
+/*
+ * Called at the beginning of the translation of each TGSI instruction, to
+ * emit some debug code.
+ */
+static void
+emit_debug(
+   struct lp_build_tgsi_context * bld_base,
+   const struct tgsi_full_instruction * inst,
+   const struct tgsi_opcode_info * info)
+
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   if (DEBUG_EXECUTION) {
+      /*
+       * Dump the TGSI instruction.
+       */
+
+      struct gallivm_state *gallivm = bld_base->base.gallivm;
+      char buf[512];
+      buf[0] = '$';
+      buf[1] = ' ';
+      tgsi_dump_instruction_str(inst, bld_base->pc, &buf[2], sizeof buf - 2);
+      lp_build_printf(gallivm, buf);
+
+      /* Dump the execution mask.
+       */
+      if (bld->exec_mask.has_mask) {
+         lp_build_print_value(gallivm, "    mask = ", bld->exec_mask.exec_mask);
+      }
+   }
  }
  
  static void
@@ -1131,6 +1865,93 @@ emit_store(
     }
  }
  
+static unsigned
+tgsi_to_pipe_tex_target(unsigned tgsi_target)
+{
+   switch (tgsi_target) {
+   case TGSI_TEXTURE_BUFFER:
+      return PIPE_BUFFER;
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_SHADOW1D:
+      return PIPE_TEXTURE_1D;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_2D_MSAA:
+      return PIPE_TEXTURE_2D;
+   case TGSI_TEXTURE_3D:
+      return PIPE_TEXTURE_3D;
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_SHADOWCUBE:
+      return PIPE_TEXTURE_CUBE;
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_SHADOWRECT:
+      return PIPE_TEXTURE_RECT;
+   case TGSI_TEXTURE_1D_ARRAY:
+   case TGSI_TEXTURE_SHADOW1D_ARRAY:
+      return PIPE_TEXTURE_1D_ARRAY;
+   case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_SHADOW2D_ARRAY:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return PIPE_TEXTURE_2D_ARRAY;
+   case TGSI_TEXTURE_CUBE_ARRAY:
+   case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+      return PIPE_TEXTURE_CUBE_ARRAY;
+   default:
+      assert(0);
+      return PIPE_BUFFER;
+   }
+}
+
+
+static enum lp_sampler_lod_property
+lp_build_lod_property(
+   struct lp_build_tgsi_context *bld_base,
+   const struct tgsi_full_instruction *inst,
+   unsigned src_op)
+{
+   const struct tgsi_full_src_register *reg = &inst->Src[src_op];
+   enum lp_sampler_lod_property lod_property;
+
+   /*
+    * Not much we can do here. We could try catching inputs declared
+    * with constant interpolation but not sure it's worth it - since for
+    * TEX opcodes as well as FETCH/LD the lod comes from same reg as
+    * the coords, so it could only work for SAMPLE/TXQ/SVIEWINFO), just
+    * like the constant/immediate recognition below.
+    * What seems to be of more value would be to recognize temps holding
+    * broadcasted scalars but no way we can do it.
+    * Tried asking llvm but without any success (using LLVMIsConstant
+    * even though this isn't exactly what we'd need), even as simple as
+    * IMM[0] UINT32 (0,-1,0,0)
+    * MOV TEMP[0] IMM[0].yyyy
+    * SVIEWINFO TEMP[1], TEMP[0].xxxx, SVIEWINFO[0]
+    * doesn't work.
+    * This means there's ZERO chance this will ever catch a scalar lod
+    * with traditional tex opcodes as well as texel fetches, since the lod
+    * comes from the same reg as coords (except some test shaders using
+    * constant coords maybe).
+    * There's at least hope for sample opcodes as well as size queries.
+    */
+   if (reg->Register.File == TGSI_FILE_CONSTANT ||
+       reg->Register.File == TGSI_FILE_IMMEDIATE) {
+      lod_property = LP_SAMPLER_LOD_SCALAR;
+   }
+   else if (bld_base->info->processor == TGSI_PROCESSOR_FRAGMENT) {
+      if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) {
+         lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
+      }
+      else {
+         lod_property = LP_SAMPLER_LOD_PER_QUAD;
+      }
+   }
+   else {
+      /* never use scalar (per-quad) lod the results are just too wrong. */
+      lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
+   }
+   return lod_property;
+}
+
+
  /**
   * High-level instruction translators.
   */
@@ -1139,19 +1960,20 @@ static void
  emit_tex( struct lp_build_tgsi_soa_context *bld,
            const struct tgsi_full_instruction *inst,
            enum lp_build_tex_modifier modifier,
-          LLVMValueRef *texel)
+          LLVMValueRef *texel,
+          unsigned sampler_reg)
  {
-   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
-   unsigned unit;
+   unsigned unit = inst->Src[sampler_reg].Register.Index;
     LLVMValueRef lod_bias, explicit_lod;
     LLVMValueRef oow = NULL;
-   LLVMValueRef coords[4];
+   LLVMValueRef coords[5];
     LLVMValueRef offsets[3] = { NULL };
     struct lp_derivatives derivs;
-   unsigned num_coords;
-   unsigned dims;
-   unsigned i;
+   struct lp_derivatives *deriv_ptr = NULL;
+   enum lp_sampler_lod_property lod_property = LP_SAMPLER_LOD_SCALAR;
+   unsigned num_derivs, num_offsets, i;
+   unsigned shadow_coord = 0;
+   unsigned layer_coord = 0;
  
     if (!bld->sampler) {
        _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
@@ -1161,55 +1983,94 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
        return;
     }
  
-   derivs.ddx_ddy[0] = bld->bld_base.base.undef;
-   derivs.ddx_ddy[1] = bld->bld_base.base.undef;
-
     switch (inst->Texture.Texture) {
-   case TGSI_TEXTURE_1D:
-      num_coords = 1;
-      dims = 1;
-      break;
     case TGSI_TEXTURE_1D_ARRAY:
-      num_coords = 2;
-      dims = 1;
+      layer_coord = 1;
+      /* fallthrough */
+   case TGSI_TEXTURE_1D:
+      num_offsets = 1;
+      num_derivs = 1;
        break;
+   case TGSI_TEXTURE_2D_ARRAY:
+      layer_coord = 2;
+      /* fallthrough */
     case TGSI_TEXTURE_2D:
     case TGSI_TEXTURE_RECT:
-      num_coords = 2;
-      dims = 2;
+      num_offsets = 2;
+      num_derivs = 2;
        break;
-   case TGSI_TEXTURE_SHADOW1D:
     case TGSI_TEXTURE_SHADOW1D_ARRAY:
-      num_coords = 3;
-      dims = 1;
+      layer_coord = 1;
+      /* fallthrough */
+   case TGSI_TEXTURE_SHADOW1D:
+      shadow_coord = 2;
+      num_offsets = 1;
+      num_derivs = 1;
+      break;
+   case TGSI_TEXTURE_SHADOW2D_ARRAY:
+      layer_coord = 2;
+      shadow_coord = 3;
+      num_offsets = 2;
+      num_derivs = 2;
        break;
     case TGSI_TEXTURE_SHADOW2D:
     case TGSI_TEXTURE_SHADOWRECT:
-   case TGSI_TEXTURE_2D_ARRAY:
+      shadow_coord = 2;
+      num_offsets = 2;
+      num_derivs = 2;
+      break;
     case TGSI_TEXTURE_CUBE:
-      num_coords = 3;
-      dims = 2;
+      num_offsets = 2;
+      num_derivs = 3;
        break;
     case TGSI_TEXTURE_3D:
-      num_coords = 3;
-      dims = 3;
+      num_offsets = 3;
+      num_derivs = 3;
        break;
-   case TGSI_TEXTURE_SHADOW2D_ARRAY:
-      num_coords = 4;
-      dims = 2;
+   case TGSI_TEXTURE_SHADOWCUBE:
+      shadow_coord = 3;
+      num_offsets = 2;
+      num_derivs = 3;
+      break;
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      num_offsets = 2;
+      num_derivs = 3;
+      layer_coord = 3;
+      break;
+   case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+      num_offsets = 2;
+      num_derivs = 3;
+      layer_coord = 3;
+      shadow_coord = 4; /* shadow coord special different reg */
        break;
+   case TGSI_TEXTURE_2D_MSAA:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
     default:
        assert(0);
        return;
     }
  
-   if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
-      lod_bias = lp_build_emit_fetch( &bld->bld_base, inst, 0, 3 );
-      explicit_lod = NULL;
-   }
-   else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
-      lod_bias = NULL;
-      explicit_lod = lp_build_emit_fetch( &bld->bld_base, inst, 0, 3 );
+   /* Note lod and especially projected are illegal in a LOT of cases */
+   if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS ||
+       modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
+      LLVMValueRef lod;
+      if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
+          inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
+         /* note that shadow cube array with bias/explicit lod does not exist */
+         lod = lp_build_emit_fetch(&bld->bld_base, inst, 1, 0);
+      }
+      else {
+         lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
+      }
+      if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
+         lod_bias = lod;
+         explicit_lod = NULL;
+      }
+      else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
+         lod_bias = NULL;
+         explicit_lod = lod;
+      }
+      lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
     }
     else {
        lod_bias = NULL;
@@ -1217,80 +2078,234 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
     }
  
     if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) {
-      oow = lp_build_emit_fetch( &bld->bld_base, inst, 0, 3 );
+      oow = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
        oow = lp_build_rcp(&bld->bld_base.base, oow);
     }
  
-   for (i = 0; i < num_coords; i++) {
-      coords[i] = lp_build_emit_fetch( &bld->bld_base, inst, 0, i );
+   for (i = 0; i < num_derivs; i++) {
+      coords[i] = lp_build_emit_fetch(&bld->bld_base, inst, 0, i);
        if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
           coords[i] = lp_build_mul(&bld->bld_base.base, coords[i], oow);
     }
-   for (i = num_coords; i < 4; i++) {
+   for (i = num_derivs; i < 5; i++) {
        coords[i] = bld->bld_base.base.undef;
     }
  
-   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
-      LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
-      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
-      LLVMValueRef ddxdyonec[3];
-      unsigned length = bld->bld_base.base.type.length;
-      unsigned num_quads = length / 4;
-      unsigned dim;
-      unsigned quad;
-
-      for (dim = 0; dim < dims; ++dim) {
-         LLVMValueRef srcx = lp_build_emit_fetch( &bld->bld_base, inst, 1, dim );
-         LLVMValueRef srcy = lp_build_emit_fetch( &bld->bld_base, inst, 2, dim );
-         for (quad = 0; quad < num_quads; ++quad) {
-            unsigned s1 = 4*quad;
-            unsigned s2 = 4*quad + length;
-            shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
-            shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s2);
-            shuffles[4*quad + 2] = i32undef;
-            shuffles[4*quad + 3] = i32undef;
-         }
-         ddxdyonec[dim] = LLVMBuildShuffleVector(builder, srcx, srcy,
-                                               LLVMConstVector(shuffles, length), "");
-      }
-      if (dims == 1) {
-         derivs.ddx_ddy[0] = ddxdyonec[0];
-      }
-      else if (dims >= 2) {
-         for (quad = 0; quad < num_quads; ++quad) {
-            unsigned s1 = 4*quad;
-            unsigned s2 = 4*quad + length;
-            shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
-            shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s1 + 1);
-            shuffles[4*quad + 2] = lp_build_const_int32(gallivm, s2);
-            shuffles[4*quad + 3] = lp_build_const_int32(gallivm, s2 + 1);
-         }
-         derivs.ddx_ddy[0] = LLVMBuildShuffleVector(builder, ddxdyonec[0], ddxdyonec[1],
-                                                  LLVMConstVector(shuffles, length), "");
-         if (dims == 3) {
-            derivs.ddx_ddy[1] = ddxdyonec[2];
+   /* Layer coord always goes into 3rd slot, except for cube map arrays */
+   if (layer_coord) {
+      if (layer_coord == 3) {
+         coords[3] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+      }
+      else {
+         coords[2] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+      }
+      if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
+         coords[2] = lp_build_mul(&bld->bld_base.base, coords[2], oow);
+   }
+   /* Shadow coord occupies always 5th slot. */
+   if (shadow_coord) {
+      if (shadow_coord == 4) {
+         coords[4] = lp_build_emit_fetch(&bld->bld_base, inst, 1, 0);
+      }
+      else {
+         coords[4] = lp_build_emit_fetch(&bld->bld_base, inst, 0, shadow_coord);
+      }
+      if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
+         coords[4] = lp_build_mul(&bld->bld_base.base, coords[4], oow);
+   }
+
+   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+      unsigned dim;
+      for (dim = 0; dim < num_derivs; ++dim) {
+         derivs.ddx[dim] = lp_build_emit_fetch(&bld->bld_base, inst, 1, dim);
+         derivs.ddy[dim] = lp_build_emit_fetch(&bld->bld_base, inst, 2, dim);
+      }
+      deriv_ptr = &derivs;
+      /*
+       * could also check all src regs if constant but I doubt such
+       * cases exist in practice.
+       */
+      if (bld->bld_base.info->processor == TGSI_PROCESSOR_FRAGMENT) {
+         if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) {
+            lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
           }
+         else {
+            lod_property = LP_SAMPLER_LOD_PER_QUAD;
+         }
+      }
+      else {
+         lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
+      }
+   }
+
+   /* some advanced gather instructions (txgo) would require 4 offsets */
+   if (inst->Texture.NumOffsets == 1) {
+      unsigned dim;
+      for (dim = 0; dim < num_offsets; dim++) {
+         offsets[dim] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, dim);
+      }
+   }
+
+   bld->sampler->emit_fetch_texel(bld->sampler,
+                                  bld->bld_base.base.gallivm,
+                                  bld->bld_base.base.type,
+                                  FALSE,
+                                  unit, unit,
+                                  coords,
+                                  offsets,
+                                  deriv_ptr,
+                                  lod_bias, explicit_lod, lod_property,
+                                  texel);
+}
+
+static void
+emit_sample(struct lp_build_tgsi_soa_context *bld,
+            const struct tgsi_full_instruction *inst,
+            enum lp_build_tex_modifier modifier,
+            boolean compare,
+            LLVMValueRef *texel)
+{
+   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+   unsigned texture_unit, sampler_unit;
+   LLVMValueRef lod_bias, explicit_lod;
+   LLVMValueRef coords[5];
+   LLVMValueRef offsets[3] = { NULL };
+   struct lp_derivatives derivs;
+   struct lp_derivatives *deriv_ptr = NULL;
+   enum lp_sampler_lod_property lod_property = LP_SAMPLER_LOD_SCALAR;
+
+   unsigned num_offsets, num_derivs, i;
+   unsigned layer_coord = 0;
+
+   if (!bld->sampler) {
+      _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
+      for (i = 0; i < 4; i++) {
+         texel[i] = bld->bld_base.base.undef;
+      }
+      return;
+   }
+
+   /*
+    * unlike old-style tex opcodes the texture/sampler indices
+    * always come from src1 and src2 respectively.
+    */
+   texture_unit = inst->Src[1].Register.Index;
+   sampler_unit = inst->Src[2].Register.Index;
+
+   /*
+    * Note inst->Texture.Texture will contain the number of offsets,
+    * however the target information is NOT there and comes from the
+    * declared sampler views instead.
+    */
+   switch (bld->sv[texture_unit].Resource) {
+   case TGSI_TEXTURE_1D:
+      num_offsets = 1;
+      num_derivs = 1;
+      break;
+   case TGSI_TEXTURE_1D_ARRAY:
+      layer_coord = 1;
+      num_offsets = 1;
+      num_derivs = 1;
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+      num_offsets = 2;
+      num_derivs = 2;
+      break;
+   case TGSI_TEXTURE_2D_ARRAY:
+      layer_coord = 2;
+      num_offsets = 2;
+      num_derivs = 2;
+      break;
+   case TGSI_TEXTURE_CUBE:
+      num_offsets = 2;
+      num_derivs = 3;
+      break;
+   case TGSI_TEXTURE_3D:
+      num_offsets = 3;
+      num_derivs = 3;
+      break;
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      layer_coord = 3;
+      num_offsets = 2;
+      num_derivs = 3;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS ||
+       modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
+      LLVMValueRef lod = lp_build_emit_fetch(&bld->bld_base, inst, 3, 0);
+      if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
+         lod_bias = lod;
+         explicit_lod = NULL;
+      }
+      else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
+         lod_bias = NULL;
+         explicit_lod = lod;
        }
-      unit = inst->Src[3].Register.Index;
-   }  else {
-      if (dims == 1) {
-         derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[0]);
+      lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
+   }
+   else if (modifier == LP_BLD_TEX_MODIFIER_LOD_ZERO) {
+      lod_bias = NULL;
+      /* XXX might be better to explicitly pass the level zero information */
+      explicit_lod = lp_build_const_vec(gallivm, bld->bld_base.base.type, 0.0F);
+   }
+   else {
+      lod_bias = NULL;
+      explicit_lod = NULL;
+   }
+
+   for (i = 0; i < num_derivs; i++) {
+      coords[i] = lp_build_emit_fetch(&bld->bld_base, inst, 0, i);
+   }
+   for (i = num_derivs; i < 5; i++) {
+      coords[i] = bld->bld_base.base.undef;
+   }
+
+   /* Layer coord always goes into 3rd slot, except for cube map arrays */
+   if (layer_coord) {
+      if (layer_coord == 3)
+         coords[3] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+      else
+         coords[2] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+   }
+   /* Shadow coord occupies always 5th slot. */
+   if (compare) {
+      coords[4] = lp_build_emit_fetch(&bld->bld_base, inst, 3, 0);
+   }
+
+   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+      unsigned dim;
+      for (dim = 0; dim < num_derivs; ++dim) {
+         derivs.ddx[dim] = lp_build_emit_fetch(&bld->bld_base, inst, 3, dim);
+         derivs.ddy[dim] = lp_build_emit_fetch(&bld->bld_base, inst, 4, dim);
        }
-      else if (dims >= 2) {
-         derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->bld_base.base,
-                                                            coords[0], coords[1]);
-         if (dims == 3) {
-            derivs.ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[2]);
+      deriv_ptr = &derivs;
+      /*
+       * could also check all src regs if constant but I doubt such
+       * cases exist in practice.
+       */
+      if (bld->bld_base.info->processor == TGSI_PROCESSOR_FRAGMENT) {
+         if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) {
+            lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
+         }
+         else {
+            lod_property = LP_SAMPLER_LOD_PER_QUAD;
           }
        }
-      unit = inst->Src[1].Register.Index;
+      else {
+         lod_property = LP_SAMPLER_LOD_PER_ELEMENT;
+      }
     }
  
     /* some advanced gather instructions (txgo) would require 4 offsets */
     if (inst->Texture.NumOffsets == 1) {
        unsigned dim;
-      for (dim = 0; dim < dims; dim++) {
-         offsets[dim] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, dim );
+      for (dim = 0; dim < num_offsets; dim++) {
+         offsets[dim] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, dim);
        }
     }
  
@@ -1298,27 +2313,41 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
                                    bld->bld_base.base.gallivm,
                                    bld->bld_base.base.type,
                                    FALSE,
-                                  unit, coords,
+                                  texture_unit, sampler_unit,
+                                  coords,
                                    offsets,
-                                  &derivs,
-                                  lod_bias, explicit_lod,
+                                  deriv_ptr,
+                                  lod_bias, explicit_lod, lod_property,
                                    texel);
+
+   if (inst->Src[1].Register.SwizzleX != PIPE_SWIZZLE_RED ||
+       inst->Src[1].Register.SwizzleY != PIPE_SWIZZLE_GREEN ||
+       inst->Src[1].Register.SwizzleZ != PIPE_SWIZZLE_BLUE ||
+       inst->Src[1].Register.SwizzleW != PIPE_SWIZZLE_ALPHA) {
+      unsigned char swizzles[4];
+      swizzles[0] = inst->Src[1].Register.SwizzleX;
+      swizzles[1] = inst->Src[1].Register.SwizzleY;
+      swizzles[2] = inst->Src[1].Register.SwizzleZ;
+      swizzles[3] = inst->Src[1].Register.SwizzleW;
+
+      lp_build_swizzle_soa_inplace(&bld->bld_base.base, texel, swizzles);
+   }
  }
  
  static void
-emit_txf( struct lp_build_tgsi_soa_context *bld,
-          const struct tgsi_full_instruction *inst,
-          LLVMValueRef *texel)
+emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
+                   const struct tgsi_full_instruction *inst,
+                   LLVMValueRef *texel,
+                   boolean is_samplei)
  {
-   unsigned unit;
+   unsigned unit, target;
     LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type);
     LLVMValueRef explicit_lod = NULL;
-   LLVMValueRef coords[3];
+   LLVMValueRef coords[5];
     LLVMValueRef offsets[3] = { NULL };
-   struct lp_derivatives derivs;
-   unsigned num_coords;
-   unsigned dims;
-   unsigned i;
+   enum lp_sampler_lod_property lod_property = LP_SAMPLER_LOD_SCALAR;
+   unsigned dims, i;
+   unsigned layer_coord = 0;
  
     if (!bld->sampler) {
        _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
@@ -1328,30 +2357,35 @@ emit_txf( struct lp_build_tgsi_soa_context *bld,
        return;
     }
  
-   derivs.ddx_ddy[0] = coord_undef;
-   derivs.ddx_ddy[1] = coord_undef;
+   unit = inst->Src[1].Register.Index;
  
-   switch (inst->Texture.Texture) {
+   if (is_samplei) {
+      target = bld->sv[unit].Resource;
+   }
+   else {
+      target = inst->Texture.Texture;
+   }
+
+   switch (target) {
     case TGSI_TEXTURE_1D:
     case TGSI_TEXTURE_BUFFER:
-      num_coords = 1;
        dims = 1;
        break;
     case TGSI_TEXTURE_1D_ARRAY:
-      num_coords = 2;
+      layer_coord = 1;
        dims = 1;
        break;
     case TGSI_TEXTURE_2D:
     case TGSI_TEXTURE_RECT:
-      num_coords = 2;
+   case TGSI_TEXTURE_2D_MSAA:
        dims = 2;
        break;
     case TGSI_TEXTURE_2D_ARRAY:
-      num_coords = 3;
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      layer_coord = 2;
        dims = 2;
        break;
     case TGSI_TEXTURE_3D:
-      num_coords = 3;
        dims = 3;
        break;
     default:
@@ -1359,24 +2393,29 @@ emit_txf( struct lp_build_tgsi_soa_context *bld,
        return;
     }
  
-   /* always have lod except for buffers ? */
-   if (inst->Texture.Texture != TGSI_TEXTURE_BUFFER) {
-      explicit_lod = lp_build_emit_fetch( &bld->bld_base, inst, 0, 3 );
+   /* always have lod except for buffers and msaa targets ? */
+   if (target != TGSI_TEXTURE_BUFFER &&
+       target != TGSI_TEXTURE_2D_MSAA &&
+       target != TGSI_TEXTURE_2D_ARRAY_MSAA) {
+      explicit_lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
+      lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
     }
+   /* XXX: for real msaa support, the w component would be the sample index. */
  
-   for (i = 0; i < num_coords; i++) {
-      coords[i] = lp_build_emit_fetch( &bld->bld_base, inst, 0, i );
+   for (i = 0; i < dims; i++) {
+      coords[i] = lp_build_emit_fetch(&bld->bld_base, inst, 0, i);
     }
-   for (i = num_coords; i < 3; i++) {
+   /* never use more than 3 coords here but emit_fetch_texel copies all 5 anyway */
+   for (i = dims; i < 5; i++) {
        coords[i] = coord_undef;
     }
-
-   unit = inst->Src[1].Register.Index;
+   if (layer_coord)
+      coords[2] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
  
     if (inst->Texture.NumOffsets == 1) {
        unsigned dim;
        for (dim = 0; dim < dims; dim++) {
-         offsets[dim] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, dim );
+         offsets[dim] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, dim);
        }
     }
  
@@ -1384,91 +2423,90 @@ emit_txf( struct lp_build_tgsi_soa_context *bld,
                                    bld->bld_base.base.gallivm,
                                    bld->bld_base.base.type,
                                    TRUE,
-                                  unit, coords,
+                                  unit, unit,
+                                  coords,
                                    offsets,
-                                  &derivs,
-                                  NULL, explicit_lod,
+                                  NULL,
+                                  NULL, explicit_lod, lod_property,
                                    texel);
+
+   if (is_samplei &&
+       (inst->Src[1].Register.SwizzleX != PIPE_SWIZZLE_RED ||
+        inst->Src[1].Register.SwizzleY != PIPE_SWIZZLE_GREEN ||
+        inst->Src[1].Register.SwizzleZ != PIPE_SWIZZLE_BLUE ||
+        inst->Src[1].Register.SwizzleW != PIPE_SWIZZLE_ALPHA)) {
+      unsigned char swizzles[4];
+      swizzles[0] = inst->Src[1].Register.SwizzleX;
+      swizzles[1] = inst->Src[1].Register.SwizzleY;
+      swizzles[2] = inst->Src[1].Register.SwizzleZ;
+      swizzles[3] = inst->Src[1].Register.SwizzleW;
+
+      lp_build_swizzle_soa_inplace(&bld->bld_base.base, texel, swizzles);
+   }
  }
  
  static void
-emit_txq( struct lp_build_tgsi_soa_context *bld,
-          const struct tgsi_full_instruction *inst,
-          LLVMValueRef *sizes_out)
+emit_size_query( struct lp_build_tgsi_soa_context *bld,
+                 const struct tgsi_full_instruction *inst,
+                 LLVMValueRef *sizes_out,
+                 boolean is_sviewinfo)
  {
     LLVMValueRef explicit_lod;
-   unsigned num_coords, has_lod;
+   enum lp_sampler_lod_property lod_property;
+   unsigned has_lod;
     unsigned i;
+   unsigned unit = inst->Src[1].Register.Index;
+   unsigned target, pipe_target;
  
-   switch (inst->Texture.Texture) {
-   case TGSI_TEXTURE_1D:
-   case TGSI_TEXTURE_SHADOW1D:
-   case TGSI_TEXTURE_SHADOW2D:
-   case TGSI_TEXTURE_SHADOWCUBE:
-      num_coords = 1;
-      has_lod = 1;
-      break;
-   case TGSI_TEXTURE_2D:
-   case TGSI_TEXTURE_CUBE:
-   case TGSI_TEXTURE_1D_ARRAY:
-   case TGSI_TEXTURE_SHADOW1D_ARRAY:
-      num_coords = 2;
-      has_lod = 1;
-      break;
-   case TGSI_TEXTURE_3D:
-// case TGSI_TEXTURE_CUBE_ARRAY:
-// case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
-   case TGSI_TEXTURE_2D_ARRAY:
-   case TGSI_TEXTURE_SHADOW2D_ARRAY:
-      num_coords = 3;
-      has_lod = 1;
-      break;
-
+   if (is_sviewinfo) {
+      target = bld->sv[unit].Resource;
+   }
+   else {
+      target = inst->Texture.Texture;
+   }
+   switch (target) {
     case TGSI_TEXTURE_BUFFER:
-      num_coords = 1;
-      has_lod = 0;
-      break;
-
     case TGSI_TEXTURE_RECT:
     case TGSI_TEXTURE_SHADOWRECT:
-// case TGSI_TEXTURE_2D_MS:
-      num_coords = 2;
        has_lod = 0;
        break;
-
-// case TGSI_TEXTURE_2D_MS_ARRAY:
-//    num_coords = 3;
-//    has_lod = 0;
-//    break;
-
     default:
-      assert(0);
-      return;
+      has_lod = 1;
+      break;
     }
  
     if (!bld->sampler) {
        _debug_printf("warning: found texture query instruction but no sampler generator supplied\n");
-      for (i = 0; i < num_coords; i++)
-         sizes_out[i] = bld->bld_base.base.undef;
+      for (i = 0; i < 4; i++)
+         sizes_out[i] = bld->bld_base.int_bld.undef;
        return;
     }
  
-   if (has_lod)
-      explicit_lod = lp_build_emit_fetch( &bld->bld_base, inst, 0, 2 );
-   else
+   if (has_lod) {
+      explicit_lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 0);
+      lod_property = lp_build_lod_property(&bld->bld_base, inst, 0);
+   }
+   else {
        explicit_lod = NULL;
+      lod_property = LP_SAMPLER_LOD_SCALAR;
+   }
+
+
+   pipe_target = tgsi_to_pipe_tex_target(target);
  
     bld->sampler->emit_size_query(bld->sampler,
                                   bld->bld_base.base.gallivm,
                                   bld->bld_base.int_bld.type,
-                                 inst->Src[1].Register.Index,
+                                 unit, pipe_target,
+                                 TRUE,
+                                 lod_property,
                                   explicit_lod,
                                   sizes_out);
  }
  
  static boolean
  near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
-                  int pc)
+                   int pc)
  {
     int i;
  
@@ -1476,27 +2514,38 @@ near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
        unsigned opcode;
  
        if (pc + i >= bld->bld_base.info->num_instructions)
-        return TRUE;
+         return TRUE;
  
        opcode = bld->bld_base.instructions[pc + i].Instruction.Opcode;
  
        if (opcode == TGSI_OPCODE_END)
-        return TRUE;
+         return TRUE;
  
        if (opcode == TGSI_OPCODE_TEX ||
-         opcode == TGSI_OPCODE_TXP ||
-         opcode == TGSI_OPCODE_TXD ||
-         opcode == TGSI_OPCODE_TXB ||
-         opcode == TGSI_OPCODE_TXL ||
-         opcode == TGSI_OPCODE_TXF ||
-         opcode == TGSI_OPCODE_TXQ ||
-         opcode == TGSI_OPCODE_CAL ||
-         opcode == TGSI_OPCODE_CALLNZ ||
-         opcode == TGSI_OPCODE_IF ||
-         opcode == TGSI_OPCODE_IFC ||
-         opcode == TGSI_OPCODE_BGNLOOP ||
-         opcode == TGSI_OPCODE_SWITCH)
-        return FALSE;
+         opcode == TGSI_OPCODE_TXP ||
+         opcode == TGSI_OPCODE_TXD ||
+         opcode == TGSI_OPCODE_TXB ||
+         opcode == TGSI_OPCODE_TXL ||
+         opcode == TGSI_OPCODE_TXF ||
+         opcode == TGSI_OPCODE_TXQ ||
+         opcode == TGSI_OPCODE_TEX2 ||
+         opcode == TGSI_OPCODE_TXB2 ||
+         opcode == TGSI_OPCODE_TXL2 ||
+         opcode == TGSI_OPCODE_SAMPLE ||
+         opcode == TGSI_OPCODE_SAMPLE_B ||
+         opcode == TGSI_OPCODE_SAMPLE_C ||
+         opcode == TGSI_OPCODE_SAMPLE_C_LZ ||
+         opcode == TGSI_OPCODE_SAMPLE_D ||
+         opcode == TGSI_OPCODE_SAMPLE_I ||
+         opcode == TGSI_OPCODE_SAMPLE_L ||
+         opcode == TGSI_OPCODE_SVIEWINFO ||
+         opcode == TGSI_OPCODE_CAL ||
+         opcode == TGSI_OPCODE_CALLNZ ||
+         opcode == TGSI_OPCODE_IF ||
+         opcode == TGSI_OPCODE_UIF ||
+         opcode == TGSI_OPCODE_BGNLOOP ||
+         opcode == TGSI_OPCODE_SWITCH)
+         return FALSE;
     }
  
     return TRUE;
@@ -1508,7 +2557,7 @@ near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
   * Kill fragment if any of the src register values are negative.
   */
  static void
-emit_kil(
+emit_kill_if(
     struct lp_build_tgsi_soa_context *bld,
     const struct tgsi_full_instruction *inst,
     int pc)
@@ -1551,23 +2600,25 @@ emit_kil(
        }
     }
  
-   if(mask) {
-      lp_build_mask_update(bld->mask, mask);
-
-      if (!near_end_of_shader(bld, pc))
-        lp_build_mask_check(bld->mask);
+   if (bld->exec_mask.has_mask) {
+      LLVMValueRef invmask;
+      invmask = LLVMBuildNot(builder, bld->exec_mask.exec_mask, "kilp");
+      mask = LLVMBuildOr(builder, mask, invmask, "");
     }
+
+   lp_build_mask_update(bld->mask, mask);
+   if (!near_end_of_shader(bld, pc))
+      lp_build_mask_check(bld->mask);
  }
  
  
  /**
- * Predicated fragment kill.
- * XXX Actually, we do an unconditional kill (as in tgsi_exec.c).
+ * Unconditional fragment kill.
   * The only predication is the execution mask which will apply if
   * we're inside a loop or conditional.
   */
  static void
-emit_kilp(struct lp_build_tgsi_soa_context *bld,
+emit_kill(struct lp_build_tgsi_soa_context *bld,
            int pc)
  {
     LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
@@ -1596,42 +2647,79 @@ emit_kilp(struct lp_build_tgsi_soa_context *bld,
   * to stdout.
   */
  static void
-emit_dump_temps(struct lp_build_tgsi_soa_context *bld)
+emit_dump_file(struct lp_build_tgsi_soa_context *bld,
+               unsigned file)
  {
+   const struct tgsi_shader_info *info = bld->bld_base.info;
     struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
     LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef temp_ptr;
-   LLVMValueRef i0 = lp_build_const_int32(gallivm, 0);
-   LLVMValueRef i1 = lp_build_const_int32(gallivm, 1);
-   LLVMValueRef i2 = lp_build_const_int32(gallivm, 2);
-   LLVMValueRef i3 = lp_build_const_int32(gallivm, 3);
+   LLVMValueRef reg_ptr;
     int index;
-   int n = bld->bld_base.info->file_max[TGSI_FILE_TEMPORARY];
+   int max_index = info->file_max[file];
+
+   /*
+    * Some register files, particularly constants, can be very large,
+    * and dumping everything could make this unusably slow.
+    */
+   max_index = MIN2(max_index, 32);
  
-   for (index = 0; index < n; index++) {
-      LLVMValueRef idx = lp_build_const_int32(gallivm, index);
-      LLVMValueRef v[4][4], res;
+   for (index = 0; index <= max_index; index++) {
+      LLVMValueRef res;
+      unsigned mask;
        int chan;
  
-      lp_build_printf(gallivm, "TEMP[%d]:\n", idx);
+      if (index < 8 * sizeof(unsigned) &&
+          (info->file_mask[file] & (1 << index)) == 0)  {
+         /* This was not declared.*/
+         continue;
+      }
  
-      for (chan = 0; chan < 4; chan++) {
-         temp_ptr = lp_get_temp_ptr_soa(bld, index, chan);
-         res = LLVMBuildLoad(builder, temp_ptr, "");
-         v[chan][0] = LLVMBuildExtractElement(builder, res, i0, "");
-         v[chan][1] = LLVMBuildExtractElement(builder, res, i1, "");
-         v[chan][2] = LLVMBuildExtractElement(builder, res, i2, "");
-         v[chan][3] = LLVMBuildExtractElement(builder, res, i3, "");
+      if (file == TGSI_FILE_INPUT) {
+         mask = info->input_usage_mask[index];
+      } else {
+         mask = TGSI_WRITEMASK_XYZW;
        }
  
-      lp_build_printf(gallivm, "  X: %f %f %f %f\n",
-                      v[0][0], v[0][1], v[0][2], v[0][3]);
-      lp_build_printf(gallivm, "  Y: %f %f %f %f\n",
-                      v[1][0], v[1][1], v[1][2], v[1][3]);
-      lp_build_printf(gallivm, "  Z: %f %f %f %f\n",
-                      v[2][0], v[2][1], v[2][2], v[2][3]);
-      lp_build_printf(gallivm, "  W: %f %f %f %f\n",
-                      v[3][0], v[3][1], v[3][2], v[3][3]);
+      for (chan = 0; chan < 4; chan++) {
+         if ((mask & (1 << chan)) == 0) {
+            /* This channel is not used.*/
+            continue;
+         }
+
+         if (file == TGSI_FILE_CONSTANT) {
+            struct tgsi_full_src_register reg;
+            memset(&reg, 0, sizeof reg);
+            reg.Register.File = file;
+            reg.Register.Index = index;
+            reg.Register.SwizzleX = 0;
+            reg.Register.SwizzleY = 1;
+            reg.Register.SwizzleZ = 2;
+            reg.Register.SwizzleW = 3;
+
+            res = bld->bld_base.emit_fetch_funcs[file](&bld->bld_base, &reg, TGSI_TYPE_FLOAT, chan);
+            if (!res) {
+               continue;
+            }
+         } else if (file == TGSI_FILE_INPUT) {
+            res = bld->inputs[index][chan];
+            if (!res) {
+               continue;
+            }
+         } else if (file == TGSI_FILE_TEMPORARY) {
+            reg_ptr = lp_get_temp_ptr_soa(bld, index, chan);
+            assert(reg_ptr);
+            res = LLVMBuildLoad(builder, reg_ptr, "");
+         } else if (file == TGSI_FILE_OUTPUT) {
+            reg_ptr = lp_get_output_ptr(bld, index, chan);
+            assert(reg_ptr);
+            res = LLVMBuildLoad(builder, reg_ptr, "");
+         } else {
+            assert(0);
+            continue;
+         }
+
+         emit_dump_reg(gallivm, file, index, chan, res);
+      }
     }
  }
  
@@ -1649,218 +2737,582 @@ lp_emit_declaration_soa(
     const unsigned last = decl->Range.Last;
     unsigned idx, i;
  
-   for (idx = first; idx <= last; ++idx) {
-      assert(last <= bld->bld_base.info->file_max[decl->Declaration.File]);
-      switch (decl->Declaration.File) {
-      case TGSI_FILE_TEMPORARY:
-         assert(idx < LP_MAX_TGSI_TEMPS);
-         if (!(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY))) {
-            for (i = 0; i < TGSI_NUM_CHANNELS; i++)
-               bld->temps[idx][i] = lp_build_alloca(gallivm, vec_type, "temp");
-         }
-         break;
+   assert(last <= bld->bld_base.info->file_max[decl->Declaration.File]);
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_TEMPORARY:
+      if (!(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY))) {
+         assert(last < LP_MAX_INLINED_TEMPS);
+         for (idx = first; idx <= last; ++idx) {
+            for (i = 0; i < TGSI_NUM_CHANNELS; i++)
+               bld->temps[idx][i] = lp_build_alloca(gallivm, vec_type, "temp");
+         }
+      }
+      break;
+
+   case TGSI_FILE_OUTPUT:
+      if (!(bld->indirect_files & (1 << TGSI_FILE_OUTPUT))) {
+         for (idx = first; idx <= last; ++idx) {
+            for (i = 0; i < TGSI_NUM_CHANNELS; i++)
+               bld->outputs[idx][i] = lp_build_alloca(gallivm,
+                                                      vec_type, "output");
+         }
+      }
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      /* ADDR registers are only allocated with an integer LLVM IR type,
+       * as they are guaranteed to always have integers.
+       * XXX: Not sure if this exception is worthwhile (or the whole idea of
+       * an ADDR register for that matter).
+       */
+      assert(last < LP_MAX_TGSI_ADDRS);
+      for (idx = first; idx <= last; ++idx) {
+         assert(idx < LP_MAX_TGSI_ADDRS);
+         for (i = 0; i < TGSI_NUM_CHANNELS; i++)
+            bld->addr[idx][i] = lp_build_alloca(gallivm, bld_base->base.int_vec_type, "addr");
+      }
+      break;
+
+   case TGSI_FILE_PREDICATE:
+      assert(last < LP_MAX_TGSI_PREDS);
+      for (idx = first; idx <= last; ++idx) {
+         for (i = 0; i < TGSI_NUM_CHANNELS; i++)
+            bld->preds[idx][i] = lp_build_alloca(gallivm, vec_type,
+                                                 "predicate");
+      }
+      break;
+
+   case TGSI_FILE_SAMPLER_VIEW:
+      /*
+       * The target stored here MUST match whatever there actually
+       * is in the set sampler views (what about return type?).
+       */
+      assert(last < PIPE_MAX_SHADER_SAMPLER_VIEWS);
+      for (idx = first; idx <= last; ++idx) {
+         bld->sv[idx] = decl->SamplerView;
+      }
+      break;
+
+   case TGSI_FILE_CONSTANT:
+   {
+      /*
+       * We could trivially fetch the per-buffer pointer when fetching the
+       * constant, relying on llvm to figure out it's always the same pointer
+       * anyway. However, doing so results in a huge (more than factor of 10)
+       * slowdown in llvm compilation times for some (but not all) shaders
+       * (more specifically, the IR optimization spends way more time in
+       * DominatorTree::dominates). At least with llvm versions 3.1, 3.3.
+       */
+      unsigned idx2D = decl->Dim.Index2D;
+      LLVMValueRef index2D = lp_build_const_int32(gallivm, idx2D);
+      assert(idx2D < LP_MAX_TGSI_CONST_BUFFERS);
+      bld->consts[idx2D] =
+         lp_build_array_get(gallivm, bld->consts_ptr, index2D);
+      bld->consts_sizes[idx2D] =
+         lp_build_array_get(gallivm, bld->const_sizes_ptr, index2D);
+   }
+      break;
+
+   default:
+      /* don't need to declare other vars */
+      break;
+   }
+}
+
+
+void lp_emit_immediate_soa(
+   struct lp_build_tgsi_context *bld_base,
+   const struct tgsi_full_immediate *imm)
+{
+   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+   struct gallivm_state * gallivm = bld_base->base.gallivm;
+   LLVMValueRef imms[4];
+   unsigned i;
+   const uint size = imm->Immediate.NrTokens - 1;
+   assert(size <= 4);
+   switch (imm->Immediate.DataType) {
+   case TGSI_IMM_FLOAT32:
+      for( i = 0; i < size; ++i )
+         imms[i] =
+               lp_build_const_vec(gallivm, bld_base->base.type, imm->u[i].Float);
+
+      break;
+   case TGSI_IMM_UINT32:
+      for( i = 0; i < size; ++i ) {
+         LLVMValueRef tmp = lp_build_const_vec(gallivm, bld_base->uint_bld.type, imm->u[i].Uint);
+         imms[i] = LLVMConstBitCast(tmp, bld_base->base.vec_type);
+      }
+
+      break;
+   case TGSI_IMM_INT32:
+      for( i = 0; i < size; ++i ) {
+         LLVMValueRef tmp = lp_build_const_vec(gallivm, bld_base->int_bld.type, imm->u[i].Int);
+         imms[i] = LLVMConstBitCast(tmp, bld_base->base.vec_type);
+      }
+
+      break;
+   }
+   for( i = size; i < 4; ++i )
+      imms[i] = bld_base->base.undef;
+
+   if (bld->use_immediates_array) {
+      unsigned index = bld->num_immediates;
+      struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+      LLVMBuilderRef builder = gallivm->builder;
+
+      assert(bld->indirect_files & (1 << TGSI_FILE_IMMEDIATE));
+      for (i = 0; i < 4; ++i ) {
+         LLVMValueRef lindex = lp_build_const_int32(
+                  bld->bld_base.base.gallivm, index * 4 + i);
+         LLVMValueRef imm_ptr = LLVMBuildGEP(builder,
+                                             bld->imms_array, &lindex, 1, "");
+         LLVMBuildStore(builder, imms[i], imm_ptr);
+      }
+   } else {
+      /* simply copy the immediate values into the next immediates[] slot */
+      unsigned i;
+      const uint size = imm->Immediate.NrTokens - 1;
+      assert(size <= 4);
+      assert(bld->num_immediates < LP_MAX_INLINED_IMMEDIATES);
+
+      for(i = 0; i < 4; ++i )
+         bld->immediates[bld->num_immediates][i] = imms[i];
+
+      if (bld->indirect_files & (1 << TGSI_FILE_IMMEDIATE)) {
+         unsigned index = bld->num_immediates;
+         struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+         LLVMBuilderRef builder = gallivm->builder;
+         for (i = 0; i < 4; ++i ) {
+            LLVMValueRef lindex = lp_build_const_int32(
+                     bld->bld_base.base.gallivm, index * 4 + i);
+            LLVMValueRef imm_ptr = LLVMBuildGEP(builder,
+                                                bld->imms_array, &lindex, 1, "");
+            LLVMBuildStore(builder,
+                           bld->immediates[index][i],
+                           imm_ptr);
+         }
+      }
+   }
+
+   bld->num_immediates++;
+}
+
+static void
+ddx_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_fetch_deriv(bld, emit_data->args[0], NULL,
+                    &emit_data->output[emit_data->chan], NULL);
+}
+
+static void
+ddy_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_fetch_deriv(bld, emit_data->args[0], NULL, NULL,
+                    &emit_data->output[emit_data->chan]);
+}
+
+static void
+kill_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_kill(bld, bld_base->pc - 1);
+}
+
+static void
+kill_if_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_kill_if(bld, emit_data->inst, bld_base->pc - 1);
+}
+
+static void
+tex_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+            emit_data->output, 1);
+}
+
+static void
+tex2_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+            emit_data->output, 2);
+}
+
+static void
+txb_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
+            emit_data->output, 1);
+}
+
+static void
+txb2_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
+            emit_data->output, 2);
+}
  
-      case TGSI_FILE_OUTPUT:
-         if (!(bld->indirect_files & (1 << TGSI_FILE_OUTPUT))) {
-            for (i = 0; i < TGSI_NUM_CHANNELS; i++)
-               bld->outputs[idx][i] = lp_build_alloca(gallivm,
-                                                      vec_type, "output");
-         }
-         break;
+static void
+txd_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-      case TGSI_FILE_ADDRESS:
-        /* ADDR registers are the only allocated with an integer LLVM IR type,
-         * as they are guaranteed to always have integers.
-         * XXX: Not sure if this exception is worthwhile (or the whole idea of
-         * an ADDR register for that matter).
-         */
-         assert(idx < LP_MAX_TGSI_ADDRS);
-         for (i = 0; i < TGSI_NUM_CHANNELS; i++)
-            bld->addr[idx][i] = lp_build_alloca(gallivm, bld_base->base.int_vec_type, "addr");
-         break;
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV,
+            emit_data->output, 3);
+}
  
-      case TGSI_FILE_PREDICATE:
-         assert(idx < LP_MAX_TGSI_PREDS);
-         for (i = 0; i < TGSI_NUM_CHANNELS; i++)
-            bld->preds[idx][i] = lp_build_alloca(gallivm, vec_type,
-                                                 "predicate");
-         break;
+static void
+txl_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-      default:
-         /* don't need to declare other vars */
-         break;
-      }
-   }
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
+            emit_data->output, 1);
  }
  
-
-void lp_emit_immediate_soa(
-   struct lp_build_tgsi_context *bld_base,
-   const struct tgsi_full_immediate *imm)
+static void
+txl2_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
  {
-   struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-   struct gallivm_state * gallivm = bld_base->base.gallivm;
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   /* simply copy the immediate values into the next immediates[] slot */
-   unsigned i;
-   const uint size = imm->Immediate.NrTokens - 1;
-   assert(size <= 4);
-   assert(bld->num_immediates < LP_MAX_TGSI_IMMEDIATES);
-   switch (imm->Immediate.DataType) {
-   case TGSI_IMM_FLOAT32:
-      for( i = 0; i < size; ++i )
-         bld->immediates[bld->num_immediates][i] =
-            lp_build_const_vec(gallivm, bld_base->base.type, imm->u[i].Float);
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
+            emit_data->output, 2);
+}
  
-      break;
-   case TGSI_IMM_UINT32:
-      for( i = 0; i < size; ++i ) {
-         LLVMValueRef tmp = lp_build_const_vec(gallivm, bld_base->uint_bld.type, imm->u[i].Uint);
-         bld->immediates[bld->num_immediates][i] =
-            LLVMConstBitCast(tmp, bld_base->base.vec_type);
-      }
+static void
+txp_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-      break;
-   case TGSI_IMM_INT32:
-      for( i = 0; i < size; ++i ) {
-         LLVMValueRef tmp = lp_build_const_vec(gallivm, bld_base->int_bld.type, imm->u[i].Int);
-         bld->immediates[bld->num_immediates][i] =
-            LLVMConstBitCast(tmp, bld_base->base.vec_type);
-      }
-            
-      break;
-   }
-   for( i = size; i < 4; ++i )
-      bld->immediates[bld->num_immediates][i] = bld_base->base.undef;
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_PROJECTED,
+            emit_data->output, 1);
+}
  
-   bld->num_immediates++;
+static void
+txq_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_size_query(bld, emit_data->inst, emit_data->output, FALSE);
  }
  
  static void
-ddx_emit(
+txf_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   emit_fetch_deriv(bld, emit_data->args[0], NULL,
-                    &emit_data->output[emit_data->chan], NULL);
+   emit_fetch_texels(bld, emit_data->inst, emit_data->output, FALSE);
  }
  
  static void
-ddy_emit(
+sample_i_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   emit_fetch_deriv(bld, emit_data->args[0], NULL, NULL,
-                    &emit_data->output[emit_data->chan]);
+   emit_fetch_texels(bld, emit_data->inst, emit_data->output, TRUE);
  }
  
  static void
-kilp_emit(
+sample_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   emit_kilp(bld, bld_base->pc - 1);
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+               FALSE, emit_data->output);
  }
  
  static void
-kil_emit(
+sample_b_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   emit_kil(bld, emit_data->inst, bld_base->pc - 1);
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
+               FALSE, emit_data->output);
  }
  
  static void
-tex_emit(
+sample_c_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE, emit_data->output);
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+               TRUE, emit_data->output);
  }
  
  static void
-txb_emit(
+sample_c_lz_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
-            emit_data->output);
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_ZERO,
+               TRUE, emit_data->output);
  }
  
  static void
-txd_emit(
+sample_d_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV,
-            emit_data->output);
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV,
+               FALSE, emit_data->output);
  }
  
  static void
-txl_emit(
+sample_l_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
-            emit_data->output);
+   emit_sample(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
+               FALSE, emit_data->output);
  }
  
  static void
-txp_emit(
+sviewinfo_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_PROJECTED,
-            emit_data->output);
+   emit_size_query(bld, emit_data->inst, emit_data->output, TRUE);
+}
+
+static LLVMValueRef
+mask_vec(struct lp_build_tgsi_context *bld_base)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   struct lp_exec_mask *exec_mask = &bld->exec_mask;
+
+   if (!exec_mask->has_mask) {
+      return lp_build_mask_value(bld->mask);
+   }
+   return LLVMBuildAnd(builder, lp_build_mask_value(bld->mask),
+                       exec_mask->exec_mask, "");
  }
  
  static void
-txq_emit(
+increment_vec_ptr_by_mask(struct lp_build_tgsi_context * bld_base,
+                          LLVMValueRef ptr,
+                          LLVMValueRef mask)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMValueRef current_vec = LLVMBuildLoad(builder, ptr, "");
+
+   current_vec = LLVMBuildSub(builder, current_vec, mask, "");
+
+   LLVMBuildStore(builder, current_vec, ptr);
+}
+
+static void
+clear_uint_vec_ptr_from_mask(struct lp_build_tgsi_context * bld_base,
+                             LLVMValueRef ptr,
+                             LLVMValueRef mask)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMValueRef current_vec = LLVMBuildLoad(builder, ptr, "");
+
+   current_vec = lp_build_select(&bld_base->uint_bld,
+                                 mask,
+                                 bld_base->uint_bld.zero,
+                                 current_vec);
+
+   LLVMBuildStore(builder, current_vec, ptr);
+}
+
+static LLVMValueRef
+clamp_mask_to_max_output_vertices(struct lp_build_tgsi_soa_context * bld,
+                                  LLVMValueRef current_mask_vec,
+                                  LLVMValueRef total_emitted_vertices_vec)
+{
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+   struct lp_build_context *int_bld = &bld->bld_base.int_bld;
+   LLVMValueRef max_mask = lp_build_cmp(int_bld, PIPE_FUNC_LESS,
+                                        total_emitted_vertices_vec,
+                                        bld->max_output_vertices_vec);
+
+   return LLVMBuildAnd(builder, current_mask_vec, max_mask, "");
+}
+
+static void
+emit_vertex(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
  
-   emit_txq(bld, emit_data->inst, emit_data->output);
+   if (bld->gs_iface->emit_vertex) {
+      LLVMValueRef mask = mask_vec(bld_base);
+      LLVMValueRef total_emitted_vertices_vec =
+         LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, "");
+      mask = clamp_mask_to_max_output_vertices(bld, mask,
+                                               total_emitted_vertices_vec);
+      gather_outputs(bld);
+      bld->gs_iface->emit_vertex(bld->gs_iface, &bld->bld_base,
+                                 bld->outputs,
+                                 total_emitted_vertices_vec);
+      increment_vec_ptr_by_mask(bld_base, bld->emitted_vertices_vec_ptr,
+                                mask);
+      increment_vec_ptr_by_mask(bld_base, bld->total_emitted_vertices_vec_ptr,
+                                mask);
+#if DUMP_GS_EMITS
+      lp_build_print_value(bld->bld_base.base.gallivm,
+                           " +++ emit vertex masked ones = ",
+                           mask);
+      lp_build_print_value(bld->bld_base.base.gallivm,
+                           " +++ emit vertex emitted = ",
+                           total_emitted_vertices_vec);
+#endif
+   }
  }
  
+
  static void
-txf_emit(
+end_primitive_masked(struct lp_build_tgsi_context * bld_base,
+                     LLVMValueRef mask)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+
+   if (bld->gs_iface->end_primitive) {
+      struct lp_build_context *uint_bld = &bld_base->uint_bld;
+      LLVMValueRef emitted_vertices_vec =
+         LLVMBuildLoad(builder, bld->emitted_vertices_vec_ptr, "");
+      LLVMValueRef emitted_prims_vec =
+         LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, "");
+
+      LLVMValueRef emitted_mask = lp_build_cmp(uint_bld, PIPE_FUNC_NOTEQUAL,
+                                               emitted_vertices_vec,
+                                               uint_bld->zero);
+      /* We need to combine the current execution mask with the mask
+         telling us which, if any, execution slots actually have
+         unemitted primitives, this way we make sure that end_primitives
+         executes only on the paths that have unflushed vertices */
+      mask = LLVMBuildAnd(builder, mask, emitted_mask, "");
+
+      bld->gs_iface->end_primitive(bld->gs_iface, &bld->bld_base,
+                                   emitted_vertices_vec,
+                                   emitted_prims_vec);
+
+#if DUMP_GS_EMITS
+      lp_build_print_value(bld->bld_base.base.gallivm,
+                           " +++ end prim masked ones = ",
+                           mask);
+      lp_build_print_value(bld->bld_base.base.gallivm,
+                           " +++ end prim emitted verts1 = ",
+                           emitted_vertices_vec);
+      lp_build_print_value(bld->bld_base.base.gallivm,
+                           " +++ end prim emitted prims1 = ",
+                           LLVMBuildLoad(builder,
+                                         bld->emitted_prims_vec_ptr, ""));
+#endif
+      increment_vec_ptr_by_mask(bld_base, bld->emitted_prims_vec_ptr,
+                                mask);
+      clear_uint_vec_ptr_from_mask(bld_base, bld->emitted_vertices_vec_ptr,
+                                   mask);
+#if DUMP_GS_EMITS
+      lp_build_print_value(bld->bld_base.base.gallivm,
+                           " +++ end prim emitted verts2 = ",
+                           LLVMBuildLoad(builder,
+                                         bld->emitted_vertices_vec_ptr, ""));
+#endif
+   }
+
+}
+
+static void
+end_primitive(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   emit_txf(bld, emit_data->inst, emit_data->output);
+   if (bld->gs_iface->end_primitive) {
+      LLVMValueRef mask = mask_vec(bld_base);
+      end_primitive_masked(bld_base, mask);
+   }
  }
  
  static void
@@ -1894,7 +3346,25 @@ brk_emit(
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   lp_exec_break(&bld->exec_mask);
+   lp_exec_break(&bld->exec_mask, bld_base);
+}
+
+static void
+breakc_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   LLVMValueRef unsigned_cond = 
+      LLVMBuildBitCast(builder, emit_data->args[0], uint_bld->vec_type, "");
+   LLVMValueRef cond = lp_build_cmp(uint_bld, PIPE_FUNC_NOTEQUAL,
+                                    unsigned_cond,
+                                    uint_bld->zero);
+
+   lp_exec_break_condition(&bld->exec_mask, cond);
  }
  
  static void
@@ -1912,173 +3382,139 @@ if_emit(
  }
  
  static void
-bgnloop_emit(
+uif_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
+   LLVMValueRef tmp;
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
  
-   lp_exec_bgnloop(&bld->exec_mask);
+   tmp = lp_build_cmp(uint_bld, PIPE_FUNC_NOTEQUAL,
+                      emit_data->args[0], uint_bld->zero);
+   lp_exec_mask_cond_push(&bld->exec_mask, tmp);
  }
  
  static void
-bgnsub_emit(
+case_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   lp_exec_mask_bgnsub(&bld->exec_mask);
+   lp_exec_case(&bld->exec_mask, emit_data->args[0]);
  }
  
  static void
-else_emit(
+default_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   lp_exec_mask_cond_invert(&bld->exec_mask);
+   lp_exec_default(&bld->exec_mask, bld_base);
  }
  
  static void
-endif_emit(
+switch_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   lp_exec_mask_cond_pop(&bld->exec_mask);
+   lp_exec_switch(&bld->exec_mask, emit_data->args[0]);
  }
  
  static void
-endloop_emit(
+endswitch_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   lp_exec_endloop(bld_base->base.gallivm, &bld->exec_mask);
+   lp_exec_endswitch(&bld->exec_mask, bld_base);
  }
  
  static void
-endsub_emit(
+bgnloop_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   lp_exec_mask_endsub(&bld->exec_mask, &bld_base->pc);
+   lp_exec_bgnloop(&bld->exec_mask);
  }
  
  static void
-cont_emit(
+bgnsub_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   lp_exec_continue(&bld->exec_mask);
+   lp_exec_mask_bgnsub(&bld->exec_mask);
  }
  
-/* XXX: Refactor and move it to lp_bld_tgsi_action.c
- *
- * XXX: What do the comments about xmm registers mean?  Maybe they are left over
- * from old code, but there is no garauntee that LLVM will use those registers
- * for this code.
- *
- * XXX: There should be no calls to lp_build_emit_fetch in this function.  This
- * should be handled by the emit_data->fetch_args function. */
  static void
-nrm_emit(
+else_emit(
     const struct lp_build_tgsi_action * action,
     struct lp_build_tgsi_context * bld_base,
     struct lp_build_emit_data * emit_data)
  {
-   LLVMValueRef tmp0, tmp1;
-   LLVMValueRef tmp4 = NULL;
-   LLVMValueRef tmp5 = NULL;
-   LLVMValueRef tmp6 = NULL;
-   LLVMValueRef tmp7 = NULL;
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   uint dims = (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+   lp_exec_mask_cond_invert(&bld->exec_mask);
+}
  
-  if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X) ||
-      TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Y) ||
-      TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Z) ||
-      (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_W) && dims == 4)) {
+static void
+endif_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-      /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
+   lp_exec_mask_cond_pop(&bld->exec_mask);
+}
  
-      /* xmm4 = src.x */
-      /* xmm0 = src.x * src.x */
-      tmp0 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_X);
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X)) {
-         tmp4 = tmp0;
-      }
-      tmp0 = lp_build_mul( &bld->bld_base.base, tmp0, tmp0);
+static void
+endloop_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-      /* xmm5 = src.y */
-      /* xmm0 = xmm0 + src.y * src.y */
-      tmp1 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_Y);
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Y)) {
-         tmp5 = tmp1;
-      }
-      tmp1 = lp_build_mul( &bld->bld_base.base, tmp1, tmp1);
-      tmp0 = lp_build_add( &bld->bld_base.base, tmp0, tmp1);
+   lp_exec_endloop(bld_base->base.gallivm, &bld->exec_mask);
+}
  
-      /* xmm6 = src.z */
-      /* xmm0 = xmm0 + src.z * src.z */
-      tmp1 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_Z);
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Z)) {
-         tmp6 = tmp1;
-      }
-      tmp1 = lp_build_mul( &bld->bld_base.base, tmp1, tmp1);
-      tmp0 = lp_build_add( &bld->bld_base.base, tmp0, tmp1);
+static void
+endsub_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-      if (dims == 4) {
-         /* xmm7 = src.w */
-         /* xmm0 = xmm0 + src.w * src.w */
-         tmp1 = lp_build_emit_fetch(&bld->bld_base, emit_data->inst, 0, TGSI_CHAN_W);
-         if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_W)) {
-            tmp7 = tmp1;
-         }
-         tmp1 = lp_build_mul( &bld->bld_base.base, tmp1, tmp1);
-         tmp0 = lp_build_add( &bld->bld_base.base, tmp0, tmp1);
-      }
-      /* xmm1 = 1 / sqrt(xmm0) */
-      tmp1 = lp_build_rsqrt( &bld->bld_base.base, tmp0);
-       /* dst.x = xmm1 * src.x */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X)) {
-         emit_data->output[TGSI_CHAN_X] = lp_build_mul( &bld->bld_base.base, tmp4, tmp1);
-      }
-      /* dst.y = xmm1 * src.y */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Y)) {
-         emit_data->output[TGSI_CHAN_Y] = lp_build_mul( &bld->bld_base.base, tmp5, tmp1);
-      }
+   lp_exec_mask_endsub(&bld->exec_mask, &bld_base->pc);
+}
  
-      /* dst.z = xmm1 * src.z */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_Z)) {
-         emit_data->output[TGSI_CHAN_Z] = lp_build_mul( &bld->bld_base.base, tmp6, tmp1);
-      }
-      /* dst.w = xmm1 * src.w */
-      if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_X) && dims == 4) {
-         emit_data->output[TGSI_CHAN_W] = lp_build_mul( &bld->bld_base.base, tmp7, tmp1);
-      }
-   }
+static void
+cont_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
  
-   /* dst.w = 1.0 */
-   if (TGSI_IS_DST0_CHANNEL_ENABLED(emit_data->inst, TGSI_CHAN_W) && dims == 3) {
-       emit_data->output[TGSI_CHAN_W] = bld->bld_base.base.one;
-   }
+   lp_exec_continue(&bld->exec_mask);
  }
  
  static void emit_prologue(struct lp_build_tgsi_context * bld_base)
@@ -2104,9 +3540,18 @@ static void emit_prologue(struct lp_build_tgsi_context * bld_base)
                                                  "output_array");
     }
  
+   if (bld->indirect_files & (1 << TGSI_FILE_IMMEDIATE)) {
+      LLVMValueRef array_size =
+         lp_build_const_int32(gallivm,
+                         bld_base->info->file_max[TGSI_FILE_IMMEDIATE] * 4 + 4);
+      bld->imms_array = lp_build_array_alloca(gallivm,
+                                              bld_base->base.vec_type, array_size,
+                                              "imms_array");
+   }
+
     /* If we have indirect addressing in inputs we need to copy them into
      * our alloca array to be able to iterate over them */
-   if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
+   if (bld->indirect_files & (1 << TGSI_FILE_INPUT) && !bld->gs_iface) {
        unsigned index, chan;
        LLVMTypeRef vec_type = bld_base->base.vec_type;
        LLVMValueRef array_size = lp_build_const_int32(gallivm,
@@ -2131,28 +3576,73 @@ static void emit_prologue(struct lp_build_tgsi_context * bld_base)
           }
        }
     }
+
+   if (bld->gs_iface) {
+      struct lp_build_context *uint_bld = &bld->bld_base.uint_bld;
+      bld->emitted_prims_vec_ptr =
+         lp_build_alloca(gallivm,
+                         uint_bld->vec_type,
+                         "emitted_prims_ptr");
+      bld->emitted_vertices_vec_ptr =
+         lp_build_alloca(gallivm,
+                         uint_bld->vec_type,
+                         "emitted_vertices_ptr");
+      bld->total_emitted_vertices_vec_ptr =
+         lp_build_alloca(gallivm,
+                         uint_bld->vec_type,
+                         "total_emitted_vertices_ptr");
+
+      LLVMBuildStore(gallivm->builder, uint_bld->zero,
+                     bld->emitted_prims_vec_ptr);
+      LLVMBuildStore(gallivm->builder, uint_bld->zero,
+                     bld->emitted_vertices_vec_ptr);
+      LLVMBuildStore(gallivm->builder, uint_bld->zero,
+                     bld->total_emitted_vertices_vec_ptr);
+   }
+
+   if (DEBUG_EXECUTION) {
+      lp_build_printf(gallivm, "\n");
+      emit_dump_file(bld, TGSI_FILE_CONSTANT);
+      if (!bld->gs_iface)
+         emit_dump_file(bld, TGSI_FILE_INPUT);
+   }
  }
  
  static void emit_epilogue(struct lp_build_tgsi_context * bld_base)
  {
     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
  
-   if (0) {
+   if (DEBUG_EXECUTION) {
        /* for debugging */
-      emit_dump_temps(bld);
+      if (0) {
+         emit_dump_file(bld, TGSI_FILE_TEMPORARY);
+      }
+      emit_dump_file(bld, TGSI_FILE_OUTPUT);
+      lp_build_printf(bld_base->base.gallivm, "\n");
     }
  
     /* If we have indirect addressing in outputs we need to copy our alloca array
-    * to the outputs slots specified by the called */
-   if (bld->indirect_files & (1 << TGSI_FILE_OUTPUT)) {
-      unsigned index, chan;
-      assert(bld_base->info->num_outputs <=
-                        bld_base->info->file_max[TGSI_FILE_OUTPUT] + 1);
-      for (index = 0; index < bld_base->info->num_outputs; ++index) {
-         for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
-            bld->outputs[index][chan] = lp_get_output_ptr(bld, index, chan);
-         }
-      }
+    * to the outputs slots specified by the caller */
+   if (bld->gs_iface) {
+      LLVMValueRef total_emitted_vertices_vec;
+      LLVMValueRef emitted_prims_vec;
+      /* implicit end_primitives, needed in case there are any unflushed
+         vertices in the cache. Note must not call end_primitive here
+         since the exec_mask is not valid at this point. */
+      end_primitive_masked(bld_base, lp_build_mask_value(bld->mask));
+      
+      total_emitted_vertices_vec =
+         LLVMBuildLoad(builder, bld->total_emitted_vertices_vec_ptr, "");
+      emitted_prims_vec =
+         LLVMBuildLoad(builder, bld->emitted_prims_vec_ptr, "");
+
+      bld->gs_iface->gs_epilogue(bld->gs_iface,
+                                 &bld->bld_base,
+                                 total_emitted_vertices_vec,
+                                 emitted_prims_vec);
+   } else {
+      gather_outputs(bld);
     }
  }
  
@@ -2162,12 +3652,13 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
                    struct lp_type type,
                    struct lp_build_mask_context *mask,
                    LLVMValueRef consts_ptr,
+                  LLVMValueRef const_sizes_ptr,
                    const struct lp_bld_tgsi_system_values *system_values,
-                  const LLVMValueRef *pos,
                    const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS],
                    LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
                    struct lp_build_sampler_soa *sampler,
-                  const struct tgsi_shader_info *info)
+                  const struct tgsi_shader_info *info,
+                  const struct lp_build_tgsi_gs_iface *gs_iface)
  {
     struct lp_build_tgsi_soa_context bld;
  
@@ -2186,15 +3677,36 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
     lp_build_context_init(&bld.bld_base.int_bld, gallivm, lp_int_type(type));
     lp_build_context_init(&bld.elem_bld, gallivm, lp_elem_type(type));
     bld.mask = mask;
-   bld.pos = pos;
     bld.inputs = inputs;
     bld.outputs = outputs;
     bld.consts_ptr = consts_ptr;
+   bld.const_sizes_ptr = const_sizes_ptr;
     bld.sampler = sampler;
     bld.bld_base.info = info;
     bld.indirect_files = info->indirect_files;
  
+   /*
+    * If the number of temporaries is rather large then we just
+    * allocate them as an array right from the start and treat
+    * like indirect temporaries.
+    */
+   if (info->file_max[TGSI_FILE_TEMPORARY] >= LP_MAX_INLINED_TEMPS) {
+      bld.indirect_files |= (1 << TGSI_FILE_TEMPORARY);
+   }
+   /*
+    * For performance reason immediates are always backed in a static
+    * array, but if their number is too great, we have to use just
+    * a dynamically allocated array.
+    */
+   bld.use_immediates_array =
+         (info->file_max[TGSI_FILE_IMMEDIATE] >= LP_MAX_INLINED_IMMEDIATES);
+   if (bld.use_immediates_array) {
+      bld.indirect_files |= (1 << TGSI_FILE_IMMEDIATE);
+   }
+
+
     bld.bld_base.soa = TRUE;
+   bld.bld_base.emit_debug = emit_debug;
     bld.bld_base.emit_fetch_funcs[TGSI_FILE_CONSTANT] = emit_fetch_constant;
     bld.bld_base.emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch_immediate;
     bld.bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_input;
@@ -2214,20 +3726,24 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
     bld.bld_base.op_actions[TGSI_OPCODE_BGNLOOP].emit = bgnloop_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_BGNSUB].emit = bgnsub_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_BRK].emit = brk_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_BREAKC].emit = breakc_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_CAL].emit = cal_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_CASE].emit = case_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_CONT].emit = cont_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_DDX].emit = ddx_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_DDY].emit = ddy_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_DEFAULT].emit = default_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_ELSE].emit = else_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_ENDSUB].emit = endsub_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_ENDSWITCH].emit = endswitch_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_IF].emit = if_emit;
-   bld.bld_base.op_actions[TGSI_OPCODE_KIL].emit = kil_emit;
-   bld.bld_base.op_actions[TGSI_OPCODE_KILP].emit = kilp_emit;
-   bld.bld_base.op_actions[TGSI_OPCODE_NRM].emit = nrm_emit;
-   bld.bld_base.op_actions[TGSI_OPCODE_NRM4].emit = nrm_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_UIF].emit = uif_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_KILL_IF].emit = kill_if_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_KILL].emit = kill_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_RET].emit = ret_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_SWITCH].emit = switch_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_TEX].emit = tex_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_TXB].emit = txb_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_TXD].emit = txd_emit;
@@ -2235,8 +3751,45 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
     bld.bld_base.op_actions[TGSI_OPCODE_TXP].emit = txp_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
     bld.bld_base.op_actions[TGSI_OPCODE_TXF].emit = txf_emit;
-
-   lp_exec_mask_init(&bld.exec_mask, &bld.bld_base.base);
+   bld.bld_base.op_actions[TGSI_OPCODE_TEX2].emit = tex2_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TXB2].emit = txb2_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TXL2].emit = txl2_emit;
+   /* DX10 sampling ops */
+   bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE].emit = sample_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_B].emit = sample_b_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_C].emit = sample_c_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_C_LZ].emit = sample_c_lz_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_D].emit = sample_d_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_I].emit = sample_i_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_L].emit = sample_l_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_SVIEWINFO].emit = sviewinfo_emit;
+
+   if (gs_iface) {
+      /* There's no specific value for this because it should always
+       * be set, but apps using ext_geometry_shader4 quite often
+       * were forgetting so we're using MAX_VERTEX_VARYING from
+       * that spec even though we could debug_assert if it's not
+       * set, but that's a lot uglier. */
+      uint max_output_vertices;
+
+      /* inputs are always indirect with gs */
+      bld.indirect_files |= (1 << TGSI_FILE_INPUT);
+      bld.gs_iface = gs_iface;
+      bld.bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_gs_input;
+      bld.bld_base.op_actions[TGSI_OPCODE_EMIT].emit = emit_vertex;
+      bld.bld_base.op_actions[TGSI_OPCODE_ENDPRIM].emit = end_primitive;
+
+      max_output_vertices =
+            info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
+      if (!max_output_vertices)
+         max_output_vertices = 32;
+
+      bld.max_output_vertices_vec =
+         lp_build_const_int_vec(gallivm, bld.bld_base.int_bld.type,
+                                max_output_vertices);
+   }
+
+   lp_exec_mask_init(&bld.exec_mask, &bld.bld_base.int_bld);
  
     bld.system_values = *system_values;
  
@@ -2257,4 +3810,5 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
        LLVMDumpModule(module);
  
     }
+   lp_exec_mask_fini(&bld.exec_mask);
  }