Merge branch 'lp-offset-twoside'
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_tgsi_soa.c
index e0c395619397991591822f12c906c1ec4c39e433..7f0f058c2225d75dd53fca86bec148f486f690e6 100644 (file)
@@ -49,6 +49,7 @@
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_bitarit.h"
 #include "lp_bld_gather.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
@@ -57,6 +58,7 @@
 #include "lp_bld_tgsi.h"
 #include "lp_bld_limits.h"
 #include "lp_bld_debug.h"
+#include "lp_bld_printf.h"
 
 
 #define FOR_EACH_CHANNEL( CHAN )\
@@ -118,8 +120,11 @@ struct lp_build_tgsi_soa_context
 {
    struct lp_build_context base;
 
-   /* Builder for integer masks and indices */
-   struct lp_build_context int_bld;
+   /* Builder for vector integer masks and indices */
+   struct lp_build_context uint_bld;
+
+   /* Builder for scalar elements of shader's data type (float) */
+   struct lp_build_context elem_bld;
 
    LLVMValueRef consts_ptr;
    const LLVMValueRef *pos;
@@ -133,10 +138,27 @@ struct lp_build_tgsi_soa_context
    LLVMValueRef addr[LP_MAX_TGSI_ADDRS][NUM_CHANNELS];
    LLVMValueRef preds[LP_MAX_TGSI_PREDS][NUM_CHANNELS];
 
-   /* we allocate an array of temps if we have indirect
-    * addressing and then the temps above is unused */
+   /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is
+    * set in the indirect_files field.
+    * The temps[] array above is unused then.
+    */
    LLVMValueRef temps_array;
-   boolean has_indirect_addressing;
+
+   /* We allocate/use this array of output if (1 << TGSI_FILE_OUTPUT) is
+    * set in the indirect_files field.
+    * The outputs[] array above is unused then.
+    */
+   LLVMValueRef outputs_array;
+
+   /* We allocate/use this array of inputs if (1 << TGSI_FILE_INPUT) is
+    * set in the indirect_files field.
+    * The inputs[] array above is unused then.
+    */
+   LLVMValueRef inputs_array;
+
+   const struct tgsi_shader_info *info;
+   /** bitmask indicating which register files are accessed indirectly */
+   unsigned indirect_files;
 
    struct lp_build_mask_context *mask;
    struct lp_exec_mask exec_mask;
@@ -196,8 +218,10 @@ static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
    }
    mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask;
    assert(LLVMTypeOf(val) == mask->int_vec_type);
-   mask->cond_mask = val;
-
+   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
+                                  mask->cond_mask,
+                                  val,
+                                  "");
    lp_exec_mask_update(mask);
 }
 
@@ -418,7 +442,7 @@ get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
              unsigned chan)
 {
    assert(chan < 4);
-   if (bld->has_indirect_addressing) {
+   if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
       LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan);
       return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, "");
    }
@@ -427,6 +451,26 @@ get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
    }
 }
 
+/**
+ * Return pointer to a output register channel (src or dest).
+ * Note that indirect addressing cannot be handled here.
+ * \param index  which output register
+ * \param chan  which channel of the output register.
+ */
+static LLVMValueRef
+get_output_ptr(struct lp_build_tgsi_soa_context *bld,
+               unsigned index,
+               unsigned chan)
+{
+   assert(chan < 4);
+   if (bld->indirect_files & (1 << TGSI_FILE_OUTPUT)) {
+      LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan);
+      return LLVMBuildGEP(bld->base.builder, bld->outputs_array, &lindex, 1, "");
+   }
+   else {
+      return bld->outputs[index][chan];
+   }
+}
 
 /**
  * Gather vector.
@@ -449,7 +493,7 @@ build_gather(struct lp_build_tgsi_soa_context *bld,
       LLVMValueRef index = LLVMBuildExtractElement(bld->base.builder,
                                                    indexes, ii, "");
       LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, base_ptr,
-                                             &index, 1, "");
+                                             &index, 1, "gather_ptr");
       LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
 
       res = LLVMBuildInsertElement(bld->base.builder, res, scalar, ii, "");
@@ -459,35 +503,100 @@ build_gather(struct lp_build_tgsi_soa_context *bld,
 }
 
 
+/**
+ * Scatter/store vector.
+ */
+static void
+emit_mask_scatter(struct lp_build_tgsi_soa_context *bld,
+                  LLVMValueRef base_ptr,
+                  LLVMValueRef indexes,
+                  LLVMValueRef values,
+                  struct lp_exec_mask *mask,
+                  LLVMValueRef pred)
+{
+   LLVMBuilderRef builder = bld->base.builder;
+   unsigned i;
+
+   /* Mix the predicate and execution mask */
+   if (mask->has_mask) {
+      if (pred) {
+         pred = LLVMBuildAnd(mask->bld->builder, pred, mask->exec_mask, "");
+      }
+      else {
+         pred = mask->exec_mask;
+      }
+   }
+
+   /*
+    * Loop over elements of index_vec, store scalar value.
+    */
+   for (i = 0; i < bld->base.type.length; i++) {
+      LLVMValueRef ii = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef index = LLVMBuildExtractElement(builder, indexes, ii, "");
+      LLVMValueRef scalar_ptr = LLVMBuildGEP(builder, base_ptr, &index, 1, "scatter_ptr");
+      LLVMValueRef val = LLVMBuildExtractElement(builder, values, ii, "scatter_val");
+      LLVMValueRef scalar_pred = pred ?
+         LLVMBuildExtractElement(builder, pred, ii, "scatter_pred") : NULL;
+
+      if (0)
+         lp_build_printf(builder, "scatter %d: val %f at %d %p\n",
+                         ii, val, index, scalar_ptr);
+
+      if (scalar_pred) {
+         LLVMValueRef real_val, dst_val;
+         dst_val = LLVMBuildLoad(builder, scalar_ptr, "");
+         real_val = lp_build_select(&bld->elem_bld, scalar_pred, val, dst_val);
+         LLVMBuildStore(builder, real_val, scalar_ptr);
+      }
+      else {
+         LLVMBuildStore(builder, val, scalar_ptr);
+      }
+   }
+}
+
+
 /**
  * Read the current value of the ADDR register, convert the floats to
- * ints, multiply by four and return the vector of offsets.
+ * ints, add the base index and return the vector of offsets.
  * The offsets will be used to index into the constant buffer or
  * temporary register file.
  */
 static LLVMValueRef
-get_indirect_offsets(struct lp_build_tgsi_soa_context *bld,
-                     const struct tgsi_src_register *indirect_reg)
+get_indirect_index(struct lp_build_tgsi_soa_context *bld,
+                   unsigned reg_file, unsigned reg_index,
+                   const struct tgsi_src_register *indirect_reg)
 {
+   struct lp_build_context *uint_bld = &bld->uint_bld;
    /* always use X component of address register */
-   const int x = indirect_reg->SwizzleX;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
-   uint swizzle = tgsi_util_get_src_register_swizzle(indirect_reg, x);
-   LLVMValueRef vec4 = lp_build_const_int_vec(bld->int_bld.type, 4); 
-   LLVMValueRef addr_vec;
+   unsigned swizzle = indirect_reg->SwizzleX;
+   LLVMValueRef base;
+   LLVMValueRef rel;
+   LLVMValueRef max_index;
+   LLVMValueRef index;
+
+   assert(bld->indirect_files & (1 << reg_file));
+
+   base = lp_build_const_int_vec(uint_bld->type, reg_index);
 
-   addr_vec = LLVMBuildLoad(bld->base.builder,
-                            bld->addr[indirect_reg->Index][swizzle],
-                            "load addr reg");
+   assert(swizzle < 4);
+   rel = LLVMBuildLoad(bld->base.builder,
+                        bld->addr[indirect_reg->Index][swizzle],
+                        "load addr reg");
 
    /* for indexing we want integers */
-   addr_vec = LLVMBuildFPToSI(bld->base.builder, addr_vec,
-                              int_vec_type, "");
+   rel = LLVMBuildFPToSI(bld->base.builder,
+                         rel,
+                         uint_bld->vec_type, "");
 
-   /* addr_vec = addr_vec * 4 */
-   addr_vec = lp_build_mul(&bld->base, addr_vec, vec4);
+   index = lp_build_add(uint_bld, base, rel);
 
-   return addr_vec;
+   max_index = lp_build_const_int_vec(uint_bld->type,
+                                      bld->info->file_max[reg_file]);
+
+   assert(!uint_bld->type.sign);
+   index = lp_build_min(uint_bld, index, max_index);
+
+   return index;
 }
 
 
@@ -501,11 +610,12 @@ emit_fetch(
    unsigned src_op,
    const unsigned chan_index )
 {
+   struct lp_build_context *uint_bld = &bld->uint_bld;
    const struct tgsi_full_src_register *reg = &inst->Src[src_op];
    const unsigned swizzle =
       tgsi_util_get_full_src_register_swizzle(reg, chan_index);
    LLVMValueRef res;
-   LLVMValueRef addr_vec = NULL;
+   LLVMValueRef indirect_index = NULL;
 
    if (swizzle > 3) {
       assert(0 && "invalid swizzle in emit_fetch()");
@@ -513,20 +623,24 @@ emit_fetch(
    }
 
    if (reg->Register.Indirect) {
-      addr_vec = get_indirect_offsets(bld, &reg->Indirect);
+      indirect_index = get_indirect_index(bld,
+                                          reg->Register.File,
+                                          reg->Register.Index,
+                                          &reg->Indirect);
+   } else {
+      assert(reg->Register.Index <= bld->info->file_max[reg->Register.File]);
    }
 
    switch (reg->Register.File) {
    case TGSI_FILE_CONSTANT:
       if (reg->Register.Indirect) {
+         LLVMValueRef swizzle_vec =
+            lp_build_const_int_vec(uint_bld->type, swizzle);
          LLVMValueRef index_vec;  /* index into the const buffer */
 
-         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
-         index_vec = lp_build_const_int_vec(bld->int_bld.type,
-                                            reg->Register.Index * 4 + swizzle);
-
-         /* index_vec = index_vec + addr_vec */
-         index_vec = lp_build_add(&bld->base, index_vec, addr_vec);
+         /* index_vec = indirect_index * 4 + swizzle */
+         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
 
          /* Gather values from the constant buffer */
          res = build_gather(bld, bld->consts_ptr, index_vec);
@@ -551,33 +665,59 @@ emit_fetch(
       break;
 
    case TGSI_FILE_INPUT:
-      res = bld->inputs[reg->Register.Index][swizzle];
+      if (reg->Register.Indirect) {
+         LLVMValueRef swizzle_vec =
+            lp_build_const_int_vec(uint_bld->type, swizzle);
+         LLVMValueRef length_vec =
+            lp_build_const_int_vec(uint_bld->type, bld->base.type.length);
+         LLVMValueRef index_vec;  /* index into the const buffer */
+         LLVMValueRef inputs_array;
+         LLVMTypeRef float4_ptr_type;
+
+         /* index_vec = (indirect_index * 4 + swizzle) * length */
+         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
+         index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
+
+         /* cast inputs_array pointer to float* */
+         float4_ptr_type = LLVMPointerType(LLVMFloatType(), 0);
+         inputs_array = LLVMBuildBitCast(uint_bld->builder, bld->inputs_array,
+                                        float4_ptr_type, "");
+
+         /* Gather values from the temporary register array */
+         res = build_gather(bld, inputs_array, index_vec);
+      } else {
+         if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
+            LLVMValueRef lindex = lp_build_const_int32(reg->Register.Index * 4 + swizzle);
+            LLVMValueRef input_ptr =  LLVMBuildGEP(bld->base.builder,
+                                                   bld->inputs_array, &lindex, 1, "");
+            res = LLVMBuildLoad(bld->base.builder, input_ptr, "");
+         }
+         else {
+            res = bld->inputs[reg->Register.Index][swizzle];
+         }
+      }
       assert(res);
       break;
 
    case TGSI_FILE_TEMPORARY:
       if (reg->Register.Indirect) {
-         LLVMValueRef vec_len =
-            lp_build_const_int_vec(bld->int_bld.type, bld->base.type.length);
+         LLVMValueRef swizzle_vec =
+            lp_build_const_int_vec(uint_bld->type, swizzle);
+         LLVMValueRef length_vec =
+            lp_build_const_int_vec(uint_bld->type, bld->base.type.length);
          LLVMValueRef index_vec;  /* index into the const buffer */
          LLVMValueRef temps_array;
          LLVMTypeRef float4_ptr_type;
 
-         assert(bld->has_indirect_addressing);
-
-         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
-         index_vec = lp_build_const_int_vec(bld->int_bld.type,
-                                            reg->Register.Index * 4 + swizzle);
-
-         /* index_vec += addr_vec */
-         index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec);
-
-         /* index_vec *= vector_length */
-         index_vec = lp_build_mul(&bld->int_bld, index_vec, vec_len);
+         /* index_vec = (indirect_index * 4 + swizzle) * length */
+         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
+         index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
 
          /* cast temps_array pointer to float* */
          float4_ptr_type = LLVMPointerType(LLVMFloatType(), 0);
-         temps_array = LLVMBuildBitCast(bld->int_bld.builder, bld->temps_array,
+         temps_array = LLVMBuildBitCast(uint_bld->builder, bld->temps_array,
                                         float4_ptr_type, "");
 
          /* Gather values from the temporary register array */
@@ -603,13 +743,10 @@ emit_fetch(
       break;
 
    case TGSI_UTIL_SIGN_SET:
-      /* TODO: Use bitwese OR for floating point */
       res = lp_build_abs( &bld->base, res );
-      res = LLVMBuildNeg( bld->base.builder, res, "" );
-      break;
-
+      /* fall through */
    case TGSI_UTIL_SIGN_TOGGLE:
-      res = LLVMBuildNeg( bld->base.builder, res, "" );
+      res = lp_build_negate( &bld->base, res );
       break;
 
    case TGSI_UTIL_SIGN_KEEP:
@@ -730,7 +867,8 @@ emit_store(
    LLVMValueRef value)
 {
    const struct tgsi_full_dst_register *reg = &inst->Dst[index];
-   LLVMValueRef addr = NULL;
+   struct lp_build_context *uint_bld = &bld->uint_bld;
+   LLVMValueRef indirect_index = NULL;
 
    switch( inst->Instruction.Saturate ) {
    case TGSI_SAT_NONE:
@@ -751,32 +889,91 @@ emit_store(
    }
 
    if (reg->Register.Indirect) {
-      /* XXX use get_indirect_offsets() here eventually */
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
-      unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
-      addr = LLVMBuildLoad(bld->base.builder,
-                           bld->addr[reg->Indirect.Index][swizzle],
-                           "");
-      /* for indexing we want integers */
-      addr = LLVMBuildFPToSI(bld->base.builder, addr,
-                             int_vec_type, "");
-      addr = LLVMBuildExtractElement(bld->base.builder,
-                                     addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
-                                     "");
-      addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0));
+      indirect_index = get_indirect_index(bld,
+                                          reg->Register.File,
+                                          reg->Register.Index,
+                                          &reg->Indirect);
+   } else {
+      assert(reg->Register.Index <= bld->info->file_max[reg->Register.File]);
    }
 
    switch( reg->Register.File ) {
    case TGSI_FILE_OUTPUT:
-      lp_exec_mask_store(&bld->exec_mask, pred, value,
-                         bld->outputs[reg->Register.Index][chan_index]);
+      if (reg->Register.Indirect) {
+         LLVMBuilderRef builder = bld->base.builder;
+         LLVMValueRef chan_vec =
+            lp_build_const_int_vec(uint_bld->type, chan_index);
+         LLVMValueRef length_vec =
+            lp_build_const_int_vec(uint_bld->type, bld->base.type.length);
+         LLVMValueRef index_vec;  /* indexes into the temp registers */
+         LLVMValueRef outputs_array;
+         LLVMValueRef pixel_offsets;
+         LLVMTypeRef float_ptr_type;
+         int i;
+
+         /* build pixel offset vector: {0, 1, 2, 3, ...} */
+         pixel_offsets = uint_bld->undef;
+         for (i = 0; i < bld->base.type.length; i++) {
+            LLVMValueRef ii = lp_build_const_int32(i);
+            pixel_offsets = LLVMBuildInsertElement(builder, pixel_offsets,
+                                                   ii, ii, "");
+         }
+
+         /* index_vec = (indirect_index * 4 + chan_index) * length + offsets */
+         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec = lp_build_add(uint_bld, index_vec, chan_vec);
+         index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
+         index_vec = lp_build_add(uint_bld, index_vec, pixel_offsets);
+
+         float_ptr_type = LLVMPointerType(LLVMFloatType(), 0);
+         outputs_array = LLVMBuildBitCast(builder, bld->outputs_array,
+                                          float_ptr_type, "");
+
+         /* Scatter store values into temp registers */
+         emit_mask_scatter(bld, outputs_array, index_vec, value,
+                           &bld->exec_mask, pred);
+      }
+      else {
+         LLVMValueRef out_ptr = get_output_ptr(bld, reg->Register.Index,
+                                               chan_index);
+         lp_exec_mask_store(&bld->exec_mask, pred, value, out_ptr);
+      }
       break;
 
    case TGSI_FILE_TEMPORARY:
       if (reg->Register.Indirect) {
-         /* XXX not done yet */
-         debug_printf("WARNING: LLVM scatter store of temp regs"
-                      " not implemented\n");
+         LLVMBuilderRef builder = bld->base.builder;
+         LLVMValueRef chan_vec =
+            lp_build_const_int_vec(uint_bld->type, chan_index);
+         LLVMValueRef length_vec =
+            lp_build_const_int_vec(uint_bld->type, bld->base.type.length);
+         LLVMValueRef index_vec;  /* indexes into the temp registers */
+         LLVMValueRef temps_array;
+         LLVMValueRef pixel_offsets;
+         LLVMTypeRef float_ptr_type;
+         int i;
+
+         /* build pixel offset vector: {0, 1, 2, 3, ...} */
+         pixel_offsets = uint_bld->undef; 
+         for (i = 0; i < bld->base.type.length; i++) {
+            LLVMValueRef ii = lp_build_const_int32(i);
+            pixel_offsets = LLVMBuildInsertElement(builder, pixel_offsets,
+                                                   ii, ii, "");
+         }
+
+         /* index_vec = (indirect_index * 4 + chan_index) * length + offsets */
+         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec = lp_build_add(uint_bld, index_vec, chan_vec);
+         index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
+         index_vec = lp_build_add(uint_bld, index_vec, pixel_offsets);
+
+         float_ptr_type = LLVMPointerType(LLVMFloatType(), 0);
+         temps_array = LLVMBuildBitCast(builder, bld->temps_array,
+                                        float_ptr_type, "");
+
+         /* Scatter store values into temp registers */
+         emit_mask_scatter(bld, temps_array, index_vec, value,
+                           &bld->exec_mask, pred);
       }
       else {
          LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
@@ -792,7 +989,7 @@ emit_store(
 
    case TGSI_FILE_PREDICATE:
       lp_exec_mask_store(&bld->exec_mask, pred, value,
-                         bld->preds[index][chan_index]);
+                         bld->preds[reg->Register.Index][chan_index]);
       break;
 
    default:
@@ -805,18 +1002,10 @@ emit_store(
  * High-level instruction translators.
  */
 
-enum tex_modifier {
-   TEX_MODIFIER_NONE = 0,
-   TEX_MODIFIER_PROJECTED,
-   TEX_MODIFIER_LOD_BIAS,
-   TEX_MODIFIER_EXPLICIT_LOD,
-   TEX_MODIFIER_EXPLICIT_DERIV
-};
-
 static void
 emit_tex( struct lp_build_tgsi_soa_context *bld,
           const struct tgsi_full_instruction *inst,
-          enum tex_modifier modifier,
+          enum lp_build_tex_modifier modifier,
           LLVMValueRef *texel)
 {
    unsigned unit;
@@ -856,11 +1045,11 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       return;
    }
 
-   if (modifier == TEX_MODIFIER_LOD_BIAS) {
+   if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
       lod_bias = emit_fetch( bld, inst, 0, 3 );
       explicit_lod = NULL;
    }
-   else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
+   else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
       lod_bias = NULL;
       explicit_lod = emit_fetch( bld, inst, 0, 3 );
    }
@@ -869,36 +1058,40 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       explicit_lod = NULL;
    }
 
-   if (modifier == TEX_MODIFIER_PROJECTED) {
+   if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) {
       oow = emit_fetch( bld, inst, 0, 3 );
       oow = lp_build_rcp(&bld->base, oow);
    }
 
    for (i = 0; i < num_coords; i++) {
       coords[i] = emit_fetch( bld, inst, 0, i );
-      if (modifier == TEX_MODIFIER_PROJECTED)
+      if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
          coords[i] = lp_build_mul(&bld->base, coords[i], oow);
    }
    for (i = num_coords; i < 3; i++) {
       coords[i] = bld->base.undef;
    }
 
-   if (modifier == TEX_MODIFIER_EXPLICIT_DERIV) {
+   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+      LLVMTypeRef i32t = LLVMInt32Type();
+      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
       for (i = 0; i < num_coords; i++) {
-         ddx[i] = emit_fetch( bld, inst, 1, i );
-         ddy[i] = emit_fetch( bld, inst, 2, i );
+         LLVMValueRef src1 = emit_fetch( bld, inst, 1, i );
+         LLVMValueRef src2 = emit_fetch( bld, inst, 2, i );
+         ddx[i] = LLVMBuildExtractElement(bld->base.builder, src1, index0, "");
+         ddy[i] = LLVMBuildExtractElement(bld->base.builder, src2, index0, "");
       }
       unit = inst->Src[3].Register.Index;
    }  else {
       for (i = 0; i < num_coords; i++) {
-         ddx[i] = lp_build_ddx( &bld->base, coords[i] );
-         ddy[i] = lp_build_ddy( &bld->base, coords[i] );
+         ddx[i] = lp_build_scalar_ddx( &bld->base, coords[i] );
+         ddy[i] = lp_build_scalar_ddy( &bld->base, coords[i] );
       }
       unit = inst->Src[1].Register.Index;
    }
    for (i = num_coords; i < 3; i++) {
-      ddx[i] = bld->base.undef;
-      ddy[i] = bld->base.undef;
+      ddx[i] = LLVMGetUndef(bld->base.elem_type);
+      ddy[i] = LLVMGetUndef(bld->base.elem_type);
    }
 
    bld->sampler->emit_fetch_texel(bld->sampler,
@@ -910,6 +1103,43 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
                                   texel);
 }
 
+static boolean
+near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
+                  int pc)
+{
+   int i;
+
+   for (i = 0; i < 5; i++) {
+      unsigned opcode;
+
+      if (pc + i >= bld->info->num_instructions)
+        return TRUE;
+
+      opcode = bld->instructions[pc + i].Instruction.Opcode;
+
+      if (opcode == TGSI_OPCODE_END)
+        return TRUE;
+
+      if (opcode == TGSI_OPCODE_TEX ||
+         opcode == TGSI_OPCODE_TXP ||
+         opcode == TGSI_OPCODE_TXD ||
+         opcode == TGSI_OPCODE_TXB ||
+         opcode == TGSI_OPCODE_TXL ||
+         opcode == TGSI_OPCODE_TXF ||
+         opcode == TGSI_OPCODE_TXQ ||
+         opcode == TGSI_OPCODE_CAL ||
+         opcode == TGSI_OPCODE_CALLNZ ||
+         opcode == TGSI_OPCODE_IF ||
+         opcode == TGSI_OPCODE_IFC ||
+         opcode == TGSI_OPCODE_BGNLOOP ||
+         opcode == TGSI_OPCODE_SWITCH)
+        return FALSE;
+   }
+
+   return TRUE;
+}
+
+
 
 /**
  * Kill fragment if any of the src register values are negative.
@@ -917,7 +1147,8 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
 static void
 emit_kil(
    struct lp_build_tgsi_soa_context *bld,
-   const struct tgsi_full_instruction *inst )
+   const struct tgsi_full_instruction *inst,
+   int pc)
 {
    const struct tgsi_full_src_register *reg = &inst->Src[0];
    LLVMValueRef terms[NUM_CHANNELS];
@@ -956,8 +1187,12 @@ emit_kil(
       }
    }
 
-   if(mask)
+   if(mask) {
       lp_build_mask_update(bld->mask, mask);
+
+      if (!near_end_of_shader(bld, pc))
+        lp_build_mask_check(bld->mask);
+   }
 }
 
 
@@ -969,7 +1204,8 @@ emit_kil(
  */
 static void
 emit_kilp(struct lp_build_tgsi_soa_context *bld,
-          const struct tgsi_full_instruction *inst)
+          const struct tgsi_full_instruction *inst,
+         int pc)
 {
    LLVMValueRef mask;
 
@@ -980,57 +1216,104 @@ emit_kilp(struct lp_build_tgsi_soa_context *bld,
       mask = LLVMBuildNot(bld->base.builder, bld->exec_mask.exec_mask, "kilp");
    }
    else {
-      mask = bld->base.zero;
+      LLVMValueRef zero = LLVMConstNull(bld->base.int_vec_type);
+      mask = zero;
    }
 
    lp_build_mask_update(bld->mask, mask);
+
+   if (!near_end_of_shader(bld, pc))
+      lp_build_mask_check(bld->mask);
+}
+
+
+/**
+ * Emit code which will dump the value of all the temporary registers
+ * to stdout.
+ */
+static void
+emit_dump_temps(struct lp_build_tgsi_soa_context *bld)
+{
+   LLVMBuilderRef builder = bld->base.builder;
+   LLVMValueRef temp_ptr;
+   LLVMValueRef i0 = lp_build_const_int32(0);
+   LLVMValueRef i1 = lp_build_const_int32(1);
+   LLVMValueRef i2 = lp_build_const_int32(2);
+   LLVMValueRef i3 = lp_build_const_int32(3);
+   int index;
+   int n = bld->info->file_max[TGSI_FILE_TEMPORARY];
+
+   for (index = 0; index < n; index++) {
+      LLVMValueRef idx = lp_build_const_int32(index);
+      LLVMValueRef v[4][4], res;
+      int chan;
+
+      lp_build_printf(builder, "TEMP[%d]:\n", idx);
+
+      for (chan = 0; chan < 4; chan++) {
+         temp_ptr = get_temp_ptr(bld, index, chan);
+         res = LLVMBuildLoad(bld->base.builder, temp_ptr, "");
+         v[chan][0] = LLVMBuildExtractElement(builder, res, i0, "");
+         v[chan][1] = LLVMBuildExtractElement(builder, res, i1, "");
+         v[chan][2] = LLVMBuildExtractElement(builder, res, i2, "");
+         v[chan][3] = LLVMBuildExtractElement(builder, res, i3, "");
+      }
+
+      lp_build_printf(builder, "  X: %f %f %f %f\n",
+                      v[0][0], v[0][1], v[0][2], v[0][3]);
+      lp_build_printf(builder, "  Y: %f %f %f %f\n",
+                      v[1][0], v[1][1], v[1][2], v[1][3]);
+      lp_build_printf(builder, "  Z: %f %f %f %f\n",
+                      v[2][0], v[2][1], v[2][2], v[2][3]);
+      lp_build_printf(builder, "  W: %f %f %f %f\n",
+                      v[3][0], v[3][1], v[3][2], v[3][3]);
+   }
 }
 
+
+
 static void
 emit_declaration(
    struct lp_build_tgsi_soa_context *bld,
    const struct tgsi_full_declaration *decl)
 {
-   LLVMTypeRef vec_type = lp_build_vec_type(bld->base.type);
-
-   unsigned first = decl->Range.First;
-   unsigned last = decl->Range.Last;
+   LLVMTypeRef vec_type = bld->base.vec_type;
+   const unsigned first = decl->Range.First;
+   const unsigned last = decl->Range.Last;
    unsigned idx, i;
 
    for (idx = first; idx <= last; ++idx) {
+      assert(last <= bld->info->file_max[decl->Declaration.File]);
       switch (decl->Declaration.File) {
       case TGSI_FILE_TEMPORARY:
          assert(idx < LP_MAX_TGSI_TEMPS);
-         if (bld->has_indirect_addressing) {
-            LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
-                                                   last*4 + 4, 0);
-            bld->temps_array = lp_build_array_alloca(bld->base.builder,
-                                                     vec_type, array_size, "");
-         } else {
+         if (!(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY))) {
             for (i = 0; i < NUM_CHANNELS; i++)
                bld->temps[idx][i] = lp_build_alloca(bld->base.builder,
-                                                    vec_type, "");
+                                                    vec_type, "temp");
          }
          break;
 
       case TGSI_FILE_OUTPUT:
-         for (i = 0; i < NUM_CHANNELS; i++)
-            bld->outputs[idx][i] = lp_build_alloca(bld->base.builder,
-                                                   vec_type, "");
+         if (!(bld->indirect_files & (1 << TGSI_FILE_OUTPUT))) {
+            for (i = 0; i < NUM_CHANNELS; i++)
+               bld->outputs[idx][i] = lp_build_alloca(bld->base.builder,
+                                                      vec_type, "output");
+         }
          break;
 
       case TGSI_FILE_ADDRESS:
          assert(idx < LP_MAX_TGSI_ADDRS);
          for (i = 0; i < NUM_CHANNELS; i++)
             bld->addr[idx][i] = lp_build_alloca(bld->base.builder,
-                                                vec_type, "");
+                                                vec_type, "addr");
          break;
 
       case TGSI_FILE_PREDICATE:
          assert(idx < LP_MAX_TGSI_PREDS);
          for (i = 0; i < NUM_CHANNELS; i++)
             bld->preds[idx][i] = lp_build_alloca(bld->base.builder,
-                                                 vec_type, "");
+                                                 vec_type, "predicate");
          break;
 
       default:
@@ -1531,12 +1814,12 @@ emit_instruction(
 
    case TGSI_OPCODE_KILP:
       /* predicated kill */
-      emit_kilp( bld, inst );
+      emit_kilp( bld, inst, (*pc)-1 );
       break;
 
    case TGSI_OPCODE_KIL:
       /* conditional kill */
-      emit_kil( bld, inst );
+      emit_kil( bld, inst, (*pc)-1 );
       break;
 
    case TGSI_OPCODE_PK2H:
@@ -1616,11 +1899,11 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TEX:
-      emit_tex( bld, inst, TEX_MODIFIER_NONE, dst0 );
+      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_NONE, dst0 );
       break;
 
    case TGSI_OPCODE_TXD:
-      emit_tex( bld, inst, TEX_MODIFIER_EXPLICIT_DERIV, dst0 );
+      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV, dst0 );
       break;
 
    case TGSI_OPCODE_UP2H:
@@ -1685,6 +1968,10 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_END:
+      if (0) {
+         /* for debugging */
+         emit_dump_temps(bld);
+      }
       *pc = -1;
       break;
 
@@ -1724,7 +2011,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TXB:
-      emit_tex( bld, inst, TEX_MODIFIER_LOD_BIAS, dst0 );
+      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS, dst0 );
       break;
 
    case TGSI_OPCODE_NRM:
@@ -1829,11 +2116,11 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TXL:
-      emit_tex( bld, inst, TEX_MODIFIER_EXPLICIT_LOD, dst0 );
+      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD, dst0 );
       break;
 
    case TGSI_OPCODE_TXP:
-      emit_tex( bld, inst, TEX_MODIFIER_PROJECTED, dst0 );
+      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_PROJECTED, dst0 );
       break;
 
    case TGSI_OPCODE_BRK:
@@ -2015,18 +2302,27 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
    unsigned i;
    int pc = 0;
 
+   struct lp_type res_type;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   memset(&res_type, 0, sizeof res_type);
+   res_type.width = type.width;
+   res_type.length = type.length;
+   res_type.sign = 1;
+
    /* Setup build context */
    memset(&bld, 0, sizeof bld);
    lp_build_context_init(&bld.base, builder, type);
-   lp_build_context_init(&bld.int_bld, builder, lp_int_type(type));
+   lp_build_context_init(&bld.uint_bld, builder, lp_uint_type(type));
+   lp_build_context_init(&bld.elem_bld, builder, lp_elem_type(type));
    bld.mask = mask;
    bld.pos = pos;
    bld.inputs = inputs;
    bld.outputs = outputs;
    bld.consts_ptr = consts_ptr;
    bld.sampler = sampler;
-   bld.has_indirect_addressing = info->opcode_count[TGSI_OPCODE_ARR] > 0 ||
-                                 info->opcode_count[TGSI_OPCODE_ARL] > 0;
+   bld.info = info;
+   bld.indirect_files = info->indirect_files;
    bld.instructions = (struct tgsi_full_instruction *)
                       MALLOC( LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction) );
    bld.max_instructions = LP_MAX_INSTRUCTIONS;
@@ -2037,6 +2333,48 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
 
    lp_exec_mask_init(&bld.exec_mask, &bld.base);
 
+   if (bld.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
+      LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
+                                             info->file_max[TGSI_FILE_TEMPORARY]*4 + 4, 0);
+      bld.temps_array = lp_build_array_alloca(bld.base.builder,
+                                              bld.base.vec_type, array_size,
+                                              "temp_array");
+   }
+
+   if (bld.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
+      LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
+                                             info->file_max[TGSI_FILE_OUTPUT]*4 + 4, 0);
+      bld.outputs_array = lp_build_array_alloca(bld.base.builder,
+                                                bld.base.vec_type, array_size,
+                                                "output_array");
+   }
+
+   /* If we have indirect addressing in inputs we need to copy them into
+    * our alloca array to be able to iterate over them */
+   if (bld.indirect_files & (1 << TGSI_FILE_INPUT)) {
+      unsigned index, chan;
+      LLVMTypeRef vec_type = bld.base.vec_type;
+      LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
+                                             info->file_max[TGSI_FILE_INPUT]*4 + 4, 0);
+      bld.inputs_array = lp_build_array_alloca(bld.base.builder,
+                                               vec_type, array_size,
+                                               "input_array");
+
+      assert(info->num_inputs <= info->file_max[TGSI_FILE_INPUT] + 1);
+
+      for (index = 0; index < info->num_inputs; ++index) {
+         for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+            LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan);
+            LLVMValueRef input_ptr =
+               LLVMBuildGEP(bld.base.builder, bld.inputs_array,
+                            &lindex, 1, "");
+            LLVMValueRef value = bld.inputs[index][chan];
+            if (value)
+               LLVMBuildStore(bld.base.builder, value, input_ptr);
+         }
+      }
+   }
+
    tgsi_parse_init( &parse, tokens );
 
    while( !tgsi_parse_end_of_tokens( &parse ) ) {
@@ -2052,11 +2390,16 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
          {
             /* save expanded instruction */
             if (num_instructions == bld.max_instructions) {
-               bld.instructions = REALLOC(bld.instructions,
-                                          bld.max_instructions
-                                          * sizeof(struct tgsi_full_instruction),
-                                          (bld.max_instructions + LP_MAX_INSTRUCTIONS)
-                                          * sizeof(struct tgsi_full_instruction));
+               struct tgsi_full_instruction *instructions;
+               instructions = REALLOC(bld.instructions,
+                                      bld.max_instructions
+                                      * sizeof(struct tgsi_full_instruction),
+                                      (bld.max_instructions + LP_MAX_INSTRUCTIONS)
+                                      * sizeof(struct tgsi_full_instruction));
+               if (!instructions) {
+                  break;
+               }
+               bld.instructions = instructions;
                bld.max_instructions += LP_MAX_INSTRUCTIONS;
             }
 
@@ -2101,6 +2444,18 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
                        opcode_info->mnemonic);
    }
 
+   /* If we have indirect addressing in outputs we need to copy our alloca array
+    * to the outputs slots specified by the called */
+   if (bld.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
+      unsigned index, chan;
+      assert(info->num_outputs <= info->file_max[TGSI_FILE_OUTPUT] + 1);
+      for (index = 0; index < info->num_outputs; ++index) {
+         for (chan = 0; chan < NUM_CHANNELS; ++chan) {
+            bld.outputs[index][chan] = get_output_ptr(&bld, index, chan);
+         }
+      }
+   }
+
    if (0) {
       LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
       LLVMValueRef function = LLVMGetBasicBlockParent(block);