gallivm: fix bug in nested conditionals
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_tgsi_soa.c
index 6e1e41fdeed468aaab40d3adafa6cc4b13b00947..0e07f7f3f38e2666cb57592c4be6539f0b37b863 100644 (file)
 
 #include "pipe/p_config.h"
 #include "pipe/p_shader_tokens.h"
-#include "cso_cache/cso_hash.h"
 #include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
-#include "util/u_string.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_scan.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_gather.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_quad.h"
 #include "lp_bld_tgsi.h"
 #include "lp_bld_limits.h"
 #include "lp_bld_debug.h"
 #define CHAN_Y 1
 #define CHAN_Z 2
 #define CHAN_W 3
+#define NUM_CHANNELS 4
 
-#define QUAD_TOP_LEFT     0
-#define QUAD_TOP_RIGHT    1
-#define QUAD_BOTTOM_LEFT  2
-#define QUAD_BOTTOM_RIGHT 3
+#define LP_MAX_INSTRUCTIONS 256
 
 
 struct lp_exec_mask {
@@ -99,7 +96,6 @@ struct lp_exec_mask {
    LLVMValueRef cont_mask;
    LLVMValueRef break_mask;
    LLVMValueRef break_var;
-   LLVMValueRef ret_mask;
    struct {
       LLVMBasicBlockRef loop_block;
       LLVMValueRef cont_mask;
@@ -108,6 +104,13 @@ struct lp_exec_mask {
    } loop_stack[LP_MAX_TGSI_NESTING];
    int loop_stack_size;
 
+   LLVMValueRef ret_mask;
+   struct {
+      int pc;
+      LLVMValueRef ret_mask;
+   } call_stack[LP_MAX_TGSI_NESTING];
+   int call_stack_size;
+
    LLVMValueRef exec_mask;
 };
 
@@ -120,226 +123,42 @@ struct lp_build_tgsi_soa_context
 
    LLVMValueRef consts_ptr;
    const LLVMValueRef *pos;
+   const LLVMValueRef (*inputs)[NUM_CHANNELS];
+   LLVMValueRef (*outputs)[NUM_CHANNELS];
 
    const struct lp_build_sampler_soa *sampler;
 
-   const LLVMValueRef (*inputs)[NUM_CHANNELS];
-   LLVMValueRef (*outputs)[NUM_CHANNELS];
    LLVMValueRef immediates[LP_MAX_TGSI_IMMEDIATES][NUM_CHANNELS];
    LLVMValueRef temps[LP_MAX_TGSI_TEMPS][NUM_CHANNELS];
    LLVMValueRef addr[LP_MAX_TGSI_ADDRS][NUM_CHANNELS];
    LLVMValueRef preds[LP_MAX_TGSI_PREDS][NUM_CHANNELS];
 
-   /* we allocate an array of allocas if we have indirect
-    * addressing and then the temps above is unused */
+   /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is
+    * set in the indirect_files field.
+    * The temps[] array above is unused then.
+    */
    LLVMValueRef temps_array;
 
-   LLVMValueRef inputs_array;
-   LLVMValueRef outputs_array;
-   LLVMValueRef immediates_array;
-   LLVMValueRef addrs_array;
-   LLVMValueRef preds_array;
-
-   boolean has_indirect_addressing;
-   boolean has_function_calls;
+   /** bitmask indicating which register files are accessed indirectly */
+   unsigned indirect_files;
 
    struct lp_build_mask_context *mask;
    struct lp_exec_mask exec_mask;
 
-   struct cso_hash *func_hash;
-   unsigned instno;
-   LLVMBasicBlockRef main_block;
-
-   struct {
-      struct tgsi_declaration_range inputs;
-      struct tgsi_declaration_range outputs;
-      struct tgsi_declaration_range temps;
-      struct tgsi_declaration_range addrs;
-      struct tgsi_declaration_range preds;
-   } full_range;
+   struct tgsi_full_instruction *instructions;
+   uint max_instructions;
 };
 
-static const unsigned char
-swizzle_left[4] = {
-   QUAD_TOP_LEFT,     QUAD_TOP_LEFT,
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_LEFT
-};
-
-static const unsigned char
-swizzle_right[4] = {
-   QUAD_TOP_RIGHT,    QUAD_TOP_RIGHT,
-   QUAD_BOTTOM_RIGHT, QUAD_BOTTOM_RIGHT
-};
-
-static const unsigned char
-swizzle_top[4] = {
-   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT,
-   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT
-};
-
-static const unsigned char
-swizzle_bottom[4] = {
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT,
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT
-};
-
-
-static LLVMValueRef
-get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
-             unsigned index,
-             unsigned chan,
-             boolean is_indirect,
-             LLVMValueRef addr)
-{
-   assert(chan < 4);
-   if (!bld->has_indirect_addressing &&
-       !bld->has_function_calls) {
-      return bld->temps[index][chan];
-   } else {
-      LLVMValueRef lindex =
-         LLVMConstInt(LLVMInt32Type(), index * 4 + chan, 0);
-      if (is_indirect)
-         lindex = lp_build_add(&bld->base, lindex, addr);
-      return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, "temp_ptr");
-   }
-}
-
-static LLVMValueRef
-get_input_ptr(struct lp_build_tgsi_soa_context *bld,
-             unsigned index,
-             unsigned swizzle,
-             boolean is_indirect,
-             LLVMValueRef addr)
-{
-   LLVMValueRef lindex =
-      LLVMConstInt(LLVMInt32Type(), index*4 + swizzle, 0);
-   assert(bld->has_function_calls);
-   if (is_indirect)
-      lindex = lp_build_add(&bld->base, lindex, addr);
-   return LLVMBuildGEP(bld->base.builder, bld->inputs_array, &lindex, 1, "input_ptr");
-}
-
-static LLVMValueRef
-get_output_ptr(struct lp_build_tgsi_soa_context *bld,
-             unsigned index,
-             unsigned swizzle,
-             boolean is_indirect,
-             LLVMValueRef addr)
-{
-   if (!bld->has_function_calls) {
-      return bld->outputs[index][swizzle];
-   } else {
-      LLVMValueRef lindex =
-         LLVMConstInt(LLVMInt32Type(), index*4 + swizzle, 0);
-      if (is_indirect)
-         lindex = lp_build_add(&bld->base, lindex, addr);
-      return LLVMBuildGEP(bld->base.builder, bld->outputs_array, &lindex, 1, "output_ptr");
-   }
-}
-
-static LLVMValueRef
-get_immediates_ptr(struct lp_build_tgsi_soa_context *bld,
-                   unsigned index,
-                   unsigned swizzle,
-                   boolean is_indirect,
-                   LLVMValueRef addr)
-{
-   LLVMValueRef lindex =
-      LLVMConstInt(LLVMInt32Type(), index*4 + swizzle, 0);
-   assert(bld->has_function_calls);
-   if (is_indirect)
-      lindex = lp_build_add(&bld->base, lindex, addr);
-   return LLVMBuildGEP(bld->base.builder, bld->immediates_array, &lindex, 1, "immediates_ptr");
-}
-
-static LLVMValueRef
-get_addr_ptr(struct lp_build_tgsi_soa_context *bld,
-             unsigned index,
-             unsigned swizzle,
-             boolean is_indirect,
-             LLVMValueRef addr)
-{
-   if (!bld->has_function_calls) {
-      return bld->addr[index][swizzle];
-   } else {
-      LLVMValueRef lindex =
-         LLVMConstInt(LLVMInt32Type(), index*4 + swizzle, 0);
-      if (is_indirect)
-         lindex = lp_build_add(&bld->base, lindex, addr);
-      return LLVMBuildGEP(bld->base.builder, bld->addrs_array, &lindex, 1, "addrs_ptr");
-   }
-}
-
-static LLVMValueRef
-get_preds_ptr(struct lp_build_tgsi_soa_context *bld,
-              unsigned index,
-              unsigned swizzle,
-              boolean is_indirect,
-              LLVMValueRef addr)
-{
-   if (!bld->has_function_calls) {
-      return bld->preds[index][swizzle];
-   } else {
-      LLVMValueRef lindex =
-         LLVMConstInt(LLVMInt32Type(), index*4 + swizzle, 0);
-      if (is_indirect)
-         lindex = lp_build_add(&bld->base, lindex, addr);
-      return LLVMBuildGEP(bld->base.builder, bld->preds_array, &lindex, 1, "preds_ptr");
-   }
-}
-
-static LLVMValueRef lp_get_function(struct lp_build_tgsi_soa_context *bld,
-                                    int label)
-{
-   struct cso_hash *hash = bld->func_hash;
-   struct cso_hash_iter iter = cso_hash_find(hash, label);
-   LLVMValueRef func;
-   LLVMModuleRef module = LLVMGetGlobalParent(
-      LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld->base.builder)));
-
-   if (cso_hash_iter_is_null(iter)) {
-      LLVMTypeRef func_type;
-      LLVMTypeRef arg_types[7];
-      LLVMTypeRef vec_type = lp_build_vec_type(bld->base.type);
-      int i;
-      char func_name[32];
-
-      util_snprintf(func_name, 31, "func%d", label);
-
-      arg_types[0] = LLVMPointerType(vec_type, 0);  /* inputs */
-      arg_types[1] = LLVMPointerType(vec_type, 0);  /* outpus */
-      arg_types[2] = LLVMTypeOf(bld->consts_ptr);   /* consts */
-      arg_types[3] = LLVMPointerType(vec_type, 0);  /* temps */
-      arg_types[4] = LLVMPointerType(vec_type, 0);  /* addrs */
-      arg_types[5] = LLVMPointerType(vec_type, 0);  /* preds */
-      arg_types[6] = LLVMPointerType(vec_type, 0);  /* immediates */
-
-      func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
-
-      func = LLVMAddFunction(module, func_name, func_type);
-      LLVMSetFunctionCallConv(func, LLVMCCallConv);
-      for(i = 0; i < Elements(arg_types); ++i)
-         if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
-            LLVMAddAttribute(LLVMGetParam(func, i), LLVMNoAliasAttribute);
-
-      cso_hash_insert(hash, label, func);
-   } else {
-      func = (LLVMValueRef)cso_hash_iter_data(iter);
-   }
-
-   return func;
-}
-
 static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
 {
    mask->bld = bld;
    mask->has_mask = FALSE;
    mask->cond_stack_size = 0;
    mask->loop_stack_size = 0;
-   mask->ret_mask = 0;
+   mask->call_stack_size = 0;
 
    mask->int_vec_type = lp_build_int_vec_type(mask->bld->type);
-   mask->break_mask = mask->cont_mask = mask->cond_mask =
+   mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask = mask->cond_mask =
          LLVMConstAllOnes(mask->int_vec_type);
 }
 
@@ -360,17 +179,16 @@ static void lp_exec_mask_update(struct lp_exec_mask *mask)
    } else
       mask->exec_mask = mask->cond_mask;
 
-   if (mask->ret_mask) {
+   if (mask->call_stack_size) {
       mask->exec_mask = LLVMBuildAnd(mask->bld->builder,
                                      mask->exec_mask,
                                      mask->ret_mask,
-                                     "retmask");
+                                     "callmask");
    }
 
-
    mask->has_mask = (mask->cond_stack_size > 0 ||
                      mask->loop_stack_size > 0 ||
-                     mask->ret_mask);
+                     mask->call_stack_size > 0);
 }
 
 static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
@@ -382,8 +200,10 @@ static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
    }
    mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask;
    assert(LLVMTypeOf(val) == mask->int_vec_type);
-   mask->cond_mask = val;
-
+   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
+                                  mask->cond_mask,
+                                  val,
+                                  "");
    lp_exec_mask_update(mask);
 }
 
@@ -413,13 +233,6 @@ static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
    lp_exec_mask_update(mask);
 }
 
-
-static void lp_exec_bgnsub(struct lp_exec_mask *mask)
-{
-   mask->exec_mask = LLVMConstAllOnes(mask->int_vec_type);
-   mask->ret_mask = LLVMConstAllOnes(mask->int_vec_type);
-}
-
 static void lp_exec_bgnloop(struct lp_exec_mask *mask)
 {
    if (mask->loop_stack_size == 0) {
@@ -462,21 +275,6 @@ static void lp_exec_break(struct lp_exec_mask *mask)
    lp_exec_mask_update(mask);
 }
 
-
-static void lp_exec_ret(struct lp_exec_mask *mask)
-{
-   LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
-                                         mask->exec_mask,
-                                         "ret");
-
-   mask->ret_mask = LLVMBuildAnd(mask->bld->builder,
-                                 mask->ret_mask,
-                                 exec_mask, "ret_full");
-
-   lp_exec_mask_update(mask);
-}
-
-
 static void lp_exec_continue(struct lp_exec_mask *mask)
 {
    LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
@@ -569,156 +367,136 @@ static void lp_exec_mask_store(struct lp_exec_mask *mask,
       LLVMBuildStore(mask->bld->builder, val, dst);
 }
 
-static LLVMValueRef
-emit_vec_alloca_array(struct lp_build_tgsi_soa_context *bld,
-                      LLVMTypeRef vec_type,
-                      int size)
+static void lp_exec_mask_call(struct lp_exec_mask *mask,
+                              int func,
+                              int *pc)
 {
-   LLVMValueRef val = LLVMConstInt(LLVMInt32Type(),
-                                   size * 4 + 4, 0);
-   return lp_build_array_alloca(bld->base.builder,
-                                vec_type, val, "");
+   assert(mask->call_stack_size < LP_MAX_TGSI_NESTING);
+   mask->call_stack[mask->call_stack_size].pc = *pc;
+   mask->call_stack[mask->call_stack_size].ret_mask = mask->ret_mask;
+   mask->call_stack_size++;
+   *pc = func;
 }
 
-static void
-emit_preamble(struct lp_build_tgsi_soa_context *bld, uint num_immediates)
+static void lp_exec_mask_ret(struct lp_exec_mask *mask, int *pc)
 {
-   LLVMTypeRef vec_type = lp_build_vec_type(bld->base.type);
+   LLVMValueRef exec_mask;
 
-   /* temps */
-   if (bld->has_indirect_addressing ||
-       bld->has_function_calls) {
-      int size = bld->full_range.temps.Last + 1;
-      bld->temps_array = emit_vec_alloca_array(bld, vec_type, size);
+   if (mask->call_stack_size == 0) {
+      /* returning from main() */
+      *pc = -1;
+      return;
    }
-   if (bld->has_function_calls) {
-      int i;
-      int size = bld->full_range.outputs.Last + 1;
-      bld->outputs_array  = emit_vec_alloca_array(bld, vec_type, size);
-
-      /* we need to insert the created immediates into our array */
-      size = num_immediates;
-      if (size > 0)
-         bld->immediates_array  = emit_vec_alloca_array(bld, vec_type, size);
-      for (i = 0; i < size; ++i) {
-         int j;
-         for (j = 0; j < NUM_CHANNELS; ++j) {
-            LLVMValueRef ptr = get_immediates_ptr(bld,
-                                                  i, j,
-                                                  FALSE, 0);
-            LLVMBuildStore(bld->base.builder,
-                           bld->immediates[i][j],
-                           ptr);
-         }
-      }
+   exec_mask = LLVMBuildNot(mask->bld->builder,
+                            mask->exec_mask,
+                            "ret");
 
-      size = bld->full_range.addrs.Last + 1;
-      bld->addrs_array  = emit_vec_alloca_array(bld, vec_type, size);
-
-      size = bld->full_range.preds.Last + 1;
-      bld->preds_array  = emit_vec_alloca_array(bld, vec_type, size);
-
-      /*inputs also need to be copied*/
-      size = bld->full_range.inputs.Last + 1;
-      bld->inputs_array  = emit_vec_alloca_array(bld, vec_type, size);
-      for (i = bld->full_range.inputs.First; i < size; ++i) {
-         int j;
-         for (j = 0; j < NUM_CHANNELS; ++j) {
-            LLVMValueRef ptr = get_input_ptr(bld,
-                                             i, j,
-                                             FALSE, 0);
-            LLVMBuildStore(bld->base.builder,
-                           bld->inputs[i][j],
-                           ptr);
-         }
-      }
-   }
+   mask->ret_mask = LLVMBuildAnd(mask->bld->builder,
+                                 mask->ret_mask,
+                                 exec_mask, "ret_full");
+
+   lp_exec_mask_update(mask);
 }
 
-static void
-emit_end(struct lp_build_tgsi_soa_context *bld)
+static void lp_exec_mask_bgnsub(struct lp_exec_mask *mask)
 {
-   int i, j;
-
-   bld->main_block = LLVMGetInsertBlock(bld->base.builder);
-
-   /* if we had function calls we want to propagate the
-    * outputs from the array to the values */
-   if (bld->has_function_calls) {
-      int size = bld->full_range.outputs.Last + 1;
-      for (i = bld->full_range.outputs.First; i < size; ++i) {
-         for (j = 0; j < NUM_CHANNELS; ++j) {
-            LLVMValueRef ptr = get_output_ptr(bld, i, j,
-                                              FALSE, 0);
-            bld->outputs[i][j] = ptr;
-         }
-      }
-   }
 }
 
-static void
-emit_bgnsub(struct lp_build_tgsi_soa_context *bld)
+static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc)
 {
-   LLVMValueRef func = lp_get_function(bld, bld->instno);
-   LLVMBasicBlockRef block;
-   LLVMValueRef inputs_ptr, outputs_ptr,
-      consts_ptr, temps_ptr, addrs_ptr, preds_ptr, imms_ptr;
-
-   inputs_ptr  = LLVMGetParam(func, 0);
-   outputs_ptr  = LLVMGetParam(func, 1);
-   consts_ptr  = LLVMGetParam(func, 2);
-   temps_ptr  = LLVMGetParam(func, 3);
-   addrs_ptr  = LLVMGetParam(func, 4);
-   preds_ptr  = LLVMGetParam(func, 5);
-   imms_ptr  = LLVMGetParam(func, 6);
-
-   lp_build_name(inputs_ptr, "inputs");
-   lp_build_name(outputs_ptr, "outputs");
-   lp_build_name(consts_ptr, "consts");
-   lp_build_name(temps_ptr, "temps");
-   lp_build_name(addrs_ptr, "addrs");
-   lp_build_name(preds_ptr, "preds");
-   lp_build_name(imms_ptr, "immediates");
-
-   bld->inputs_array = inputs_ptr;
-   bld->outputs_array = outputs_ptr;
-   bld->consts_ptr = consts_ptr;
-   bld->temps_array = temps_ptr;
-   bld->addrs_array = addrs_ptr;
-   bld->preds_array = preds_ptr;
-   bld->immediates_array = imms_ptr;
-
-   block = LLVMAppendBasicBlock(func, "entry");
-   LLVMPositionBuilderAtEnd(bld->base.builder, block);
-
-   lp_exec_bgnsub(&bld->exec_mask);
+   assert(mask->call_stack_size);
+   mask->call_stack_size--;
+   *pc = mask->call_stack[mask->call_stack_size].pc;
+   mask->ret_mask = mask->call_stack[mask->call_stack_size].ret_mask;
+   lp_exec_mask_update(mask);
 }
 
-static void
-emit_endsub(struct lp_build_tgsi_soa_context *bld)
+
+/**
+ * Return pointer to a temporary register channel (src or dest).
+ * Note that indirect addressing cannot be handled here.
+ * \param index  which temporary register
+ * \param chan  which channel of the temp register.
+ */
+static LLVMValueRef
+get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
+             unsigned index,
+             unsigned chan)
 {
-   LLVMBuildRetVoid(bld->base.builder);
+   assert(chan < 4);
+   if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
+      LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan);
+      return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, "");
+   }
+   else {
+      return bld->temps[index][chan];
+   }
 }
 
+
+/**
+ * Gather vector.
+ * XXX the lp_build_gather() function should be capable of doing this
+ * with a little work.
+ */
 static LLVMValueRef
-emit_ddx(struct lp_build_tgsi_soa_context *bld,
-         LLVMValueRef src)
+build_gather(struct lp_build_tgsi_soa_context *bld,
+             LLVMValueRef base_ptr,
+             LLVMValueRef indexes)
 {
-   LLVMValueRef src_left  = lp_build_swizzle1_aos(&bld->base, src, swizzle_left);
-   LLVMValueRef src_right = lp_build_swizzle1_aos(&bld->base, src, swizzle_right);
-   return lp_build_sub(&bld->base, src_right, src_left);
+   LLVMValueRef res = bld->base.undef;
+   unsigned i;
+
+   /*
+    * Loop over elements of 'indexes', load scalar value, insert it into 'res'.
+    */
+   for (i = 0; i < bld->base.type.length; i++) {
+      LLVMValueRef ii = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef index = LLVMBuildExtractElement(bld->base.builder,
+                                                   indexes, ii, "");
+      LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, base_ptr,
+                                             &index, 1, "");
+      LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
+
+      res = LLVMBuildInsertElement(bld->base.builder, res, scalar, ii, "");
+   }
+
+   return res;
 }
 
 
+/**
+ * Read the current value of the ADDR register, convert the floats to
+ * ints, multiply by four and return the vector of offsets.
+ * The offsets will be used to index into the constant buffer or
+ * temporary register file.
+ *
+ * \param indirect_reg  the Indirect sub-register of the operand; only its
+ *                      Index and SwizzleX fields are consulted.
+ * \return integer vector holding, per element, 4 * (int) ADDR[Index].<swz>
+ */
 static LLVMValueRef
-emit_ddy(struct lp_build_tgsi_soa_context *bld,
-         LLVMValueRef src)
+get_indirect_offsets(struct lp_build_tgsi_soa_context *bld,
+                     const struct tgsi_src_register *indirect_reg)
 {
-   LLVMValueRef src_top    = lp_build_swizzle1_aos(&bld->base, src, swizzle_top);
-   LLVMValueRef src_bottom = lp_build_swizzle1_aos(&bld->base, src, swizzle_bottom);
-   return lp_build_sub(&bld->base, src_top, src_bottom);
+   /*
+    * Always use the X component of the address register operand.
+    * SwizzleX already names the ADDR channel to read; it must not be fed
+    * back into tgsi_util_get_src_register_swizzle() as a channel index,
+    * which would apply the swizzle a second time.
+    */
+   const unsigned swizzle = indirect_reg->SwizzleX;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
+   LLVMValueRef vec4 = lp_build_const_int_vec(bld->int_bld.type, 4);
+   LLVMValueRef addr_vec;
+
+   addr_vec = LLVMBuildLoad(bld->base.builder,
+                            bld->addr[indirect_reg->Index][swizzle],
+                            "load addr reg");
+
+   /* for indexing we want integers */
+   addr_vec = LLVMBuildFPToSI(bld->base.builder, addr_vec,
+                              int_vec_type, "");
+
+   /* addr_vec = addr_vec * 4 */
+   addr_vec = lp_build_mul(&bld->int_bld, addr_vec, vec4);
+
+   return addr_vec;
 }
 
+
 /**
  * Register fetch.
  */
@@ -726,14 +504,14 @@ static LLVMValueRef
 emit_fetch(
    struct lp_build_tgsi_soa_context *bld,
    const struct tgsi_full_instruction *inst,
-   unsigned index,
+   unsigned src_op,
    const unsigned chan_index )
 {
-   const struct tgsi_full_src_register *reg = &inst->Src[index];
+   const struct tgsi_full_src_register *reg = &inst->Src[src_op];
    const unsigned swizzle =
       tgsi_util_get_full_src_register_swizzle(reg, chan_index);
    LLVMValueRef res;
-   LLVMValueRef addr = NULL;
+   LLVMValueRef addr_vec = NULL;
 
    if (swizzle > 3) {
       assert(0 && "invalid swizzle in emit_fetch()");
@@ -741,32 +519,33 @@ emit_fetch(
    }
 
    if (reg->Register.Indirect) {
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
-      unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
-      addr = LLVMBuildLoad(bld->base.builder,
-                           get_addr_ptr(bld, reg->Indirect.Index, swizzle, FALSE, 0),
-                           "");
-      /* for indexing we want integers */
-      addr = LLVMBuildFPToSI(bld->base.builder, addr,
-                             int_vec_type, "");
-      addr = LLVMBuildExtractElement(bld->base.builder,
-                                     addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
-                                     "");
-      addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0));
+      assert(bld->indirect_files);
+      addr_vec = get_indirect_offsets(bld, &reg->Indirect);
    }
 
    switch (reg->Register.File) {
    case TGSI_FILE_CONSTANT:
-      {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(),
-                                           reg->Register.Index*4 + swizzle, 0);
+      if (reg->Register.Indirect) {
+         LLVMValueRef index_vec;  /* index into the const buffer */
+
+         assert(bld->indirect_files & (1 << TGSI_FILE_CONSTANT));
+
+         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
+         index_vec = lp_build_const_int_vec(bld->int_bld.type,
+                                            reg->Register.Index * 4 + swizzle);
+
+         /* index_vec = index_vec + addr_vec */
+         index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec);
+
+         /* Gather values from the constant buffer */
+         res = build_gather(bld, bld->consts_ptr, index_vec);
+      }
+      else {
+         LLVMValueRef index;  /* index into the const buffer */
          LLVMValueRef scalar, scalar_ptr;
 
-         if (reg->Register.Indirect) {
-            /*lp_build_printf(bld->base.builder,
-              "\taddr = %d\n", addr);*/
-            index = lp_build_add(&bld->base, index, addr);
-         }
+         index = lp_build_const_int32(reg->Register.Index*4 + swizzle);
+
          scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr,
                                    &index, 1, "");
          scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
@@ -776,37 +555,48 @@ emit_fetch(
       break;
 
    case TGSI_FILE_IMMEDIATE:
-      if (bld->has_function_calls) {
-         LLVMValueRef ptr = get_immediates_ptr(bld,
-                                               reg->Register.Index,
-                                               swizzle,
-                                               FALSE, 0);
-         res = LLVMBuildLoad(bld->base.builder, ptr, "");
-      } else
-         res = bld->immediates[reg->Register.Index][swizzle];
+      res = bld->immediates[reg->Register.Index][swizzle];
       assert(res);
       break;
 
    case TGSI_FILE_INPUT:
-      if (bld->has_function_calls) {
-         LLVMValueRef ptr = get_input_ptr(bld,
-                                          reg->Register.Index,
-                                          swizzle,
-                                          FALSE, 0);
-         res = LLVMBuildLoad(bld->base.builder, ptr, "");
-      } else
-         res = bld->inputs[reg->Register.Index][swizzle];
+      res = bld->inputs[reg->Register.Index][swizzle];
       assert(res);
       break;
 
    case TGSI_FILE_TEMPORARY:
-      {
-         LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
-                                              swizzle,
-                                              reg->Register.Indirect,
-                                              addr);
+      if (reg->Register.Indirect) {
+         LLVMValueRef vec_len =
+            lp_build_const_int_vec(bld->int_bld.type, bld->base.type.length);
+         LLVMValueRef index_vec;  /* index into the temporary register array */
+         LLVMValueRef temps_array;
+         LLVMTypeRef float4_ptr_type;
+
+         assert(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY));
+
+         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
+         index_vec = lp_build_const_int_vec(bld->int_bld.type,
+                                            reg->Register.Index * 4 + swizzle);
+
+         /* index_vec += addr_vec */
+         index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec);
+
+         /* index_vec *= vector_length */
+         index_vec = lp_build_mul(&bld->int_bld, index_vec, vec_len);
+
+         /* cast temps_array pointer to float* */
+         float4_ptr_type = LLVMPointerType(LLVMFloatType(), 0);
+         temps_array = LLVMBuildBitCast(bld->int_bld.builder, bld->temps_array,
+                                        float4_ptr_type, "");
+
+         /* Gather values from the temporary register array */
+         res = build_gather(bld, temps_array, index_vec);
+      }
+      else {
+         LLVMValueRef temp_ptr;
+         temp_ptr = get_temp_ptr(bld, reg->Register.Index, swizzle);
          res = LLVMBuildLoad(bld->base.builder, temp_ptr, "");
-         if(!res)
+         if (!res)
             return bld->base.undef;
       }
       break;
@@ -824,11 +614,9 @@ emit_fetch(
    case TGSI_UTIL_SIGN_SET:
       /* TODO: Use bitwise OR for floating point */
       res = lp_build_abs( &bld->base, res );
-      res = LLVMBuildNeg( bld->base.builder, res, "" );
-      break;
-
+      /* fall through */
    case TGSI_UTIL_SIGN_TOGGLE:
-      res = LLVMBuildNeg( bld->base.builder, res, "" );
+      res = lp_build_negate( &bld->base, res );
       break;
 
    case TGSI_UTIL_SIGN_KEEP:
@@ -862,10 +650,10 @@ emit_fetch_deriv(
    /* TODO: use interpolation coeffs for inputs */
 
    if(ddx)
-      *ddx = emit_ddx(bld, src);
+      *ddx = lp_build_ddx(&bld->base, src);
 
    if(ddy)
-      *ddy = emit_ddy(bld, src);
+      *ddy = lp_build_ddy(&bld->base, src);
 }
 
 
@@ -907,10 +695,8 @@ emit_fetch_predicate(
        * in the swizzles
        */
       if (!unswizzled[swizzle]) {
-         LLVMValueRef pred_ptr = get_preds_ptr(bld, index, swizzle,
-                                               FALSE, 0);
          value = LLVMBuildLoad(bld->base.builder,
-                               pred_ptr, "");
+                               bld->preds[index][swizzle], "");
 
          /*
           * Convert the value to an integer mask.
@@ -972,10 +758,14 @@ emit_store(
    }
 
    if (reg->Register.Indirect) {
+      /* XXX use get_indirect_offsets() here eventually */
       LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
       unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
+
+      assert(bld->indirect_files);
+
       addr = LLVMBuildLoad(bld->base.builder,
-                           get_addr_ptr(bld, reg->Indirect.Index, swizzle, FALSE, 0),
+                           bld->addr[reg->Indirect.Index][swizzle],
                            "");
       /* for indexing we want integers */
       addr = LLVMBuildFPToSI(bld->base.builder, addr,
@@ -983,35 +773,38 @@ emit_store(
       addr = LLVMBuildExtractElement(bld->base.builder,
                                      addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
                                      "");
-      addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0));
+      addr = LLVMBuildMul(bld->base.builder,
+                          addr, LLVMConstInt(LLVMInt32Type(), 4, 0),
+                          "");
    }
 
    switch( reg->Register.File ) {
    case TGSI_FILE_OUTPUT:
       lp_exec_mask_store(&bld->exec_mask, pred, value,
-                         get_output_ptr(bld, reg->Register.Index, chan_index,
-                                        FALSE, 0));
+                         bld->outputs[reg->Register.Index][chan_index]);
       break;
 
-   case TGSI_FILE_TEMPORARY: {
-      LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
-                                           chan_index,
-                                           reg->Register.Indirect,
-                                           addr);
-      lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr);
+   case TGSI_FILE_TEMPORARY:
+      if (reg->Register.Indirect) {
+         /* XXX not done yet */
+         debug_printf("WARNING: LLVM scatter store of temp regs"
+                      " not implemented\n");
+      }
+      else {
+         LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
+                                              chan_index);
+         lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr);
+      }
       break;
-   }
 
    case TGSI_FILE_ADDRESS:
       lp_exec_mask_store(&bld->exec_mask, pred, value,
-                         get_addr_ptr(bld, reg->Indirect.Index, chan_index,
-                                      FALSE, 0));
+                         bld->addr[reg->Indirect.Index][chan_index]);
       break;
 
    case TGSI_FILE_PREDICATE:
       lp_exec_mask_store(&bld->exec_mask, pred, value,
-                         get_preds_ptr(bld, index, chan_index,
-                                       FALSE, 0));
+                         bld->preds[reg->Register.Index][chan_index]);
       break;
 
    default:
@@ -1110,8 +903,8 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       unit = inst->Src[3].Register.Index;
    }  else {
       for (i = 0; i < num_coords; i++) {
-         ddx[i] = emit_ddx( bld, coords[i] );
-         ddy[i] = emit_ddy( bld, coords[i] );
+         ddx[i] = lp_build_ddx( &bld->base, coords[i] );
+         ddy[i] = lp_build_ddy( &bld->base, coords[i] );
       }
       unit = inst->Src[1].Register.Index;
    }
@@ -1205,14 +998,6 @@ emit_kilp(struct lp_build_tgsi_soa_context *bld,
    lp_build_mask_update(bld->mask, mask);
 }
 
-static void
-range_check(struct tgsi_declaration_range *range,
-            unsigned new_first, unsigned new_last)
-{
-   range->First = MIN2(range->First, new_first);
-   range->Last = MAX2(range->Last, new_last);
-}
-
 static void
 emit_declaration(
    struct lp_build_tgsi_soa_context *bld,
@@ -1228,10 +1013,12 @@ emit_declaration(
       switch (decl->Declaration.File) {
       case TGSI_FILE_TEMPORARY:
          assert(idx < LP_MAX_TGSI_TEMPS);
-         range_check(&bld->full_range.temps,
-                     first, last);
-         if (!bld->has_indirect_addressing &&
-             !bld->has_function_calls) {
+         if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
+            LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
+                                                   last*4 + 4, 0);
+            bld->temps_array = lp_build_array_alloca(bld->base.builder,
+                                                     vec_type, array_size, "");
+         } else {
             for (i = 0; i < NUM_CHANNELS; i++)
                bld->temps[idx][i] = lp_build_alloca(bld->base.builder,
                                                     vec_type, "");
@@ -1239,40 +1026,23 @@ emit_declaration(
          break;
 
       case TGSI_FILE_OUTPUT:
-         range_check(&bld->full_range.outputs,
-                     first, last);
-         if (!bld->has_function_calls) {
-            for (i = 0; i < NUM_CHANNELS; i++)
-               bld->outputs[idx][i] = lp_build_alloca(bld->base.builder,
-                                                      vec_type, "");
-         }
+         for (i = 0; i < NUM_CHANNELS; i++)
+            bld->outputs[idx][i] = lp_build_alloca(bld->base.builder,
+                                                   vec_type, "");
          break;
 
       case TGSI_FILE_ADDRESS:
          assert(idx < LP_MAX_TGSI_ADDRS);
-         range_check(&bld->full_range.addrs,
-                     first, last);
-         if (!bld->has_function_calls) {
-            for (i = 0; i < NUM_CHANNELS; i++)
-               bld->addr[idx][i] = lp_build_alloca(bld->base.builder,
-                                                   vec_type, "");
-         }
+         for (i = 0; i < NUM_CHANNELS; i++)
+            bld->addr[idx][i] = lp_build_alloca(bld->base.builder,
+                                                vec_type, "");
          break;
 
       case TGSI_FILE_PREDICATE:
          assert(idx < LP_MAX_TGSI_PREDS);
-         range_check(&bld->full_range.preds,
-                     first, last);
-         if (!bld->has_function_calls) {
-            for (i = 0; i < NUM_CHANNELS; i++)
-               bld->preds[idx][i] = lp_build_alloca(bld->base.builder,
-                                                    vec_type, "");
-         }
-         break;
-
-      case TGSI_FILE_INPUT:
-         range_check(&bld->full_range.inputs,
-                     first, last);
+         for (i = 0; i < NUM_CHANNELS; i++)
+            bld->preds[idx][i] = lp_build_alloca(bld->base.builder,
+                                                 vec_type, "");
          break;
 
       default:
@@ -1291,7 +1061,8 @@ static boolean
 emit_instruction(
    struct lp_build_tgsi_soa_context *bld,
    const struct tgsi_full_instruction *inst,
-   const struct tgsi_opcode_info *info)
+   const struct tgsi_opcode_info *info,
+   int *pc)
 {
    unsigned chan_index;
    LLVMValueRef src0, src1, src2;
@@ -1315,6 +1086,8 @@ emit_instruction(
     * redundant code.
     */
 
+   (*pc)++;
+
    assert(info->num_dst <= 1);
    if (info->num_dst) {
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
@@ -1912,26 +1685,19 @@ emit_instruction(
       return FALSE;
       break;
 
-   case TGSI_OPCODE_CAL: {
-      LLVMValueRef args[7];
-      LLVMValueRef func = lp_get_function(bld, inst->Label.Label);
-      args[0] = bld->inputs_array;
-      args[1] = bld->outputs_array;
-      args[2] = bld->consts_ptr;
-      args[3] = bld->temps_array;
-      args[4] = bld->addrs_array;
-      args[5] = bld->preds_array;
-      args[6] = bld->immediates_array;
-      LLVMBuildCall(bld->base.builder, func, args, Elements(args), "");
-   }
+   case TGSI_OPCODE_CAL:
+      lp_exec_mask_call(&bld->exec_mask,
+                        inst->Label.Label,
+                        pc);
+
       break;
 
    case TGSI_OPCODE_RET:
-      lp_exec_ret(&bld->exec_mask);
+      lp_exec_mask_ret(&bld->exec_mask, pc);
       break;
 
    case TGSI_OPCODE_END:
-      emit_end(bld);
+      *pc = -1;
       break;
 
    case TGSI_OPCODE_SSG:
@@ -2098,7 +1864,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_BGNSUB:
-      emit_bgnsub(bld);
+      lp_exec_mask_bgnsub(&bld->exec_mask);
       break;
 
    case TGSI_OPCODE_ELSE:
@@ -2114,7 +1880,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_ENDSUB:
-      emit_endsub(bld);
+      lp_exec_mask_endsub(&bld->exec_mask, pc);
       break;
 
    case TGSI_OPCODE_PUSHA:
@@ -2257,7 +2023,9 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
    struct lp_build_tgsi_soa_context bld;
    struct tgsi_parse_context parse;
    uint num_immediates = 0;
+   uint num_instructions = 0;
    unsigned i;
+   int pc = 0;
 
    /* Setup build context */
    memset(&bld, 0, sizeof bld);
@@ -2269,10 +2037,14 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
    bld.outputs = outputs;
    bld.consts_ptr = consts_ptr;
    bld.sampler = sampler;
-   bld.has_indirect_addressing = info->opcode_count[TGSI_OPCODE_ARR] > 0 ||
-                                 info->opcode_count[TGSI_OPCODE_ARL] > 0;
-   bld.has_function_calls = info->opcode_count[TGSI_OPCODE_CAL] > 0;
-   bld.func_hash = cso_hash_create();
+   bld.indirect_files = info->indirect_files;
+   bld.instructions = (struct tgsi_full_instruction *)
+                      MALLOC( LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction) );
+   bld.max_instructions = LP_MAX_INSTRUCTIONS;
+
+   if (!bld.instructions) {
+      return;
+   }
 
    lp_exec_mask_init(&bld.exec_mask, &bld.base);
 
@@ -2289,17 +2061,21 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
          {
-            unsigned opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
-            const struct tgsi_opcode_info *opcode_info = tgsi_get_opcode_info(opcode);
-            /* we finished processing declarations, emit anything that needs
-             * to go before the first instruction */
-            if (bld.instno == 0) {
-               emit_preamble(&bld, num_immediates);
+            /* save expanded instruction; it is emitted after parsing, by pc */
+            if (num_instructions == bld.max_instructions) {
+               bld.instructions = REALLOC(bld.instructions,
+                                          bld.max_instructions
+                                          * sizeof(struct tgsi_full_instruction),
+                                          (bld.max_instructions + LP_MAX_INSTRUCTIONS)
+                                          * sizeof(struct tgsi_full_instruction));
+               bld.max_instructions += LP_MAX_INSTRUCTIONS;
             }
-            if (!emit_instruction( &bld, &parse.FullToken.FullInstruction, opcode_info ))
-               _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
-                             opcode_info->mnemonic);
-            bld.instno++;
+
+            memcpy(bld.instructions + num_instructions,
+                   &parse.FullToken.FullInstruction,
+                   sizeof(bld.instructions[0]));
+
+            num_instructions++;
          }
 
          break;
@@ -2326,20 +2102,33 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
          assert( 0 );
       }
    }
-   /* we have to make sure we're at the end of the main block
-    * (which won't be the case if we had more than one TGSI function
-    * in the given shader) to let the calling function append
-    * whatever it needs at the end of the main function */
-   LLVMPositionBuilderAtEnd(bld.base.builder, bld.main_block);
+
+   while (pc != -1) {
+      struct tgsi_full_instruction *instr = bld.instructions + pc;
+      const struct tgsi_opcode_info *opcode_info =
+         tgsi_get_opcode_info(instr->Instruction.Opcode);
+      if (!emit_instruction( &bld, instr, opcode_info, &pc ))
+         _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
+                       opcode_info->mnemonic);
+   }
 
    if (0) {
       LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
       LLVMValueRef function = LLVMGetBasicBlockParent(block);
+      debug_printf("11111111111111111111111111111 \n");
       tgsi_dump(tokens, 0);
       lp_debug_dump_value(function);
+      debug_printf("2222222222222222222222222222 \n");
    }
    tgsi_parse_free( &parse );
 
-   cso_hash_delete(bld.func_hash);
+   if (0) {
+      LLVMModuleRef module = LLVMGetGlobalParent(
+         LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld.base.builder)));
+      LLVMDumpModule(module);
+
+   }
+
+   FREE( bld.instructions );
 }