X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fstate_tracker%2Fst_glsl_to_tgsi.cpp;h=60a4e2831a415e4337b4359d95c77ece5d82cbfe;hb=bb4c5d72d7c7cb1d9e7016e2c07c36875f30011a;hp=3a69a439822cc2b815ea7ef1e7c96165fe344e8b;hpb=f751730ad003bb19ce85bc4d0abddaf40edde6c1;p=mesa.git

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 3a69a439822..60a4e2831a4 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -42,17 +42,17 @@
 #include "ir_optimization.h"
 #include "ast.h"
 
-extern "C" {
 #include "main/mtypes.h"
-#include "main/shaderapi.h"
 #include "main/shaderobj.h"
-#include "main/uniforms.h"
 #include "program/hash_table.h"
+
+extern "C" {
+#include "main/shaderapi.h"
+#include "main/uniforms.h"
 #include "program/prog_instruction.h"
 #include "program/prog_optimize.h"
 #include "program/prog_print.h"
 #include "program/program.h"
-#include "program/prog_uniform.h"
 #include "program/prog_parameter.h"
 #include "program/sampler.h"
 
@@ -78,8 +78,17 @@ extern "C" {
                            (1 << PROGRAM_CONSTANT) |     \
                            (1 << PROGRAM_UNIFORM))
 
+/**
+ * Maximum number of temporary registers.
+ *
+ * It is too big for stack allocated arrays -- it will cause stack overflow on
+ * Windows and likely Mac OS X.
+ */
 #define MAX_TEMPS         4096
 
+/* will be 4 for GLSL 4.00 */
+#define MAX_GLSL_TEXTURE_OFFSET 1
+
 class st_src_reg;
 class st_dst_reg;
 
@@ -174,7 +183,7 @@ st_src_reg::st_src_reg(st_dst_reg reg)
    this->index = reg.index;
    this->swizzle = SWIZZLE_XYZW;
    this->negate = 0;
-   this->reladdr = NULL;
+   this->reladdr = reg.reladdr;
 }
 
 st_dst_reg::st_dst_reg(st_src_reg reg)
@@ -211,6 +220,8 @@ public:
    int sampler; /**< sampler index */
    int tex_target; /**< One of TEXTURE_*_INDEX */
    GLboolean tex_shadow;
+   struct tgsi_texture_offset tex_offsets[MAX_GLSL_TEXTURE_OFFSET];
+   unsigned tex_offset_num_offset;
    int dead_mask; /**< Used in dead code elimination */
 
    class function_entry *function; /* Set on TGSI_OPCODE_CAL or TGSI_OPCODE_BGNSUB */
@@ -229,6 +240,20 @@ public:
    ir_variable *var; /* variable that maps to this, if any */
 };
 
+class immediate_storage : public exec_node {
+public:
+   immediate_storage(gl_constant_value *values, int size, int type)
+   {
+      memcpy(this->values, values, size * sizeof(gl_constant_value));
+      this->size = size;
+      this->type = type;
+   }
+   
+   gl_constant_value values[4];
+   int size; /**< Number of components (1-4) */
+   int type; /**< GL_FLOAT, GL_INT, GL_BOOL, or GL_UNSIGNED_INT */
+};
+
 class function_entry : public exec_node {
 public:
    ir_function_signature *sig;
@@ -272,7 +297,6 @@ public:
    struct gl_program *prog;
    struct gl_shader_program *shader_program;
    struct gl_shader_compiler_options *options;
-   struct gl_program_parameter_list *immediates;
 
    int next_temp;
 
@@ -280,11 +304,16 @@ public:
    int samplers_used;
    bool indirect_addr_temps;
    bool indirect_addr_consts;
+   int num_clip_distances;
    
    int glsl_version;
+   bool native_integers;
 
    variable_storage *find_variable_storage(ir_variable *var);
 
+   int add_constant(gl_register_file file, gl_constant_value values[4],
+                    int size, int datatype, GLuint *swizzle_out);
+
    function_entry *get_function_signature(ir_function_signature *sig);
 
    st_src_reg get_temp(const glsl_type *type);
@@ -326,6 +355,10 @@ public:
    /** List of variable_storage */
    exec_list variables;
 
+   /** List of immediate_storage */
+   exec_list immediates;
+   unsigned num_immediates;
+
    /** List of function_entry */
    exec_list function_signatures;
    int next_signature_id;
@@ -352,11 +385,11 @@ public:
    /**
     * Emit the correct dot-product instruction for the type of arguments
     */
-   void emit_dp(ir_instruction *ir,
-                st_dst_reg dst,
-                st_src_reg src0,
-                st_src_reg src1,
-                unsigned elements);
+   glsl_to_tgsi_instruction *emit_dp(ir_instruction *ir,
+                                     st_dst_reg dst,
+                                     st_src_reg src0,
+                                     st_src_reg src1,
+                                     unsigned elements);
 
    void emit_scalar(ir_instruction *ir, unsigned op,
         	    st_dst_reg dst, st_src_reg src0);
@@ -364,20 +397,23 @@ public:
    void emit_scalar(ir_instruction *ir, unsigned op,
         	    st_dst_reg dst, st_src_reg src0, st_src_reg src1);
 
+   void try_emit_float_set(ir_instruction *ir, unsigned op, st_dst_reg dst);
+
    void emit_arl(ir_instruction *ir, st_dst_reg dst, st_src_reg src0);
 
    void emit_scs(ir_instruction *ir, unsigned op,
         	 st_dst_reg dst, const st_src_reg &src);
 
-   GLboolean try_emit_mad(ir_expression *ir,
-        		  int mul_operand);
-   GLboolean try_emit_sat(ir_expression *ir);
+   bool try_emit_mad(ir_expression *ir,
+              int mul_operand);
+   bool try_emit_mad_for_and_not(ir_expression *ir,
+              int mul_operand);
+   bool try_emit_sat(ir_expression *ir);
 
    void emit_swz(ir_expression *ir);
 
    bool process_move_condition(ir_rvalue *ir);
 
-   void remove_output_reads(gl_register_file type);
    void simplify_cmp(void);
 
    void rename_temp_register(int index, int new_index);
@@ -489,7 +525,7 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
 
    inst->function = NULL;
    
-   if (op == TGSI_OPCODE_ARL)
+   if (op == TGSI_OPCODE_ARL || op == TGSI_OPCODE_UARL)
       this->num_address_regs = 1;
    
    /* Update indirect addressing status used by TGSI */
@@ -539,7 +575,10 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
    }
 
    this->instructions.push_tail(inst);
-   
+
+   if (native_integers)
+      try_emit_float_set(ir, op, dst);
+
    return inst;
 }
 
@@ -565,11 +604,28 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op)
    return emit(ir, op, undef_dst, undef_src, undef_src, undef_src);
 }
 
+ /**
+ * Emits the code to convert the result of float SET instructions to integers.
+ */
+void
+glsl_to_tgsi_visitor::try_emit_float_set(ir_instruction *ir, unsigned op,
+        		 st_dst_reg dst)
+{
+   if ((op == TGSI_OPCODE_SEQ ||
+        op == TGSI_OPCODE_SNE ||
+        op == TGSI_OPCODE_SGE ||
+        op == TGSI_OPCODE_SLT))
+   {
+      st_src_reg src = st_src_reg(dst);
+      src.negate = ~src.negate;
+      dst.type = GLSL_TYPE_FLOAT;
+      emit(ir, TGSI_OPCODE_F2I, dst, src);
+   }
+}
+
 /**
  * Determines whether to use an integer, unsigned integer, or float opcode 
  * based on the operands and input opcode, then emits the result.
- * 
- * TODO: type checking for remaining TGSI opcodes
  */
 unsigned
 glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
@@ -580,8 +636,8 @@ glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
    
    if (src0.type == GLSL_TYPE_FLOAT || src1.type == GLSL_TYPE_FLOAT)
       type = GLSL_TYPE_FLOAT;
-   else if (glsl_version >= 130)
-      type = src0.type;
+   else if (native_integers)
+      type = src0.type == GLSL_TYPE_BOOL ? GLSL_TYPE_INT : src0.type;
 
 #define case4(c, f, i, u) \
    case TGSI_OPCODE_##c: \
@@ -607,12 +663,10 @@ glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
       case3(SGE, ISGE, USGE);
       case3(SLT, ISLT, USLT);
       
-      case2iu(SHL, SHL);
       case2iu(ISHR, USHR);
-      case2iu(NOT, NOT);
-      case2iu(AND, AND);
-      case2iu(OR, OR);
-      case2iu(XOR, XOR);
+
+      case2fi(SSG, ISSG);
+      case3(ABS, IABS, IABS);
       
       default: break;
    }
@@ -621,7 +675,7 @@ glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
    return op;
 }
 
-void
+glsl_to_tgsi_instruction *
 glsl_to_tgsi_visitor::emit_dp(ir_instruction *ir,
         		    st_dst_reg dst, st_src_reg src0, st_src_reg src1,
         		    unsigned elements)
@@ -630,7 +684,7 @@ glsl_to_tgsi_visitor::emit_dp(ir_instruction *ir,
       TGSI_OPCODE_DP2, TGSI_OPCODE_DP3, TGSI_OPCODE_DP4
    };
 
-   emit(ir, dot_opcodes[elements - 2], dst, src0, src1);
+   return emit(ir, dot_opcodes[elements - 2], dst, src0, src1);
 }
 
 /**
@@ -701,16 +755,12 @@ void
 glsl_to_tgsi_visitor::emit_arl(ir_instruction *ir,
         		        st_dst_reg dst, st_src_reg src0)
 {
-   st_src_reg tmp = get_temp(glsl_type::float_type);
+   int op = TGSI_OPCODE_ARL;
 
-   if (src0.type == GLSL_TYPE_INT)
-      emit(NULL, TGSI_OPCODE_I2F, st_dst_reg(tmp), src0);
-   else if (src0.type == GLSL_TYPE_UINT)
-      emit(NULL, TGSI_OPCODE_U2F, st_dst_reg(tmp), src0);
-   else
-      tmp = src0;
-   
-   emit(NULL, TGSI_OPCODE_ARL, dst, tmp);
+   if (src0.type == GLSL_TYPE_INT || src0.type == GLSL_TYPE_UINT)
+      op = TGSI_OPCODE_UARL;
+
+   emit(NULL, op, dst, src0);
 }
 
 /**
@@ -808,38 +858,71 @@ glsl_to_tgsi_visitor::emit_scs(ir_instruction *ir, unsigned op,
    }
 }
 
-struct st_src_reg
+int
+glsl_to_tgsi_visitor::add_constant(gl_register_file file,
+        		     gl_constant_value values[4], int size, int datatype,
+        		     GLuint *swizzle_out)
+{
+   if (file == PROGRAM_CONSTANT) {
+      return _mesa_add_typed_unnamed_constant(this->prog->Parameters, values,
+                                              size, datatype, swizzle_out);
+   } else {
+      int index = 0;
+      immediate_storage *entry;
+      assert(file == PROGRAM_IMMEDIATE);
+
+      /* Search immediate storage to see if we already have an identical
+       * immediate that we can use instead of adding a duplicate entry.
+       */
+      foreach_iter(exec_list_iterator, iter, this->immediates) {
+         entry = (immediate_storage *)iter.get();
+         
+         if (entry->size == size &&
+             entry->type == datatype &&
+             !memcmp(entry->values, values, size * sizeof(gl_constant_value))) {
+             return index;
+         }
+         index++;
+      }
+      
+      /* Add this immediate to the list. */
+      entry = new(mem_ctx) immediate_storage(values, size, datatype);
+      this->immediates.push_tail(entry);
+      this->num_immediates++;
+      return index;
+   }
+}
+
+st_src_reg
 glsl_to_tgsi_visitor::st_src_reg_for_float(float val)
 {
    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_FLOAT);
    union gl_constant_value uval;
 
    uval.f = val;
-   src.index = _mesa_add_typed_unnamed_constant(this->immediates, &uval, 1,
-                                                GL_FLOAT, &src.swizzle);
+   src.index = add_constant(src.file, &uval, 1, GL_FLOAT, &src.swizzle);
 
    return src;
 }
 
-struct st_src_reg
+st_src_reg
 glsl_to_tgsi_visitor::st_src_reg_for_int(int val)
 {
    st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_INT);
    union gl_constant_value uval;
    
-   assert(glsl_version >= 130);
+   assert(native_integers);
 
    uval.i = val;
-   src.index = _mesa_add_typed_unnamed_constant(this->immediates, &uval, 1,
-                                                GL_INT, &src.swizzle);
+   src.index = add_constant(src.file, &uval, 1, GL_INT, &src.swizzle);
 
    return src;
 }
 
-struct st_src_reg
+st_src_reg
 glsl_to_tgsi_visitor::st_src_reg_for_type(int type, int val)
 {
-   if (glsl_version >= 130)
+   if (native_integers)
       return type == GLSL_TYPE_FLOAT ? st_src_reg_for_float(val) : 
                                        st_src_reg_for_int(val);
    else
@@ -896,10 +979,8 @@ st_src_reg
 glsl_to_tgsi_visitor::get_temp(const glsl_type *type)
 {
    st_src_reg src;
-   int swizzle[4];
-   int i;
 
-   src.type = glsl_version >= 130 ? type->base_type : GLSL_TYPE_FLOAT;
+   src.type = native_integers ? type->base_type : GLSL_TYPE_FLOAT;
    src.file = PROGRAM_TEMPORARY;
    src.index = next_temp;
    src.reladdr = NULL;
@@ -908,12 +989,7 @@ glsl_to_tgsi_visitor::get_temp(const glsl_type *type)
    if (type->is_array() || type->is_record()) {
       src.swizzle = SWIZZLE_NOOP;
    } else {
-      for (i = 0; i < type->vector_elements; i++)
-         swizzle[i] = i;
-      for (; i < 4; i++)
-         swizzle[i] = type->vector_elements - 1;
-      src.swizzle = MAKE_SWIZZLE4(swizzle[0], swizzle[1],
-        			  swizzle[2], swizzle[3]);
+      src.swizzle = swizzle_for_size(type->vector_elements);
    }
    src.negate = 0;
 
@@ -944,29 +1020,6 @@ glsl_to_tgsi_visitor::visit(ir_variable *ir)
 
       fp->OriginUpperLeft = ir->origin_upper_left;
       fp->PixelCenterInteger = ir->pixel_center_integer;
-
-   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
-      struct gl_fragment_program *fp = (struct gl_fragment_program *)this->prog;
-      switch (ir->depth_layout) {
-      case ir_depth_layout_none:
-         fp->FragDepthLayout = FRAG_DEPTH_LAYOUT_NONE;
-         break;
-      case ir_depth_layout_any:
-         fp->FragDepthLayout = FRAG_DEPTH_LAYOUT_ANY;
-         break;
-      case ir_depth_layout_greater:
-         fp->FragDepthLayout = FRAG_DEPTH_LAYOUT_GREATER;
-         break;
-      case ir_depth_layout_less:
-         fp->FragDepthLayout = FRAG_DEPTH_LAYOUT_LESS;
-         break;
-      case ir_depth_layout_unchanged:
-         fp->FragDepthLayout = FRAG_DEPTH_LAYOUT_UNCHANGED;
-         break;
-      default:
-         assert(0);
-         break;
-      }
    }
 
    if (ir->mode == ir_var_uniform && strncmp(ir->name, "gl_", 3) == 0) {
@@ -986,7 +1039,7 @@ glsl_to_tgsi_visitor::visit(ir_variable *ir)
          }
       }
 
-      struct variable_storage *storage;
+      variable_storage *storage;
       st_dst_reg dst;
       if (i == ir->num_state_slots) {
          /* We'll set the index later. */
@@ -1007,7 +1060,7 @@ glsl_to_tgsi_visitor::visit(ir_variable *ir)
          this->next_temp += type_size(ir->type);
 
          dst = st_dst_reg(st_src_reg(PROGRAM_TEMPORARY, storage->index,
-               glsl_version >= 130 ? ir->type->base_type : GLSL_TYPE_FLOAT));
+               native_integers ? ir->type->base_type : GLSL_TYPE_FLOAT));
       }
 
 
@@ -1023,7 +1076,7 @@ glsl_to_tgsi_visitor::visit(ir_variable *ir)
             }
          } else {
             st_src_reg src(PROGRAM_STATE_VAR, index,
-                  glsl_version >= 130 ? ir->type->base_type : GLSL_TYPE_FLOAT);
+                  native_integers ? ir->type->base_type : GLSL_TYPE_FLOAT);
             src.swizzle = slots[i].swizzle;
             emit(ir, TGSI_OPCODE_MOV, dst, src);
             /* even a float takes up a whole vec4 reg in a struct/array. */
@@ -1137,7 +1190,7 @@ glsl_to_tgsi_visitor::visit(ir_function *ir)
    }
 }
 
-GLboolean
+bool
 glsl_to_tgsi_visitor::try_emit_mad(ir_expression *ir, int mul_operand)
 {
    int nonmul_operand = 1 - mul_operand;
@@ -1163,7 +1216,47 @@ glsl_to_tgsi_visitor::try_emit_mad(ir_expression *ir, int mul_operand)
    return true;
 }
 
-GLboolean
+/**
+ * Emit MAD(a, -b, a) instead of AND(a, NOT(b))
+ *
+ * The logic values are 1.0 for true and 0.0 for false.  Logical-and is
+ * implemented using multiplication, and logical-or is implemented using
+ * addition.  Logical-not can be implemented as (true - x), or (1.0 - x).
+ * As result, the logical expression (a & !b) can be rewritten as:
+ *
+ *     - a * !b
+ *     - a * (1 - b)
+ *     - (a * 1) - (a * b)
+ *     - a + -(a * b)
+ *     - a + (a * -b)
+ *
+ * This final expression can be implemented as a single MAD(a, -b, a)
+ * instruction.
+ */
+bool
+glsl_to_tgsi_visitor::try_emit_mad_for_and_not(ir_expression *ir, int try_operand)
+{
+   const int other_operand = 1 - try_operand;
+   st_src_reg a, b;
+
+   ir_expression *expr = ir->operands[try_operand]->as_expression();
+   if (!expr || expr->operation != ir_unop_logic_not)
+      return false;
+
+   ir->operands[other_operand]->accept(this);
+   a = this->result;
+   expr->operands[0]->accept(this);
+   b = this->result;
+
+   b.negate = ~b.negate;
+
+   this->result = get_temp(ir->type);
+   emit(ir, TGSI_OPCODE_MAD, st_dst_reg(this->result), a, b, a);
+
+   return true;
+}
+
+bool
 glsl_to_tgsi_visitor::try_emit_sat(ir_expression *ir)
 {
    /* Saturates were only introduced to vertex programs in
@@ -1179,12 +1272,32 @@ glsl_to_tgsi_visitor::try_emit_sat(ir_expression *ir)
    sat_src->accept(this);
    st_src_reg src = this->result;
 
-   this->result = get_temp(ir->type);
-   st_dst_reg result_dst = st_dst_reg(this->result);
-   result_dst.writemask = (1 << ir->type->vector_elements) - 1;
-   glsl_to_tgsi_instruction *inst;
-   inst = emit(ir, TGSI_OPCODE_MOV, result_dst, src);
-   inst->saturate = true;
+   /* If we generated an expression instruction into a temporary in
+    * processing the saturate's operand, apply the saturate to that
+    * instruction.  Otherwise, generate a MOV to do the saturate.
+    *
+    * Note that we have to be careful to only do this optimization if
+    * the instruction in question was what generated src->result.  For
+    * example, ir_dereference_array might generate a MUL instruction
+    * to create the reladdr, and return us a src reg using that
+    * reladdr.  That MUL result is not the value we're trying to
+    * saturate.
+    */
+   ir_expression *sat_src_expr = sat_src->as_expression();
+   if (sat_src_expr && (sat_src_expr->operation == ir_binop_mul ||
+			sat_src_expr->operation == ir_binop_add ||
+			sat_src_expr->operation == ir_binop_dot)) {
+      glsl_to_tgsi_instruction *new_inst;
+      new_inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
+      new_inst->saturate = true;
+   } else {
+      this->result = get_temp(ir->type);
+      st_dst_reg result_dst = st_dst_reg(this->result);
+      result_dst.writemask = (1 << ir->type->vector_elements) - 1;
+      glsl_to_tgsi_instruction *inst;
+      inst = emit(ir, TGSI_OPCODE_MOV, result_dst, src);
+      inst->saturate = true;
+   }
 
    return true;
 }
@@ -1224,6 +1337,16 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       if (try_emit_mad(ir, 0))
          return;
    }
+
+   /* Quick peephole: Emit OPCODE_MAD(-a, -b, a) instead of AND(a, NOT(b))
+    */
+   if (ir->operation == ir_binop_logic_and) {
+      if (try_emit_mad_for_and_not(ir, 1))
+	 return;
+      if (try_emit_mad_for_and_not(ir, 0))
+	 return;
+   }
+
    if (try_emit_sat(ir))
       return;
 
@@ -1269,11 +1392,20 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
 
    switch (ir->operation) {
    case ir_unop_logic_not:
-      emit(ir, TGSI_OPCODE_SEQ, result_dst, op[0], st_src_reg_for_type(result_dst.type, 0));
+      if (result_dst.type != GLSL_TYPE_FLOAT)
+         emit(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
+      else {
+         /* Previously 'SEQ dst, src, 0.0' was used for this.  However, many
+          * older GPUs implement SEQ using multiple instructions (i915 uses two
+          * SGE instructions and a MUL instruction).  Since our logic values are
+          * 0.0 and 1.0, 1-x also implements !x.
+          */
+         op[0].negate = ~op[0].negate;
+         emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], st_src_reg_for_float(1.0));
+      }
       break;
    case ir_unop_neg:
-      assert(result_dst.type == GLSL_TYPE_FLOAT || result_dst.type == GLSL_TYPE_INT);
-      if (result_dst.type == GLSL_TYPE_INT)
+      if (result_dst.type == GLSL_TYPE_INT || result_dst.type == GLSL_TYPE_UINT)
          emit(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
       else {
          op[0].negate = ~op[0].negate;
@@ -1281,7 +1413,6 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
       break;
    case ir_unop_abs:
-      assert(result_dst.type == GLSL_TYPE_FLOAT);
       emit(ir, TGSI_OPCODE_ABS, result_dst, op[0]);
       break;
    case ir_unop_sign:
@@ -1360,10 +1491,10 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       emit(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
       break;
    case ir_binop_greater:
-      emit(ir, TGSI_OPCODE_SGT, result_dst, op[0], op[1]);
+      emit(ir, TGSI_OPCODE_SLT, result_dst, op[1], op[0]);
       break;
    case ir_binop_lequal:
-      emit(ir, TGSI_OPCODE_SLE, result_dst, op[0], op[1]);
+      emit(ir, TGSI_OPCODE_SGE, result_dst, op[1], op[0]);
       break;
    case ir_binop_gequal:
       emit(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
@@ -1378,13 +1509,56 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       /* "==" operator producing a scalar boolean. */
       if (ir->operands[0]->type->is_vector() ||
           ir->operands[1]->type->is_vector()) {
-         st_src_reg temp = get_temp(glsl_version >= 130 ? 
+         st_src_reg temp = get_temp(native_integers ?
                glsl_type::get_instance(ir->operands[0]->type->base_type, 4, 1) :
                glsl_type::vec4_type);
-         assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
-         emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
-         emit_dp(ir, result_dst, temp, temp, vector_elements);
-         emit(ir, TGSI_OPCODE_SEQ, result_dst, result_src, st_src_reg_for_float(0.0));
+         
+         if (native_integers) {
+            st_dst_reg temp_dst = st_dst_reg(temp);
+            st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
+            
+            emit(ir, TGSI_OPCODE_SEQ, st_dst_reg(temp), op[0], op[1]);
+            
+            /* Emit 1-3 AND operations to combine the SEQ results. */
+            switch (ir->operands[0]->type->vector_elements) {
+            case 2:
+               break;
+            case 3:
+               temp_dst.writemask = WRITEMASK_Y;
+               temp1.swizzle = SWIZZLE_YYYY;
+               temp2.swizzle = SWIZZLE_ZZZZ;
+               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+               break;
+            case 4:
+               temp_dst.writemask = WRITEMASK_X;
+               temp1.swizzle = SWIZZLE_XXXX;
+               temp2.swizzle = SWIZZLE_YYYY;
+               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+               temp_dst.writemask = WRITEMASK_Y;
+               temp1.swizzle = SWIZZLE_ZZZZ;
+               temp2.swizzle = SWIZZLE_WWWW;
+               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+            }
+            
+            temp1.swizzle = SWIZZLE_XXXX;
+            temp2.swizzle = SWIZZLE_YYYY;
+            emit(ir, TGSI_OPCODE_AND, result_dst, temp1, temp2);
+         } else {
+            emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
+            
+            /* After the dot-product, the value will be an integer on the
+             * range [0,4].  Zero becomes 1.0, and positive values become zero.
+             */
+            emit_dp(ir, result_dst, temp, temp, vector_elements);
+
+            /* Negating the result of the dot-product gives values on the range
+             * [-4, 0].  Zero becomes 1.0, and negative values become zero.
+             * This is achieved using SGE.
+             */
+            st_src_reg sge_src = result_src;
+            sge_src.negate = ~sge_src.negate;
+            emit(ir, TGSI_OPCODE_SGE, result_dst, sge_src, st_src_reg_for_float(0.0));
+         }
       } else {
          emit(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
       }
@@ -1393,38 +1567,143 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       /* "!=" operator producing a scalar boolean. */
       if (ir->operands[0]->type->is_vector() ||
           ir->operands[1]->type->is_vector()) {
-         st_src_reg temp = get_temp(glsl_version >= 130 ? 
+         st_src_reg temp = get_temp(native_integers ?
                glsl_type::get_instance(ir->operands[0]->type->base_type, 4, 1) :
                glsl_type::vec4_type);
-         assert(ir->operands[0]->type->base_type == GLSL_TYPE_FLOAT);
          emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
-         emit_dp(ir, result_dst, temp, temp, vector_elements);
-         emit(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_float(0.0));
+
+         if (native_integers) {
+            st_dst_reg temp_dst = st_dst_reg(temp);
+            st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
+            
+            /* Emit 1-3 OR operations to combine the SNE results. */
+            switch (ir->operands[0]->type->vector_elements) {
+            case 2:
+               break;
+            case 3:
+               temp_dst.writemask = WRITEMASK_Y;
+               temp1.swizzle = SWIZZLE_YYYY;
+               temp2.swizzle = SWIZZLE_ZZZZ;
+               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+               break;
+            case 4:
+               temp_dst.writemask = WRITEMASK_X;
+               temp1.swizzle = SWIZZLE_XXXX;
+               temp2.swizzle = SWIZZLE_YYYY;
+               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+               temp_dst.writemask = WRITEMASK_Y;
+               temp1.swizzle = SWIZZLE_ZZZZ;
+               temp2.swizzle = SWIZZLE_WWWW;
+               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+            }
+            
+            temp1.swizzle = SWIZZLE_XXXX;
+            temp2.swizzle = SWIZZLE_YYYY;
+            emit(ir, TGSI_OPCODE_OR, result_dst, temp1, temp2);
+         } else {
+            /* After the dot-product, the value will be an integer on the
+             * range [0,4].  Zero stays zero, and positive values become 1.0.
+             */
+            glsl_to_tgsi_instruction *const dp =
+                  emit_dp(ir, result_dst, temp, temp, vector_elements);
+            if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
+               /* The clamping to [0,1] can be done for free in the fragment
+                * shader with a saturate.
+                */
+               dp->saturate = true;
+            } else {
+               /* Negating the result of the dot-product gives values on the range
+                * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
+                * achieved using SLT.
+                */
+               st_src_reg slt_src = result_src;
+               slt_src.negate = ~slt_src.negate;
+               emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
+            }
+         }
       } else {
          emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
       }
       break;
 
-   case ir_unop_any:
+   case ir_unop_any: {
       assert(ir->operands[0]->type->is_vector());
-      emit_dp(ir, result_dst, op[0], op[0],
-              ir->operands[0]->type->vector_elements);
-      emit(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_float(0.0));
+
+      /* After the dot-product, the value will be an integer on the
+       * range [0,4].  Zero stays zero, and positive values become 1.0.
+       */
+      glsl_to_tgsi_instruction *const dp =
+         emit_dp(ir, result_dst, op[0], op[0],
+                 ir->operands[0]->type->vector_elements);
+      if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
+          result_dst.type == GLSL_TYPE_FLOAT) {
+	      /* The clamping to [0,1] can be done for free in the fragment
+	       * shader with a saturate.
+	       */
+	      dp->saturate = true;
+      } else if (result_dst.type == GLSL_TYPE_FLOAT) {
+	      /* Negating the result of the dot-product gives values on the range
+	       * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
+	       * is achieved using SLT.
+	       */
+	      st_src_reg slt_src = result_src;
+	      slt_src.negate = ~slt_src.negate;
+	      emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
+      }
+      else {
+         /* Use SNE 0 if integers are being used as boolean values. */
+         emit(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0));
+      }
       break;
+   }
 
    case ir_binop_logic_xor:
-      emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
+      if (native_integers)
+         emit(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
+      else
+         emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
       break;
 
-   case ir_binop_logic_or:
-      /* This could be a saturated add and skip the SNE. */
-      emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
-      emit(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_float(0.0));
+   case ir_binop_logic_or: {
+      if (native_integers) {
+         /* If integers are used as booleans, we can use an actual "or" 
+          * instruction.
+          */
+         assert(native_integers);
+         emit(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
+      } else {
+         /* After the addition, the value will be an integer on the
+          * range [0,2].  Zero stays zero, and positive values become 1.0.
+          */
+         glsl_to_tgsi_instruction *add =
+            emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
+         if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
+            /* The clamping to [0,1] can be done for free in the fragment
+             * shader with a saturate if floats are being used as boolean values.
+             */
+            add->saturate = true;
+         } else {
+            /* Negating the result of the addition gives values on the range
+             * [-2, 0].  Zero stays zero, and negative values become 1.0.  This
+             * is achieved using SLT.
+             */
+            st_src_reg slt_src = result_src;
+            slt_src.negate = ~slt_src.negate;
+            emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
+         }
+      }
       break;
+   }
 
    case ir_binop_logic_and:
-      /* the bool args are stored as float 0.0 or 1.0, so "mul" gives us "and". */
-      emit(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
+      /* If native integers are disabled, the bool args are stored as float 0.0
+       * or 1.0, so "mul" gives us "and".  If they're enabled, just use the
+       * actual AND opcode.
+       */
+      if (native_integers)
+         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
+      else
+         emit(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
       break;
 
    case ir_binop_dot:
@@ -1447,37 +1726,63 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0]);
       break;
    case ir_unop_i2f:
-   case ir_unop_b2f:
-      if (glsl_version >= 130) {
+      if (native_integers) {
          emit(ir, TGSI_OPCODE_I2F, result_dst, op[0]);
          break;
       }
-   case ir_unop_b2i:
-      /* Booleans are stored as integers (or floats in GLSL 1.20 and lower). */
+      /* fallthrough to next case otherwise */
+   case ir_unop_b2f:
+      if (native_integers) {
+         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_float(1.0));
+         break;
+      }
+      /* fallthrough to next case otherwise */
+   case ir_unop_i2u:
+   case ir_unop_u2i:
+      /* Converting between signed and unsigned integers is a no-op. */
       result_src = op[0];
       break;
+   case ir_unop_b2i:
+      if (native_integers) {
+         /* Booleans are stored as integers using ~0 for true and 0 for false.
+          * GLSL requires that int(bool) return 1 for true and 0 for false.
+          * This conversion is done with AND, but it could be done with NEG.
+          */
+         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_int(1));
+      } else {
+         /* Booleans and integers are both stored as floats when native 
+          * integers are disabled.
+          */
+         result_src = op[0];
+      }
+      break;
    case ir_unop_f2i:
-      if (glsl_version >= 130)
+      if (native_integers)
          emit(ir, TGSI_OPCODE_F2I, result_dst, op[0]);
       else
          emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
    case ir_unop_f2b:
+      emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
+      break;
    case ir_unop_i2b:
-      emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], 
-            st_src_reg_for_type(result_dst.type, 0));
+      if (native_integers)
+         emit(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
+      else
+         emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
       break;
    case ir_unop_trunc:
       emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
    case ir_unop_ceil:
-      op[0].negate = ~op[0].negate;
-      emit(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
-      result_src.negate = ~result_src.negate;
+      emit(ir, TGSI_OPCODE_CEIL, result_dst, op[0]);
       break;
    case ir_unop_floor:
       emit(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
       break;
+   case ir_unop_round_even:
+      emit(ir, TGSI_OPCODE_ROUND, result_dst, op[0]);
+      break;
    case ir_unop_fract:
       emit(ir, TGSI_OPCODE_FRC, result_dst, op[0]);
       break;
@@ -1493,41 +1798,41 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       break;
 
    case ir_unop_bit_not:
-      if (glsl_version >= 130) {
+      if (native_integers) {
          emit(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
          break;
       }
    case ir_unop_u2f:
-      if (glsl_version >= 130) {
+      if (native_integers) {
          emit(ir, TGSI_OPCODE_U2F, result_dst, op[0]);
          break;
       }
    case ir_binop_lshift:
-      if (glsl_version >= 130) {
-         emit(ir, TGSI_OPCODE_SHL, result_dst, op[0]);
+      if (native_integers) {
+         emit(ir, TGSI_OPCODE_SHL, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_rshift:
-      if (glsl_version >= 130) {
-         emit(ir, TGSI_OPCODE_ISHR, result_dst, op[0]);
+      if (native_integers) {
+         emit(ir, TGSI_OPCODE_ISHR, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_and:
-      if (glsl_version >= 130) {
-         emit(ir, TGSI_OPCODE_AND, result_dst, op[0]);
+      if (native_integers) {
+         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_xor:
-      if (glsl_version >= 130) {
-         emit(ir, TGSI_OPCODE_XOR, result_dst, op[0]);
+      if (native_integers) {
+         emit(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_or:
-      if (glsl_version >= 130) {
-         emit(ir, TGSI_OPCODE_OR, result_dst, op[0]);
+      if (native_integers) {
+         emit(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
          break;
       }
-   case ir_unop_round_even:
+
       assert(!"GLSL 1.30 features unsupported");
       break;
 
@@ -1613,20 +1918,12 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
          entry = new(mem_ctx) variable_storage(var,
                                                PROGRAM_INPUT,
                                                var->location);
-         if (this->prog->Target == GL_VERTEX_PROGRAM_ARB &&
-             var->location >= VERT_ATTRIB_GENERIC0) {
-            _mesa_add_attribute(this->prog->Attributes,
-                                var->name,
-                                _mesa_sizeof_glsl_type(var->type->gl_type),
-                                var->type->gl_type,
-                                var->location - VERT_ATTRIB_GENERIC0);
-         }
          break;
       case ir_var_out:
          assert(var->location != -1);
          entry = new(mem_ctx) variable_storage(var,
                                                PROGRAM_OUTPUT,
-                                               var->location);
+                                               var->location + var->index);
          break;
       case ir_var_system_value:
          entry = new(mem_ctx) variable_storage(var,
@@ -1650,7 +1947,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
    }
 
    this->result = st_src_reg(entry->file, entry->index, var->type);
-   if (glsl_version <= 120)
+   if (!native_integers)
       this->result.type = GLSL_TYPE_FLOAT;
 }
 
@@ -1669,7 +1966,6 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
    if (index) {
       src.index += index->value.i[0] * element_size;
    } else {
-      st_src_reg array_base = this->result;
       /* Variable index array dereference.  It eats the "vec4" of the
        * base of the array and an index that offsets the TGSI register
        * index.
@@ -1681,10 +1977,24 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
       if (element_size == 1) {
          index_reg = this->result;
       } else {
-         index_reg = get_temp(glsl_type::float_type);
+         index_reg = get_temp(native_integers ?
+                              glsl_type::int_type : glsl_type::float_type);
 
          emit(ir, TGSI_OPCODE_MUL, st_dst_reg(index_reg),
-              this->result, st_src_reg_for_float(element_size));
+              this->result, st_src_reg_for_type(index_reg.type, element_size));
+      }
+
+      /* If there was already a relative address register involved, add the
+       * new and the old together to get the new offset.
+       */
+      if (src.reladdr != NULL) {
+         st_src_reg accum_reg = get_temp(native_integers ?
+                                glsl_type::int_type : glsl_type::float_type);
+
+         emit(ir, TGSI_OPCODE_ADD, st_dst_reg(accum_reg),
+              index_reg, *src.reladdr);
+
+         index_reg = accum_reg;
       }
 
       src.reladdr = ralloc(mem_ctx, st_src_reg);
@@ -1900,12 +2210,25 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
 
       for (i = 0; i < type_size(ir->lhs->type); i++) {
          st_src_reg l_src = st_src_reg(l);
+         st_src_reg condition_temp = condition;
          l_src.swizzle = swizzle_for_size(ir->lhs->type->vector_elements);
          
+         if (native_integers) {
+            /* This is necessary because TGSI's CMP instruction expects the
+             * condition to be a float, and we store booleans as integers.
+             * If TGSI had a UCMP instruction or similar, this extra
+             * instruction would not be necessary.
+             */
+            condition_temp = get_temp(glsl_type::vec4_type);
+            condition.negate = 0;
+            emit(ir, TGSI_OPCODE_I2F, st_dst_reg(condition_temp), condition);
+            condition_temp.swizzle = condition.swizzle;
+         }
+         
          if (switch_order) {
-            emit(ir, TGSI_OPCODE_CMP, l, condition, l_src, r);
+            emit(ir, TGSI_OPCODE_CMP, l, condition_temp, l_src, r);
          } else {
-            emit(ir, TGSI_OPCODE_CMP, l, condition, r, l_src);
+            emit(ir, TGSI_OPCODE_CMP, l, condition_temp, r, l_src);
          }
 
          l.index++;
@@ -1914,15 +2237,18 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
    } else if (ir->rhs->as_expression() &&
               this->instructions.get_tail() &&
               ir->rhs == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->ir &&
-              type_size(ir->lhs->type) == 1) {
+              type_size(ir->lhs->type) == 1 &&
+              l.writemask == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->dst.writemask) {
       /* To avoid emitting an extra MOV when assigning an expression to a 
        * variable, emit the last instruction of the expression again, but
        * replace the destination register with the target of the assignment.
        * Dead code elimination will remove the original instruction.
        */
-      glsl_to_tgsi_instruction *inst;
+      glsl_to_tgsi_instruction *inst, *new_inst;
       inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
-      emit(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2]);
+      new_inst = emit(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2]);
+      new_inst->saturate = inst->saturate;
+      inst->dead_mask = inst->dst.writemask;
    } else {
       for (i = 0; i < type_size(ir->lhs->type); i++) {
          emit(ir, TGSI_OPCODE_MOV, l, r);
@@ -1941,12 +2267,8 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
    gl_constant_value *values = (gl_constant_value *) stack_vals;
    GLenum gl_type = GL_NONE;
    unsigned int i;
-   gl_register_file file;
-   gl_program_parameter_list *param_list;
    static int in_array = 0;
-
-   file = in_array ? PROGRAM_CONSTANT : PROGRAM_IMMEDIATE;
-   param_list = in_array ? this->prog->Parameters : this->immediates;
+   gl_register_file file = in_array ? PROGRAM_CONSTANT : PROGRAM_IMMEDIATE;
 
    /* Unfortunately, 4 floats is all we can get into
     * _mesa_add_typed_unnamed_constant.  So, make a temp to store an
@@ -2009,11 +2331,11 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
          values = (gl_constant_value *) &ir->value.f[i * ir->type->vector_elements];
 
          src = st_src_reg(file, -1, ir->type->base_type);
-         src.index = _mesa_add_typed_unnamed_constant(param_list,
-                                                      values,
-                                                      ir->type->vector_elements,
-                                                      GL_FLOAT,
-                                                      &src.swizzle);
+         src.index = add_constant(file,
+                                  values,
+                                  ir->type->vector_elements,
+                                  GL_FLOAT,
+                                  &src.swizzle);
          emit(ir, TGSI_OPCODE_MOV, mat_column, src);
 
          mat_column.index++;
@@ -2031,28 +2353,28 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
       }
       break;
    case GLSL_TYPE_UINT:
-      gl_type = glsl_version >= 130 ? GL_UNSIGNED_INT : GL_FLOAT;
+      gl_type = native_integers ? GL_UNSIGNED_INT : GL_FLOAT;
       for (i = 0; i < ir->type->vector_elements; i++) {
-         if (glsl_version >= 130)
+         if (native_integers)
             values[i].u = ir->value.u[i];
          else
             values[i].f = ir->value.u[i];
       }
       break;
    case GLSL_TYPE_INT:
-      gl_type = glsl_version >= 130 ? GL_INT : GL_FLOAT;
+      gl_type = native_integers ? GL_INT : GL_FLOAT;
       for (i = 0; i < ir->type->vector_elements; i++) {
-         if (glsl_version >= 130)
+         if (native_integers)
             values[i].i = ir->value.i[i];
          else
             values[i].f = ir->value.i[i];
       }
       break;
    case GLSL_TYPE_BOOL:
-      gl_type = glsl_version >= 130 ? GL_BOOL : GL_FLOAT;
+      gl_type = native_integers ? GL_BOOL : GL_FLOAT;
       for (i = 0; i < ir->type->vector_elements; i++) {
-         if (glsl_version >= 130)
-            values[i].b = ir->value.b[i];
+         if (native_integers)
+            values[i].u = ir->value.b[i] ? ~0 : 0;
          else
             values[i].f = ir->value.b[i];
       }
@@ -2062,9 +2384,11 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
    }
 
    this->result = st_src_reg(file, -1, ir->type);
-   this->result.index = _mesa_add_typed_unnamed_constant(param_list,
-        					   values, ir->type->vector_elements, gl_type,
-        					   &this->result.swizzle);
+   this->result.index = add_constant(file,
+                                     values,
+                                     ir->type->vector_elements,
+                                     gl_type,
+                                     &this->result.swizzle);
 }
 
 function_entry *
@@ -2113,7 +2437,7 @@ void
 glsl_to_tgsi_visitor::visit(ir_call *ir)
 {
    glsl_to_tgsi_instruction *call_inst;
-   ir_function_signature *sig = ir->get_callee();
+   ir_function_signature *sig = ir->callee;
    function_entry *entry = get_function_signature(sig);
    int i;
 
@@ -2192,21 +2516,23 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
 void
 glsl_to_tgsi_visitor::visit(ir_texture *ir)
 {
-   st_src_reg result_src, coord, lod_info, projector, dx, dy;
+   st_src_reg result_src, coord, lod_info, projector, dx, dy, offset;
    st_dst_reg result_dst, coord_dst;
    glsl_to_tgsi_instruction *inst = NULL;
    unsigned opcode = TGSI_OPCODE_NOP;
 
-   ir->coordinate->accept(this);
+   if (ir->coordinate) {
+      ir->coordinate->accept(this);
 
-   /* Put our coords in a temp.  We'll need to modify them for shadow,
-    * projection, or LOD, so the only case we'd use it as is is if
-    * we're doing plain old texturing.  The optimization passes on
-    * glsl_to_tgsi_visitor should handle cleaning up our mess in that case.
-    */
-   coord = get_temp(glsl_type::vec4_type);
-   coord_dst = st_dst_reg(coord);
-   emit(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
+      /* Put our coords in a temp.  We'll need to modify them for shadow,
+       * projection, or LOD, so the only case we'd use it as is is if
+       * we're doing plain old texturing.  The optimization passes on
+       * glsl_to_tgsi_visitor should handle cleaning up our mess in that case.
+       */
+      coord = get_temp(glsl_type::vec4_type);
+      coord_dst = st_dst_reg(coord);
+      emit(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
+   }
 
    if (ir->projector) {
       ir->projector->accept(this);
@@ -2240,11 +2566,24 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
       ir->lod_info.grad.dPdy->accept(this);
       dy = this->result;
       break;
-   case ir_txf: /* TODO: use TGSI_OPCODE_TXF here */
-      assert(!"GLSL 1.30 features unsupported");
+   case ir_txs:
+      opcode = TGSI_OPCODE_TXQ;
+      ir->lod_info.lod->accept(this);
+      lod_info = this->result;
+      break;
+   case ir_txf:
+      opcode = TGSI_OPCODE_TXF;
+      ir->lod_info.lod->accept(this);
+      lod_info = this->result;
+      if (ir->offset) {
+	 ir->offset->accept(this);
+	 offset = this->result;
+      }
       break;
    }
 
+   const glsl_type *sampler_type = ir->sampler->type;
+
    if (ir->projector) {
       if (opcode == TGSI_OPCODE_TEX) {
          /* Slot the projector in as the last component of the coord. */
@@ -2276,6 +2615,9 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
             tmp_src = get_temp(glsl_type::vec4_type);
             st_dst_reg tmp_dst = st_dst_reg(tmp_src);
 
+	    /* Projective division not allowed for array samplers. */
+	    assert(!sampler_type->sampler_array);
+
             tmp_dst.writemask = WRITEMASK_Z;
             emit(ir, TGSI_OPCODE_MOV, tmp_dst, this->result);
 
@@ -2300,12 +2642,22 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
        * coord.
        */
       ir->shadow_comparitor->accept(this);
-      coord_dst.writemask = WRITEMASK_Z;
+
+      /* XXX This will need to be updated for cubemap array samplers. */
+      if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_2D &&
+	   sampler_type->sampler_array) ||
+	  sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE) {
+         coord_dst.writemask = WRITEMASK_W;
+      } else {
+         coord_dst.writemask = WRITEMASK_Z;
+      }
+
       emit(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
       coord_dst.writemask = WRITEMASK_XYZW;
    }
 
-   if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXB) {
+   if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXB ||
+       opcode == TGSI_OPCODE_TXF) {
       /* TGSI stores LOD or LOD bias in the last channel of the coords. */
       coord_dst.writemask = WRITEMASK_W;
       emit(ir, TGSI_OPCODE_MOV, coord_dst, lod_info);
@@ -2314,7 +2666,11 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
 
    if (opcode == TGSI_OPCODE_TXD)
       inst = emit(ir, opcode, result_dst, coord, dx, dy);
-   else
+   else if (opcode == TGSI_OPCODE_TXQ)
+      inst = emit(ir, opcode, result_dst, lod_info);
+   else if (opcode == TGSI_OPCODE_TXF) {
+      inst = emit(ir, opcode, result_dst, coord);
+   } else
       inst = emit(ir, opcode, result_dst, coord);
 
    if (ir->shadow_comparitor)
@@ -2324,7 +2680,14 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
         					   this->shader_program,
         					   this->prog);
 
-   const glsl_type *sampler_type = ir->sampler->type;
+   if (ir->offset) {
+       inst->tex_offset_num_offset = 1;
+       inst->tex_offsets[0].Index = offset.index;
+       inst->tex_offsets[0].File = offset.file;
+       inst->tex_offsets[0].SwizzleX = GET_SWZ(offset.swizzle, 0);
+       inst->tex_offsets[0].SwizzleY = GET_SWZ(offset.swizzle, 1);
+       inst->tex_offsets[0].SwizzleZ = GET_SWZ(offset.swizzle, 2);
+   }
 
    switch (sampler_type->sampler_dimensionality) {
    case GLSL_SAMPLER_DIM_1D:
@@ -2347,6 +2710,9 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
    case GLSL_SAMPLER_DIM_BUF:
       assert(!"FINISHME: Implement ARB_texture_buffer_object");
       break;
+   case GLSL_SAMPLER_DIM_EXTERNAL:
+      inst->tex_target = TEXTURE_EXTERNAL_INDEX;
+      break;
    default:
       assert(!"Should not get here.");
    }
@@ -2397,7 +2763,7 @@ glsl_to_tgsi_visitor::visit(ir_discard *ir)
 void
 glsl_to_tgsi_visitor::visit(ir_if *ir)
 {
-   glsl_to_tgsi_instruction *cond_inst, *if_inst, *else_inst = NULL;
+   glsl_to_tgsi_instruction *cond_inst, *if_inst;
    glsl_to_tgsi_instruction *prev_inst;
 
    prev_inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
@@ -2429,7 +2795,7 @@ glsl_to_tgsi_visitor::visit(ir_if *ir)
    visit_exec_list(&ir->then_instructions, this);
 
    if (!ir->else_instructions.is_empty()) {
-      else_inst = emit(ir->condition, TGSI_OPCODE_ELSE);
+      emit(ir->condition, TGSI_OPCODE_ELSE);
       visit_exec_list(&ir->else_instructions, this);
    }
 
@@ -2441,17 +2807,20 @@ glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
    result.file = PROGRAM_UNDEFINED;
    next_temp = 1;
    next_signature_id = 1;
+   num_immediates = 0;
    current_function = NULL;
    num_address_regs = 0;
    indirect_addr_temps = false;
    indirect_addr_consts = false;
-   immediates = _mesa_new_parameter_list();
    mem_ctx = ralloc_context(NULL);
+   ctx = NULL;
+   prog = NULL;
+   shader_program = NULL;
+   options = NULL;
 }
 
 glsl_to_tgsi_visitor::~glsl_to_tgsi_visitor()
 {
-   _mesa_free_parameter_list(immediates);
    ralloc_free(mem_ctx);
 }
 
@@ -2476,8 +2845,6 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
       if (is_tex_instruction(inst->op)) {
          v->samplers_used |= 1 << inst->sampler;
 
-         prog->SamplerTargets[inst->sampler] =
-            (gl_texture_index)inst->tex_target;
          if (inst->tex_shadow) {
             prog->ShadowSamplers |= 1 << inst->sampler;
          }
@@ -2485,172 +2852,9 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
    }
    
    prog->SamplersUsed = v->samplers_used;
-   _mesa_update_shader_textures_used(prog);
-}
-
 
-/**
- * Check if the given vertex/fragment/shader program is within the
- * resource limits of the context (number of texture units, etc).
- * If any of those checks fail, record a linker error.
- *
- * XXX more checks are needed...
- */
-static void
-check_resources(const struct gl_context *ctx,
-                struct gl_shader_program *shader_program,
-                glsl_to_tgsi_visitor *prog,
-                struct gl_program *proginfo)
-{
-   switch (proginfo->Target) {
-   case GL_VERTEX_PROGRAM_ARB:
-      if (_mesa_bitcount(prog->samplers_used) >
-          ctx->Const.MaxVertexTextureImageUnits) {
-         fail_link(shader_program, "Too many vertex shader texture samplers");
-      }
-      if (proginfo->Parameters->NumParameters > MAX_UNIFORMS) {
-         fail_link(shader_program, "Too many vertex shader constants");
-      }
-      break;
-   case MESA_GEOMETRY_PROGRAM:
-      if (_mesa_bitcount(prog->samplers_used) >
-          ctx->Const.MaxGeometryTextureImageUnits) {
-         fail_link(shader_program, "Too many geometry shader texture samplers");
-      }
-      if (proginfo->Parameters->NumParameters >
-          MAX_GEOMETRY_UNIFORM_COMPONENTS / 4) {
-         fail_link(shader_program, "Too many geometry shader constants");
-      }
-      break;
-   case GL_FRAGMENT_PROGRAM_ARB:
-      if (_mesa_bitcount(prog->samplers_used) >
-          ctx->Const.MaxTextureImageUnits) {
-         fail_link(shader_program, "Too many fragment shader texture samplers");
-      }
-      if (proginfo->Parameters->NumParameters > MAX_UNIFORMS) {
-         fail_link(shader_program, "Too many fragment shader constants");
-      }
-      break;
-   default:
-      _mesa_problem(ctx, "unexpected program type in check_resources()");
-   }
-}
-
-
-
-struct uniform_sort {
-   struct gl_uniform *u;
-   int pos;
-};
-
-/* The shader_program->Uniforms list is almost sorted in increasing
- * uniform->{Frag,Vert}Pos locations, but not quite when there are
- * uniforms shared between targets.  We need to add parameters in
- * increasing order for the targets.
- */
-static int
-sort_uniforms(const void *a, const void *b)
-{
-   struct uniform_sort *u1 = (struct uniform_sort *)a;
-   struct uniform_sort *u2 = (struct uniform_sort *)b;
-
-   return u1->pos - u2->pos;
-}
-
-/* Add the uniforms to the parameters.  The linker chose locations
- * in our parameters lists (which weren't created yet), which the
- * uniforms code will use to poke values into our parameters list
- * when uniforms are updated.
- */
-static void
-add_uniforms_to_parameters_list(struct gl_shader_program *shader_program,
-        			struct gl_shader *shader,
-        			struct gl_program *prog)
-{
-   unsigned int i;
-   unsigned int next_sampler = 0, num_uniforms = 0;
-   struct uniform_sort *sorted_uniforms;
-
-   sorted_uniforms = ralloc_array(NULL, struct uniform_sort,
-        			  shader_program->Uniforms->NumUniforms);
-
-   for (i = 0; i < shader_program->Uniforms->NumUniforms; i++) {
-      struct gl_uniform *uniform = shader_program->Uniforms->Uniforms + i;
-      int parameter_index = -1;
-
-      switch (shader->Type) {
-      case GL_VERTEX_SHADER:
-         parameter_index = uniform->VertPos;
-         break;
-      case GL_FRAGMENT_SHADER:
-         parameter_index = uniform->FragPos;
-         break;
-      case GL_GEOMETRY_SHADER:
-         parameter_index = uniform->GeomPos;
-         break;
-      }
-
-      /* Only add uniforms used in our target. */
-      if (parameter_index != -1) {
-         sorted_uniforms[num_uniforms].pos = parameter_index;
-         sorted_uniforms[num_uniforms].u = uniform;
-         num_uniforms++;
-      }
-   }
-
-   qsort(sorted_uniforms, num_uniforms, sizeof(struct uniform_sort),
-         sort_uniforms);
-
-   for (i = 0; i < num_uniforms; i++) {
-      struct gl_uniform *uniform = sorted_uniforms[i].u;
-      int parameter_index = sorted_uniforms[i].pos;
-      const glsl_type *type = uniform->Type;
-      unsigned int size;
-
-      if (type->is_vector() ||
-          type->is_scalar()) {
-         size = type->vector_elements;
-      } else {
-         size = type_size(type) * 4;
-      }
-
-      gl_register_file file;
-      if (type->is_sampler() ||
-          (type->is_array() && type->fields.array->is_sampler())) {
-         file = PROGRAM_SAMPLER;
-      } else {
-         file = PROGRAM_UNIFORM;
-      }
-
-      GLint index = _mesa_lookup_parameter_index(prog->Parameters, -1,
-        					 uniform->Name);
-
-      if (index < 0) {
-         index = _mesa_add_parameter(prog->Parameters, file,
-        			     uniform->Name, size, type->gl_type,
-        			     NULL, NULL, 0x0);
-
-         /* Sampler uniform values are stored in prog->SamplerUnits,
-          * and the entry in that array is selected by this index we
-          * store in ParameterValues[].
-          */
-         if (file == PROGRAM_SAMPLER) {
-            for (unsigned int j = 0; j < size / 4; j++)
-               prog->Parameters->ParameterValues[index + j][0].f = next_sampler++;
-         }
-
-         /* The location chosen in the Parameters list here (returned
-          * from _mesa_add_uniform) has to match what the linker chose.
-          */
-         if (index != parameter_index) {
-            fail_link(shader_program, "Allocation of uniform `%s' to target "
-        	      "failed (%d vs %d)\n",
-        	      uniform->Name, index, parameter_index);
-         }
-      }
-   }
-
-   ralloc_free(sorted_uniforms);
+   if (v->shader_program != NULL)
+      _mesa_update_shader_textures_used(v->shader_program, prog);
 }
 
 static void
@@ -2714,119 +2918,12 @@ set_uniform_initializer(struct gl_context *ctx, void *mem_ctx,
         		      element_type->matrix_columns,
         		      element_type->vector_elements,
         		      loc, 1, GL_FALSE, (GLfloat *)values);
-         loc += element_type->matrix_columns;
       } else {
          _mesa_uniform(ctx, shader_program, loc, element_type->matrix_columns,
         	       values, element_type->gl_type);
-         loc += type_size(element_type);
       }
-   }
-}
 
-static void
-set_uniform_initializers(struct gl_context *ctx,
-        		 struct gl_shader_program *shader_program)
-{
-   void *mem_ctx = NULL;
-
-   for (unsigned int i = 0; i < MESA_SHADER_TYPES; i++) {
-      struct gl_shader *shader = shader_program->_LinkedShaders[i];
-
-      if (shader == NULL)
-         continue;
-
-      foreach_iter(exec_list_iterator, iter, *shader->ir) {
-         ir_instruction *ir = (ir_instruction *)iter.get();
-         ir_variable *var = ir->as_variable();
-
-         if (!var || var->mode != ir_var_uniform || !var->constant_value)
-            continue;
-
-         if (!mem_ctx)
-            mem_ctx = ralloc_context(NULL);
-
-         set_uniform_initializer(ctx, mem_ctx, shader_program, var->name,
-        			 var->type, var->constant_value);
-      }
-   }
-
-   ralloc_free(mem_ctx);
-}
-
-/*
- * Scan/rewrite program to remove reads of custom (output) registers.
- * The passed type has to be either PROGRAM_OUTPUT or PROGRAM_VARYING
- * (for vertex shaders).
- * In GLSL shaders, varying vars can be read and written.
- * On some hardware, trying to read an output register causes trouble.
- * So, rewrite the program to use a temporary register in this case.
- * 
- * Based on _mesa_remove_output_reads from programopt.c.
- */
-void
-glsl_to_tgsi_visitor::remove_output_reads(gl_register_file type)
-{
-   GLuint i;
-   GLint outputMap[VERT_RESULT_MAX];
-   GLint outputTypes[VERT_RESULT_MAX];
-   GLuint numVaryingReads = 0;
-   GLboolean usedTemps[MAX_TEMPS];
-   GLuint firstTemp = 0;
-
-   _mesa_find_used_registers(prog, PROGRAM_TEMPORARY,
-                             usedTemps, MAX_TEMPS);
-
-   assert(type == PROGRAM_VARYING || type == PROGRAM_OUTPUT);
-   assert(prog->Target == GL_VERTEX_PROGRAM_ARB || type != PROGRAM_VARYING);
-
-   for (i = 0; i < VERT_RESULT_MAX; i++)
-      outputMap[i] = -1;
-
-   /* look for instructions which read from varying vars */
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
-      const GLuint numSrc = num_inst_src_regs(inst->op);
-      GLuint j;
-      for (j = 0; j < numSrc; j++) {
-         if (inst->src[j].file == type) {
-            /* replace the read with a temp reg */
-            const GLuint var = inst->src[j].index;
-            if (outputMap[var] == -1) {
-               numVaryingReads++;
-               outputMap[var] = _mesa_find_free_register(usedTemps,
-                                                         MAX_TEMPS,
-                                                         firstTemp);
-               outputTypes[var] = inst->src[j].type;
-               firstTemp = outputMap[var] + 1;
-            }
-            inst->src[j].file = PROGRAM_TEMPORARY;
-            inst->src[j].index = outputMap[var];
-         }
-      }
-   }
-
-   if (numVaryingReads == 0)
-      return; /* nothing to be done */
-
-   /* look for instructions which write to the varying vars identified above */
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
-      if (inst->dst.file == type && outputMap[inst->dst.index] >= 0) {
-         /* change inst to write to the temp reg, instead of the varying */
-         inst->dst.file = PROGRAM_TEMPORARY;
-         inst->dst.index = outputMap[inst->dst.index];
-      }
-   }
-   
-   /* insert new MOV instructions at the end */
-   for (i = 0; i < VERT_RESULT_MAX; i++) {
-      if (outputMap[i] >= 0) {
-         /* MOV VAR[i], TEMP[tmp]; */
-         st_src_reg src = st_src_reg(PROGRAM_TEMPORARY, outputMap[i], outputTypes[i]);
-         st_dst_reg dst = st_dst_reg(type, WRITEMASK_XYZW, outputTypes[i]);
-         dst.index = i;
-         this->emit(NULL, TGSI_OPCODE_MOV, dst, src);
-      }
+      loc++;
    }
 }
 
@@ -2876,10 +2973,14 @@ get_src_arg_mask(st_dst_reg dst, st_src_reg src)
 void
 glsl_to_tgsi_visitor::simplify_cmp(void)
 {
-   unsigned tempWrites[MAX_TEMPS];
+   unsigned *tempWrites;
    unsigned outputWrites[MAX_PROGRAM_OUTPUTS];
 
-   memset(tempWrites, 0, sizeof(tempWrites));
+   tempWrites = new unsigned[MAX_TEMPS];
+   if (!tempWrites) {
+      return;
+   }
+   memset(tempWrites, 0, sizeof(unsigned) * MAX_TEMPS);
    memset(outputWrites, 0, sizeof(outputWrites));
 
    foreach_iter(exec_list_iterator, iter, this->instructions) {
@@ -2894,7 +2995,7 @@ glsl_to_tgsi_visitor::simplify_cmp(void)
           inst->op == TGSI_OPCODE_END ||
           inst->op == TGSI_OPCODE_ENDSUB ||
           inst->op == TGSI_OPCODE_RET) {
-         return;
+         break;
       }
 
       if (inst->dst.file == PROGRAM_OUTPUT) {
@@ -2919,6 +3020,8 @@ glsl_to_tgsi_visitor::simplify_cmp(void)
          inst->src[0] = inst->src[1];
       }
    }
+
+   delete [] tempWrites;
 }
 
 /* Replaces all references to a temporary register index with another index. */
@@ -3326,34 +3429,37 @@ glsl_to_tgsi_visitor::eliminate_dead_code_advanced(void)
       switch (inst->op) {
       case TGSI_OPCODE_BGNLOOP:
       case TGSI_OPCODE_ENDLOOP:
+      case TGSI_OPCODE_CONT:
+      case TGSI_OPCODE_BRK:
          /* End of a basic block, clear the write array entirely.
-          * FIXME: This keeps us from killing dead code when the writes are
+          *
+          * This keeps us from killing dead code when the writes are
           * on either side of a loop, even when the register isn't touched
-          * inside the loop.
+          * inside the loop.  However, glsl_to_tgsi_visitor doesn't seem to emit
+          * dead code of this type, so it shouldn't make a difference as long as
+          * the dead code elimination pass in the GLSL compiler does its job.
           */
          memset(writes, 0, sizeof(*writes) * this->next_temp * 4);
          break;
 
       case TGSI_OPCODE_ENDIF:
-         --level;
-         break;
-
       case TGSI_OPCODE_ELSE:
-         /* Clear all channels written inside the preceding if block from the
-          * write array, but leave those that were not touched.
-          *
-          * FIXME: This destroys opportunities to remove dead code inside of
-          * IF blocks that are followed by an ELSE block.
+         /* Promote the recorded level of all channels written inside the
+          * preceding if or else block to the level above the if/else block.
           */
          for (int r = 0; r < this->next_temp; r++) {
             for (int c = 0; c < 4; c++) {
                if (!writes[4 * r + c])
         	         continue;
 
-               if (write_level[4 * r + c] >= level)
-        	         writes[4 * r + c] = NULL;
+               if (write_level[4 * r + c] == level)
+        	         write_level[4 * r + c] = level-1;
             }
          }
+
+         if(inst->op == TGSI_OPCODE_ENDIF)
+            --level;
+         
          break;
 
       case TGSI_OPCODE_IF:
@@ -3364,7 +3470,7 @@ glsl_to_tgsi_visitor::eliminate_dead_code_advanced(void)
          /* Continuing the block, clear any channels from the write array that
           * are read by this instruction.
           */
-         for (int i = 0; i < 4; i++) {
+         for (unsigned i = 0; i < Elements(inst->src); i++) {
             if (inst->src[i].file == PROGRAM_TEMPORARY && inst->src[i].reladdr){
                /* Any temporary might be read, so no dead code elimination 
                 * across this instruction.
@@ -3426,7 +3532,7 @@ glsl_to_tgsi_visitor::eliminate_dead_code_advanced(void)
       
       if (!inst->dead_mask || !inst->dst.writemask)
          continue;
-      else if (inst->dead_mask == inst->dst.writemask) {
+      else if ((inst->dst.writemask & ~inst->dead_mask) == 0) {
          iter.remove();
          delete inst;
          removed++;
@@ -3531,15 +3637,17 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
    /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. */
    v->ctx = original->ctx;
    v->prog = prog;
+   v->shader_program = NULL;
    v->glsl_version = original->glsl_version;
+   v->native_integers = original->native_integers;
    v->options = original->options;
    v->next_temp = original->next_temp;
    v->num_address_regs = original->num_address_regs;
    v->samplers_used = prog->SamplersUsed = original->samplers_used;
    v->indirect_addr_temps = original->indirect_addr_temps;
    v->indirect_addr_consts = original->indirect_addr_consts;
-   _mesa_free_parameter_list(v->immediates);
-   v->immediates = _mesa_clone_parameter_list(original->immediates);
+   memcpy(&v->immediates, &original->immediates, sizeof(v->immediates));
+   v->num_immediates = original->num_immediates;
 
    /*
     * Get initial pixel color from the texture.
@@ -3552,7 +3660,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
    inst->sampler = 0;
    inst->tex_target = TEXTURE_2D_INDEX;
 
-   prog->InputsRead |= (1 << FRAG_ATTRIB_TEX0);
+   prog->InputsRead |= FRAG_BIT_TEX0;
    prog->SamplersUsed |= (1 << 0); /* mark sampler 0 as used */
    v->samplers_used |= (1 << 0);
 
@@ -3609,6 +3717,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
     * new visitor. */
    foreach_iter(exec_list_iterator, iter, original->instructions) {
       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
+      glsl_to_tgsi_instruction *newinst;
       st_src_reg src_regs[3];
 
       if (inst->dst.file == PROGRAM_OUTPUT)
@@ -3623,17 +3732,16 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
             src_regs[i].index = src0.index;
          }
          else if (src_regs[i].file == PROGRAM_INPUT)
-            prog->InputsRead |= (1 << src_regs[i].index);
+            prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
       }
 
-      v->emit(NULL, inst->op, inst->dst, src_regs[0], src_regs[1], src_regs[2]);
+      newinst = v->emit(NULL, inst->op, inst->dst, src_regs[0], src_regs[1], src_regs[2]);
+      newinst->tex_target = inst->tex_target;
    }
 
    /* Make modifications to fragment program info. */
    prog->Parameters = _mesa_combine_parameter_lists(params,
                                                     original->prog->Parameters);
-   prog->Attributes = _mesa_clone_parameter_list(original->prog->Attributes);
-   prog->Varying = _mesa_clone_parameter_list(original->prog->Varying);
    _mesa_free_parameter_list(params);
    count_resources(v, prog);
    fp->glsl_to_tgsi = v;
@@ -3660,15 +3768,17 @@ get_bitmap_visitor(struct st_fragment_program *fp,
    /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. */
    v->ctx = original->ctx;
    v->prog = prog;
+   v->shader_program = NULL;
    v->glsl_version = original->glsl_version;
+   v->native_integers = original->native_integers;
    v->options = original->options;
    v->next_temp = original->next_temp;
    v->num_address_regs = original->num_address_regs;
    v->samplers_used = prog->SamplersUsed = original->samplers_used;
    v->indirect_addr_temps = original->indirect_addr_temps;
    v->indirect_addr_consts = original->indirect_addr_consts;
-   _mesa_free_parameter_list(v->immediates);
-   v->immediates = _mesa_clone_parameter_list(original->immediates);
+   memcpy(&v->immediates, &original->immediates, sizeof(v->immediates));
+   v->num_immediates = original->num_immediates;
 
    /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */
    coord = st_src_reg(PROGRAM_INPUT, FRAG_ATTRIB_TEX0, glsl_type::vec2_type);
@@ -3678,7 +3788,7 @@ get_bitmap_visitor(struct st_fragment_program *fp,
    inst->sampler = samplerIndex;
    inst->tex_target = TEXTURE_2D_INDEX;
 
-   prog->InputsRead |= (1 << FRAG_ATTRIB_TEX0);
+   prog->InputsRead |= FRAG_BIT_TEX0;
    prog->SamplersUsed |= (1 << samplerIndex); /* mark sampler as used */
    v->samplers_used |= (1 << samplerIndex);
 
@@ -3692,6 +3802,7 @@ get_bitmap_visitor(struct st_fragment_program *fp,
     * new visitor. */
    foreach_iter(exec_list_iterator, iter, original->instructions) {
       glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *)iter.get();
+      glsl_to_tgsi_instruction *newinst;
       st_src_reg src_regs[3];
 
       if (inst->dst.file == PROGRAM_OUTPUT)
@@ -3700,16 +3811,15 @@ get_bitmap_visitor(struct st_fragment_program *fp,
       for (int i=0; i<3; i++) {
          src_regs[i] = inst->src[i];
          if (src_regs[i].file == PROGRAM_INPUT)
-            prog->InputsRead |= (1 << src_regs[i].index);
+            prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
       }
 
-      v->emit(NULL, inst->op, inst->dst, src_regs[0], src_regs[1], src_regs[2]);
+      newinst = v->emit(NULL, inst->op, inst->dst, src_regs[0], src_regs[1], src_regs[2]);
+      newinst->tex_target = inst->tex_target;
    }
 
    /* Make modifications to fragment program info. */
    prog->Parameters = _mesa_clone_parameter_list(original->prog->Parameters);
-   prog->Attributes = _mesa_clone_parameter_list(original->prog->Attributes);
-   prog->Varying = _mesa_clone_parameter_list(original->prog->Varying);
    count_resources(v, prog);
    fp->glsl_to_tgsi = v;
 }
@@ -3735,12 +3845,6 @@ struct st_translate {
    struct ureg_src samplers[PIPE_MAX_SAMPLERS];
    struct ureg_src systemValues[SYSTEM_VALUE_MAX];
 
-   /* Extra info for handling point size clamping in vertex shader */
-   struct ureg_dst pointSizeResult; /**< Actual point size output register */
-   struct ureg_src pointSizeConst;  /**< Point size range constant register */
-   GLint pointSizeOutIndex;         /**< Temp point size output register */
-   GLboolean prevInstWrotePointSize;
-
    const GLuint *inputMapping;
    const GLuint *outputMapping;
 
@@ -3768,6 +3872,7 @@ struct st_translate {
 /** Map Mesa's SYSTEM_VALUE_x to TGSI_SEMANTIC_x */
 static unsigned mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
    TGSI_SEMANTIC_FACE,
+   TGSI_SEMANTIC_VERTEXID,
    TGSI_SEMANTIC_INSTANCEID
 };
 
@@ -3822,32 +3927,20 @@ static void set_insn_start(struct st_translate *t, unsigned start)
  */
 static struct ureg_src
 emit_immediate(struct st_translate *t,
-               struct gl_program_parameter_list *params,
-               int index)
+               gl_constant_value values[4],
+               int type, int size)
 {
    struct ureg_program *ureg = t->ureg;
 
-   switch(params->Parameters[index].DataType)
+   switch(type)
    {
    case GL_FLOAT:
-   case GL_FLOAT_VEC2:
-   case GL_FLOAT_VEC3:
-   case GL_FLOAT_VEC4:
-      return ureg_DECL_immediate(ureg, (float *)params->ParameterValues[index], 4);
+      return ureg_DECL_immediate(ureg, &values[0].f, size);
    case GL_INT:
-   case GL_INT_VEC2:
-   case GL_INT_VEC3:
-   case GL_INT_VEC4:
-      return ureg_DECL_immediate_int(ureg, (int *)params->ParameterValues[index], 4);
+      return ureg_DECL_immediate_int(ureg, &values[0].i, size);
    case GL_UNSIGNED_INT:
-   case GL_UNSIGNED_INT_VEC2:
-   case GL_UNSIGNED_INT_VEC3:
-   case GL_UNSIGNED_INT_VEC4:
    case GL_BOOL:
-   case GL_BOOL_VEC2:
-   case GL_BOOL_VEC3:
-   case GL_BOOL_VEC4:
-      return ureg_DECL_immediate_uint(ureg, (unsigned *)params->ParameterValues[index], 4);
+      return ureg_DECL_immediate_uint(ureg, &values[0].u, size);
    default:
       assert(!"should not get here - type must be float, int, uint, or bool");
       return ureg_src_undef();
@@ -3868,14 +3961,11 @@ dst_register(struct st_translate *t,
 
    case PROGRAM_TEMPORARY:
       if (ureg_dst_is_undef(t->temps[index]))
-         t->temps[index] = ureg_DECL_temporary(t->ureg);
+         t->temps[index] = ureg_DECL_local_temporary(t->ureg);
 
       return t->temps[index];
 
    case PROGRAM_OUTPUT:
-      if (t->procType == TGSI_PROCESSOR_VERTEX && index == VERT_RESULT_PSIZ)
-         t->prevInstWrotePointSize = GL_TRUE;
-
       if (t->procType == TGSI_PROCESSOR_VERTEX)
          assert(index < VERT_RESULT_MAX);
       else if (t->procType == TGSI_PROCESSOR_FRAGMENT)
@@ -3912,7 +4002,7 @@ src_register(struct st_translate *t,
       assert(index >= 0);
       assert(index < Elements(t->temps));
       if (ureg_dst_is_undef(t->temps[index]))
-         t->temps[index] = ureg_DECL_temporary(t->ureg);
+         t->temps[index] = ureg_DECL_local_temporary(t->ureg);
       return ureg_src(t->temps[index]);
 
    case PROGRAM_NAMED_PARAM:
@@ -3958,7 +4048,7 @@ src_register(struct st_translate *t,
 static struct ureg_dst
 translate_dst(struct st_translate *t,
               const st_dst_reg *dst_reg,
-              bool saturate)
+              bool saturate, bool clamp_color)
 {
    struct ureg_dst dst = dst_register(t, 
                                       dst_reg->file,
@@ -3968,6 +4058,27 @@ translate_dst(struct st_translate *t,
    
    if (saturate)
       dst = ureg_saturate(dst);
+   else if (clamp_color && dst_reg->file == PROGRAM_OUTPUT) {
+      /* Clamp colors for ARB_color_buffer_float. */
+      switch (t->procType) {
+      case TGSI_PROCESSOR_VERTEX:
+         /* XXX if the geometry shader is present, this must be done there
+          * instead of here. */
+         if (dst_reg->index == VERT_RESULT_COL0 ||
+             dst_reg->index == VERT_RESULT_COL1 ||
+             dst_reg->index == VERT_RESULT_BFC0 ||
+             dst_reg->index == VERT_RESULT_BFC1) {
+            dst = ureg_saturate(dst);
+         }
+         break;
+
+      case TGSI_PROCESSOR_FRAGMENT:
+         if (dst_reg->index >= FRAG_RESULT_COLOR) {
+            dst = ureg_saturate(dst);
+         }
+         break;
+      }
+   }
 
    if (dst_reg->reladdr != NULL)
       dst = ureg_dst_indirect(dst, ureg_src(t->address[0]));
@@ -4018,14 +4129,34 @@ translate_src(struct st_translate *t, const st_src_reg *src_reg)
    return src;
 }
 
+static struct tgsi_texture_offset
+translate_tex_offset(struct st_translate *t,
+                     const struct tgsi_texture_offset *in_offset)
+{
+   struct tgsi_texture_offset offset;
+
+   assert(in_offset->File == PROGRAM_IMMEDIATE);
+
+   offset.File = TGSI_FILE_IMMEDIATE;
+   offset.Index = in_offset->Index;
+   offset.SwizzleX = in_offset->SwizzleX;
+   offset.SwizzleY = in_offset->SwizzleY;
+   offset.SwizzleZ = in_offset->SwizzleZ;
+
+   return offset;
+}
+
 static void
 compile_tgsi_instruction(struct st_translate *t,
-                         const struct glsl_to_tgsi_instruction *inst)
+                         const glsl_to_tgsi_instruction *inst,
+                         bool clamp_dst_color_output)
 {
    struct ureg_program *ureg = t->ureg;
    GLuint i;
    struct ureg_dst dst[1];
    struct ureg_src src[4];
+   struct tgsi_texture_offset texoffsets[MAX_GLSL_TEXTURE_OFFSET];
+
    unsigned num_dst;
    unsigned num_src;
 
@@ -4035,7 +4166,8 @@ compile_tgsi_instruction(struct st_translate *t,
    if (num_dst) 
       dst[0] = translate_dst(t, 
                              &inst->dst,
-                             inst->saturate);
+                             inst->saturate,
+                             clamp_dst_color_output);
 
    for (i = 0; i < num_src; i++) 
       src[i] = translate_src(t, &inst->src[i]);
@@ -4059,11 +4191,17 @@ compile_tgsi_instruction(struct st_translate *t,
    case TGSI_OPCODE_TXD:
    case TGSI_OPCODE_TXL:
    case TGSI_OPCODE_TXP:
+   case TGSI_OPCODE_TXQ:
+   case TGSI_OPCODE_TXF:
       src[num_src++] = t->samplers[inst->sampler];
+      for (i = 0; i < inst->tex_offset_num_offset; i++) {
+         texoffsets[i] = translate_tex_offset(t, &inst->tex_offsets[i]);
+      }
       ureg_tex_insn(ureg,
                     inst->op,
                     dst, num_dst, 
-                    translate_texture_target(inst->tex_target, inst->tex_shadow),
+                    st_translate_texture_target(inst->tex_target, inst->tex_shadow),
+                    texoffsets, inst->tex_offset_num_offset,
                     src, num_src);
       return;
 
@@ -4082,37 +4220,15 @@ compile_tgsi_instruction(struct st_translate *t,
 }
 
 /**
- * Emit the TGSI instructions to adjust the WPOS pixel center convention
- * Basically, add (adjX, adjY) to the fragment position.
- */
-static void
-emit_adjusted_wpos(struct st_translate *t,
-                   const struct gl_program *program,
-                   float adjX, float adjY)
-{
-   struct ureg_program *ureg = t->ureg;
-   struct ureg_dst wpos_temp = ureg_DECL_temporary(ureg);
-   struct ureg_src wpos_input = t->inputs[t->inputMapping[FRAG_ATTRIB_WPOS]];
-
-   /* Note that we bias X and Y and pass Z and W through unchanged.
-    * The shader might also use gl_FragCoord.w and .z.
-    */
-   ureg_ADD(ureg, wpos_temp, wpos_input,
-            ureg_imm4f(ureg, adjX, adjY, 0.0f, 0.0f));
-
-   t->inputs[t->inputMapping[FRAG_ATTRIB_WPOS]] = ureg_src(wpos_temp);
-}
-
-
-/**
- * Emit the TGSI instructions for inverting the WPOS y coordinate.
+ * Emit the TGSI instructions for inverting and adjusting WPOS.
  * This code is unavoidable because it also depends on whether
  * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
  */
 static void
-emit_wpos_inversion(struct st_translate *t,
-                    const struct gl_program *program,
-                    bool invert)
+emit_wpos_adjustment( struct st_translate *t,
+                      const struct gl_program *program,
+                      boolean invert,
+                      GLfloat adjX, GLfloat adjY[2])
 {
    struct ureg_program *ureg = t->ureg;
 
@@ -4131,35 +4247,55 @@ emit_wpos_inversion(struct st_translate *t,
    unsigned wposTransConst = _mesa_add_state_reference(program->Parameters,
                                                        wposTransformState);
 
-   struct ureg_src wpostrans = ureg_DECL_constant(ureg, wposTransConst);
-   struct ureg_dst wpos_temp;
+   struct ureg_src wpostrans = ureg_DECL_constant( ureg, wposTransConst );
+   struct ureg_dst wpos_temp = ureg_DECL_temporary( ureg );
    struct ureg_src wpos_input = t->inputs[t->inputMapping[FRAG_ATTRIB_WPOS]];
 
-   /* MOV wpos_temp, input[wpos]
-    */
-   if (wpos_input.File == TGSI_FILE_TEMPORARY)
-      wpos_temp = ureg_dst(wpos_input);
-   else {
-      wpos_temp = ureg_DECL_temporary(ureg);
-      ureg_MOV(ureg, wpos_temp, wpos_input);
+   /* First, apply the coordinate shift: */
+   if (adjX || adjY[0] || adjY[1]) {
+      if (adjY[0] != adjY[1]) {
+         /* Adjust the y coordinate by adjY[1] or adjY[0] respectively
+          * depending on whether inversion is actually going to be applied
+          * or not, which is determined by testing against the inversion
+          * state variable used below, which will be either +1 or -1.
+          */
+         struct ureg_dst adj_temp = ureg_DECL_local_temporary(ureg);
+
+         ureg_CMP(ureg, adj_temp,
+                  ureg_scalar(wpostrans, invert ? 2 : 0),
+                  ureg_imm4f(ureg, adjX, adjY[0], 0.0f, 0.0f),
+                  ureg_imm4f(ureg, adjX, adjY[1], 0.0f, 0.0f));
+         ureg_ADD(ureg, wpos_temp, wpos_input, ureg_src(adj_temp));
+      } else {
+         ureg_ADD(ureg, wpos_temp, wpos_input,
+                  ureg_imm4f(ureg, adjX, adjY[0], 0.0f, 0.0f));
+      }
+      wpos_input = ureg_src(wpos_temp);
+   } else {
+      /* MOV wpos_temp, input[wpos]
+       */
+      ureg_MOV( ureg, wpos_temp, wpos_input );
    }
 
+   /* Now the conditional y flip: STATE_FB_WPOS_Y_TRANSFORM.xy/zw will be
+    * inversion/identity, or the other way around if we're drawing to an FBO.
+    */
    if (invert) {
       /* MAD wpos_temp.y, wpos_input, wpostrans.xxxx, wpostrans.yyyy
        */
-      ureg_MAD(ureg,
-               ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y),
-               wpos_input,
-               ureg_scalar(wpostrans, 0),
-               ureg_scalar(wpostrans, 1));
+      ureg_MAD( ureg,
+                ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y ),
+                wpos_input,
+                ureg_scalar(wpostrans, 0),
+                ureg_scalar(wpostrans, 1));
    } else {
       /* MAD wpos_temp.y, wpos_input, wpostrans.zzzz, wpostrans.wwww
        */
-      ureg_MAD(ureg,
-               ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y),
-               wpos_input,
-               ureg_scalar(wpostrans, 2),
-               ureg_scalar(wpostrans, 3));
+      ureg_MAD( ureg,
+                ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y ),
+                wpos_input,
+                ureg_scalar(wpostrans, 2),
+                ureg_scalar(wpostrans, 3));
    }
 
    /* Use wpos_temp as position input from here on:
@@ -4180,8 +4316,37 @@ emit_wpos(struct st_context *st,
    const struct gl_fragment_program *fp =
       (const struct gl_fragment_program *) program;
    struct pipe_screen *pscreen = st->pipe->screen;
+   GLfloat adjX = 0.0f;
+   GLfloat adjY[2] = { 0.0f, 0.0f };
    boolean invert = FALSE;
 
+   /* Query the pixel center conventions supported by the pipe driver and set
+    * adjX, adjY to help out if it cannot handle the requested one internally.
+    *
+    * The bias of the y-coordinate depends on whether y-inversion takes place
+    * (adjY[1]) or not (adjY[0]), which is in turn dependent on whether we are
+    * drawing to an FBO (causes additional inversion), and whether the the pipe
+    * driver origin and the requested origin differ (the latter condition is
+    * stored in the 'invert' variable).
+    *
+    * For height = 100 (i = integer, h = half-integer, l = lower, u = upper):
+    *
+    * center shift only:
+    * i -> h: +0.5
+    * h -> i: -0.5
+    *
+    * inversion only:
+    * l,i -> u,i: ( 0.0 + 1.0) * -1 + 100 = 99
+    * l,h -> u,h: ( 0.5 + 0.0) * -1 + 100 = 99.5
+    * u,i -> l,i: (99.0 + 1.0) * -1 + 100 = 0
+    * u,h -> l,h: (99.5 + 0.0) * -1 + 100 = 0.5
+    *
+    * inversion and center shift:
+    * l,i -> u,h: ( 0.0 + 0.5) * -1 + 100 = 99.5
+    * l,h -> u,i: ( 0.5 + 0.5) * -1 + 100 = 99
+    * u,i -> l,h: (99.0 + 0.5) * -1 + 100 = 0.5
+    * u,h -> l,i: (99.5 + 0.5) * -1 + 100 = 0
+    */
    if (fp->OriginUpperLeft) {
       /* Fragment shader wants origin in upper-left */
       if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT)) {
@@ -4209,12 +4374,17 @@ emit_wpos(struct st_context *st,
    
    if (fp->PixelCenterInteger) {
       /* Fragment shader wants pixel center integer */
-      if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER))
+      if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER)) {
          /* the driver supports pixel center integer */
+         adjY[1] = 1.0f;
          ureg_property_fs_coord_pixel_center(ureg, TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
-      else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER))
+      }
+      else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER)) {
          /* the driver supports pixel center half integer, need to bias X,Y */
-         emit_adjusted_wpos(t, program, 0.5f, invert ? 0.5f : -0.5f);
+         adjX = -0.5f;
+         adjY[0] = -0.5f;
+         adjY[1] = 0.5f;
+      }
       else
          assert(0);
    }
@@ -4225,8 +4395,8 @@ emit_wpos(struct st_context *st,
       }
       else if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER)) {
          /* the driver supports pixel center integer, need to bias X,Y */
+         adjX = adjY[0] = adjY[1] = 0.5f;
          ureg_property_fs_coord_pixel_center(ureg, TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
-         emit_adjusted_wpos(t, program, 0.5f, invert ? -0.5f : 0.5f);
       }
       else
          assert(0);
@@ -4234,7 +4404,7 @@ emit_wpos(struct st_context *st,
 
    /* we invert after adjustment so that we avoid the MOV to temporary,
     * and reuse the adjustment ADD instead */
-   emit_wpos_inversion(t, program, invert);
+   emit_wpos_adjustment(t, program, invert, adjX, adjY);
 }
 
 /**
@@ -4303,24 +4473,37 @@ st_translate_program(
    const GLuint outputMapping[],
    const ubyte outputSemanticName[],
    const ubyte outputSemanticIndex[],
-   boolean passthrough_edgeflags)
+   boolean passthrough_edgeflags,
+   boolean clamp_color)
 {
-   struct st_translate translate, *t;
+   struct st_translate *t;
    unsigned i;
    enum pipe_error ret = PIPE_OK;
 
    assert(numInputs <= Elements(t->inputs));
    assert(numOutputs <= Elements(t->outputs));
 
-   t = &translate;
+   t = CALLOC_STRUCT(st_translate);
+   if (!t) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
+      goto out;
+   }
+
    memset(t, 0, sizeof *t);
 
    t->procType = procType;
    t->inputMapping = inputMapping;
    t->outputMapping = outputMapping;
    t->ureg = ureg;
-   t->pointSizeOutIndex = -1;
-   t->prevInstWrotePointSize = GL_FALSE;
+
+   if (program->shader_program) {
+      for (i = 0; i < program->shader_program->NumUserUniformStorage; i++) {
+         struct gl_uniform_storage *const storage =
+               &program->shader_program->UniformStorage[i];
+
+         _mesa_uniform_detach_all_driver_storage(storage);
+      }
+   }
 
    /*
     * Declare input attributes.
@@ -4367,7 +4550,8 @@ st_translate_program(
             break;
          default:
             assert(!"fragment shader outputs must be POSITION/STENCIL/COLOR");
-            return PIPE_ERROR_BAD_INPUT;
+            ret = PIPE_ERROR_BAD_INPUT;
+            goto out;
          }
       }
    }
@@ -4393,27 +4577,16 @@ st_translate_program(
       }
 
       for (i = 0; i < numOutputs; i++) {
-         t->outputs[i] = ureg_DECL_output(ureg,
-                                          outputSemanticName[i],
-                                          outputSemanticIndex[i]);
-         if ((outputSemanticName[i] == TGSI_SEMANTIC_PSIZE) && proginfo->Id) {
-            /* Writing to the point size result register requires special
-             * handling to implement clamping.
-             */
-            static const gl_state_index pointSizeClampState[STATE_LENGTH]
-               = { STATE_INTERNAL, STATE_POINT_SIZE_IMPL_CLAMP, (gl_state_index)0, (gl_state_index)0, (gl_state_index)0 };
-               /* XXX: note we are modifying the incoming shader here!  Need to
-               * do this before emitting the constant decls below, or this
-               * will be missed.
-               */
-            unsigned pointSizeClampConst =
-               _mesa_add_state_reference(proginfo->Parameters,
-                                         pointSizeClampState);
-            struct ureg_dst psizregtemp = ureg_DECL_temporary(ureg);
-            t->pointSizeConst = ureg_DECL_constant(ureg, pointSizeClampConst);
-            t->pointSizeResult = t->outputs[i];
-            t->pointSizeOutIndex = i;
-            t->outputs[i] = psizregtemp;
+         if (outputSemanticName[i] == TGSI_SEMANTIC_CLIPDIST) {
+            int mask = ((1 << (program->num_clip_distances - 4*outputSemanticIndex[i])) - 1) & TGSI_WRITEMASK_XYZW;
+            t->outputs[i] = ureg_DECL_output_masked(ureg,
+                                                    outputSemanticName[i],
+                                                    outputSemanticIndex[i],
+                                                    mask);
+         } else {
+            t->outputs[i] = ureg_DECL_output(ureg,
+                                             outputSemanticName[i],
+                                             outputSemanticIndex[i]);
          }
       }
       if (passthrough_edgeflags)
@@ -4449,7 +4622,7 @@ st_translate_program(
        */
       for (i = 0; i < (unsigned)program->next_temp; i++) {
          /* XXX use TGSI_FILE_TEMPORARY_ARRAY when it's supported by ureg */
-         t->temps[i] = ureg_DECL_temporary(t->ureg);
+         t->temps[i] = ureg_DECL_local_temporary(t->ureg);
       }
    }
 
@@ -4483,7 +4656,10 @@ st_translate_program(
             if (program->indirect_addr_consts)
                t->constants[i] = ureg_DECL_constant(ureg, i);
             else
-               t->constants[i] = emit_immediate(t, proginfo->Parameters, i);
+               t->constants[i] = emit_immediate(t,
+                                                proginfo->Parameters->ParameterValues[i],
+                                                proginfo->Parameters->Parameters[i].DataType,
+                                                4);
             break;
          default:
             break;
@@ -4493,15 +4669,18 @@ st_translate_program(
    
    /* Emit immediate values.
     */
-   t->immediates = (struct ureg_src *)CALLOC(program->immediates->NumParameters * sizeof(struct ureg_src));
+   t->immediates = (struct ureg_src *)CALLOC(program->num_immediates * sizeof(struct ureg_src));
    if (t->immediates == NULL) {
       ret = PIPE_ERROR_OUT_OF_MEMORY;
       goto out;
    }
-   for (i = 0; i < program->immediates->NumParameters; i++) {
-      assert(program->immediates->Parameters[i].Type == PROGRAM_IMMEDIATE);
-      t->immediates[i] = emit_immediate(t, program->immediates, i);
+   i = 0;
+   foreach_iter(exec_list_iterator, iter, program->immediates) {
+      immediate_storage *imm = (immediate_storage *)iter.get();
+      assert(i < program->num_immediates);
+      t->immediates[i++] = emit_immediate(t, imm->values, imm->type, imm->size);
    }
+   assert(i == program->num_immediates);
 
    /* texture samplers */
    for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
@@ -4514,26 +4693,8 @@ st_translate_program(
     */
    foreach_iter(exec_list_iterator, iter, program->instructions) {
       set_insn_start(t, ureg_get_instruction_number(ureg));
-      compile_tgsi_instruction(t, (glsl_to_tgsi_instruction *)iter.get());
-
-      if (t->prevInstWrotePointSize && proginfo->Id) {
-         /* The previous instruction wrote to the (fake) vertex point size
-          * result register.  Now we need to clamp that value to the min/max
-          * point size range, putting the result into the real point size
-          * register.
-          * Note that we can't do this easily at the end of program due to
-          * possible early return.
-          */
-         set_insn_start(t, ureg_get_instruction_number(ureg));
-         ureg_MAX(t->ureg,
-                  ureg_writemask(t->outputs[t->pointSizeOutIndex], WRITEMASK_X),
-                  ureg_src(t->outputs[t->pointSizeOutIndex]),
-                  ureg_swizzle(t->pointSizeConst, 1,1,1,1));
-         ureg_MIN(t->ureg, ureg_writemask(t->pointSizeResult, WRITEMASK_X),
-                  ureg_src(t->outputs[t->pointSizeOutIndex]),
-                  ureg_swizzle(t->pointSizeConst, 2,2,2,2));
-      }
-      t->prevInstWrotePointSize = GL_FALSE;
+      compile_tgsi_instruction(t, (glsl_to_tgsi_instruction *)iter.get(),
+                               clamp_color);
    }
 
    /* Fix up all emitted labels:
@@ -4543,14 +4704,32 @@ st_translate_program(
                        t->insn[t->labels[i].branch_target]);
    }
 
+   if (program->shader_program) {
+      /* This has to be done last.  Any operation the can cause
+       * prog->ParameterValues to get reallocated (e.g., anything that adds a
+       * program constant) has to happen before creating this linkage.
+       */
+      for (unsigned i = 0; i < MESA_SHADER_TYPES; i++) {
+         if (program->shader_program->_LinkedShaders[i] == NULL)
+            continue;
+
+         _mesa_associate_uniform_storage(ctx, program->shader_program,
+               program->shader_program->_LinkedShaders[i]->Program->Parameters);
+      }
+   }
+
 out:
-   FREE(t->insn);
-   FREE(t->labels);
-   FREE(t->constants);
-   FREE(t->immediates);
+   if (t) {
+      FREE(t->insn);
+      FREE(t->labels);
+      FREE(t->constants);
+      FREE(t->immediates);
+
+      if (t->error) {
+         debug_printf("%s: translate error flag set\n", __FUNCTION__);
+      }
 
-   if (t->error) {
-      debug_printf("%s: translate error flag set\n", __FUNCTION__);
+      FREE(t);
    }
 
    return ret;
@@ -4564,7 +4743,8 @@ out:
 static struct gl_program *
 get_mesa_program(struct gl_context *ctx,
                  struct gl_shader_program *shader_program,
-        	 struct gl_shader *shader)
+                 struct gl_shader *shader,
+                 int num_clip_distances)
 {
    glsl_to_tgsi_visitor* v = new glsl_to_tgsi_visitor();
    struct gl_program *prog;
@@ -4598,15 +4778,19 @@ get_mesa_program(struct gl_context *ctx,
    if (!prog)
       return NULL;
    prog->Parameters = _mesa_new_parameter_list();
-   prog->Varying = _mesa_new_parameter_list();
-   prog->Attributes = _mesa_new_parameter_list();
    v->ctx = ctx;
    v->prog = prog;
    v->shader_program = shader_program;
    v->options = options;
    v->glsl_version = ctx->Const.GLSLVersion;
+   v->native_integers = ctx->Const.NativeIntegers;
+   v->num_clip_distances = num_clip_distances;
+
+   _mesa_generate_parameters_list_for_uniforms(shader_program, shader,
+					       prog->Parameters);
 
-   add_uniforms_to_parameters_list(shader_program, shader, prog);
+   /* Remove reads from output registers. */
+   lower_output_reads(shader->ir);
 
    /* Emit intermediate IR for main(). */
    visit_exec_list(shader->ir, v);
@@ -4654,11 +4838,6 @@ get_mesa_program(struct gl_context *ctx,
    }
 #endif
 
-   /* Remove reads to output registers, and to varyings in vertex shaders. */
-   v->remove_output_reads(PROGRAM_OUTPUT);
-   if (target == GL_VERTEX_PROGRAM_ARB)
-      v->remove_output_reads(PROGRAM_VARYING);
-   
    /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. */
    v->simplify_cmp();
    v->copy_propagate();
@@ -4685,18 +4864,26 @@ get_mesa_program(struct gl_context *ctx,
       _mesa_print_ir(shader->ir, NULL);
       printf("\n");
       printf("\n");
+      fflush(stdout);
    }
 
    prog->Instructions = NULL;
    prog->NumInstructions = 0;
 
-   do_set_program_inouts(shader->ir, prog);
+   do_set_program_inouts(shader->ir, prog, shader->Type == GL_FRAGMENT_SHADER);
    count_resources(v, prog);
 
-   check_resources(ctx, shader_program, v, prog);
-
    _mesa_reference_program(ctx, &shader->Program, prog);
    
+   /* This has to be done last.  Any operation the can cause
+    * prog->ParameterValues to get reallocated (e.g., anything that adds a
+    * program constant) has to happen before creating this linkage.
+    */
+   _mesa_associate_uniform_storage(ctx, shader_program, prog->Parameters);
+   if (!shader_program->LinkStatus) {
+      return NULL;
+   }
+
    struct st_vertex_program *stvp;
    struct st_fragment_program *stfp;
    struct st_geometry_program *stgp;
@@ -4722,6 +4909,25 @@ get_mesa_program(struct gl_context *ctx,
    return prog;
 }
 
+/**
+ * Searches through the IR for a declaration of gl_ClipDistance and returns the
+ * declared size of the gl_ClipDistance array.  Returns 0 if gl_ClipDistance is
+ * not declared in the IR.
+ */
+int get_clip_distance_size(exec_list *ir)
+{
+   foreach_iter (exec_list_iterator, iter, *ir) {
+      ir_instruction *inst = (ir_instruction *)iter.get();
+      ir_variable *var = inst->as_variable();
+      if (var == NULL) continue;
+      if (!strcmp(var->name, "gl_ClipDistance")) {
+         return var->type->length;
+      }
+   }
+   
+   return 0;
+}
+
 extern "C" {
 
 struct gl_shader *
@@ -4760,6 +4966,7 @@ st_new_shader_program(struct gl_context *ctx, GLuint name)
 GLboolean
 st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 {
+   int num_clip_distances[MESA_SHADER_TYPES];
    assert(prog->LinkStatus);
 
    for (unsigned i = 0; i < MESA_SHADER_TYPES; i++) {
@@ -4771,25 +4978,38 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
       const struct gl_shader_compiler_options *options =
             &ctx->ShaderCompilerOptions[_mesa_shader_type_to_index(prog->_LinkedShaders[i]->Type)];
 
+      /* We have to determine the length of the gl_ClipDistance array before
+       * the array is lowered to two vec4s by lower_clip_distance().
+       */
+      num_clip_distances[i] = get_clip_distance_size(ir);
+
       do {
+         unsigned what_to_lower = MOD_TO_FRACT | DIV_TO_MUL_RCP |
+            EXP_TO_EXP2 | LOG_TO_LOG2;
+         if (options->EmitNoPow)
+            what_to_lower |= POW_TO_EXP2;
+         if (!ctx->Const.NativeIntegers)
+            what_to_lower |= INT_DIV_TO_MUL_RCP;
+
          progress = false;
 
          /* Lowering */
          do_mat_op_to_vec(ir);
-         lower_instructions(ir, (MOD_TO_FRACT | DIV_TO_MUL_RCP | EXP_TO_EXP2
-        			 | LOG_TO_LOG2
-        			 | ((options->EmitNoPow) ? POW_TO_EXP2 : 0)));
+         lower_instructions(ir, what_to_lower);
 
          progress = do_lower_jumps(ir, true, true, options->EmitNoMainReturn, options->EmitNoCont, options->EmitNoLoops) || progress;
 
-         progress = do_common_optimization(ir, true, options->MaxUnrollIterations) || progress;
+         progress = do_common_optimization(ir, true, true,
+					   options->MaxUnrollIterations)
+	   || progress;
 
-         progress = lower_quadop_vector(ir, true) || progress;
+         progress = lower_quadop_vector(ir, false) || progress;
+         progress = lower_clip_distance(ir) || progress;
 
-         if (options->EmitNoIfs) {
+         if (options->MaxIfDepth == 0)
             progress = lower_discard(ir) || progress;
-            progress = lower_if_to_cond_assign(ir) || progress;
-         }
+
+         progress = lower_if_to_cond_assign(ir, options->MaxIfDepth) || progress;
 
          if (options->EmitNoNoise)
             progress = lower_noise(ir) || progress;
@@ -4819,32 +5039,22 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
       if (prog->_LinkedShaders[i] == NULL)
          continue;
 
-      linked_prog = get_mesa_program(ctx, prog, prog->_LinkedShaders[i]);
+      linked_prog = get_mesa_program(ctx, prog, prog->_LinkedShaders[i],
+                                     num_clip_distances[i]);
 
       if (linked_prog) {
-         bool ok = true;
-
-         switch (prog->_LinkedShaders[i]->Type) {
-         case GL_VERTEX_SHADER:
-            _mesa_reference_vertprog(ctx, &prog->VertexProgram,
-                                     (struct gl_vertex_program *)linked_prog);
-            ok = ctx->Driver.ProgramStringNotify(ctx, GL_VERTEX_PROGRAM_ARB,
-                                                 linked_prog);
-            break;
-         case GL_FRAGMENT_SHADER:
-            _mesa_reference_fragprog(ctx, &prog->FragmentProgram,
-                                     (struct gl_fragment_program *)linked_prog);
-            ok = ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_PROGRAM_ARB,
-                                                 linked_prog);
-            break;
-         case GL_GEOMETRY_SHADER:
-            _mesa_reference_geomprog(ctx, &prog->GeometryProgram,
-                                     (struct gl_geometry_program *)linked_prog);
-            ok = ctx->Driver.ProgramStringNotify(ctx, GL_GEOMETRY_PROGRAM_NV,
-                                                 linked_prog);
-            break;
-         }
-         if (!ok) {
+	 static const GLenum targets[] = {
+	    GL_VERTEX_PROGRAM_ARB,
+	    GL_FRAGMENT_PROGRAM_ARB,
+	    GL_GEOMETRY_PROGRAM_NV
+	 };
+
+	 _mesa_reference_program(ctx, &prog->_LinkedShaders[i]->Program,
+				 linked_prog);
+         if (!ctx->Driver.ProgramStringNotify(ctx, targets[i], linked_prog)) {
+	    _mesa_reference_program(ctx, &prog->_LinkedShaders[i]->Program,
+				    NULL);
+            _mesa_reference_program(ctx, &linked_prog, NULL);
             return GL_FALSE;
          }
       }
@@ -4855,53 +5065,28 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
    return GL_TRUE;
 }
 
-
-/**
- * Link a GLSL shader program.  Called via glLinkProgram().
- */
 void
-st_glsl_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
+st_translate_stream_output_info(glsl_to_tgsi_visitor *glsl_to_tgsi,
+                                const GLuint outputMapping[],
+                                struct pipe_stream_output_info *so)
 {
-   unsigned int i;
-
-   _mesa_clear_shader_program_data(ctx, prog);
-
-   prog->LinkStatus = GL_TRUE;
-
-   for (i = 0; i < prog->NumShaders; i++) {
-      if (!prog->Shaders[i]->CompileStatus) {
-         fail_link(prog, "linking with uncompiled shader");
-         prog->LinkStatus = GL_FALSE;
-      }
-   }
-
-   prog->Varying = _mesa_new_parameter_list();
-   _mesa_reference_vertprog(ctx, &prog->VertexProgram, NULL);
-   _mesa_reference_fragprog(ctx, &prog->FragmentProgram, NULL);
-   _mesa_reference_geomprog(ctx, &prog->GeometryProgram, NULL);
-
-   if (prog->LinkStatus) {
-      link_shaders(ctx, prog);
-   }
-
-   if (prog->LinkStatus) {
-      if (!ctx->Driver.LinkShader(ctx, prog)) {
-         prog->LinkStatus = GL_FALSE;
-      }
+   unsigned i;
+   struct gl_transform_feedback_info *info =
+      &glsl_to_tgsi->shader_program->LinkedTransformFeedback;
+
+   for (i = 0; i < info->NumOutputs; i++) {
+      so->output[i].register_index =
+         outputMapping[info->Outputs[i].OutputRegister];
+      so->output[i].start_component = info->Outputs[i].ComponentOffset;
+      so->output[i].num_components = info->Outputs[i].NumComponents;
+      so->output[i].output_buffer = info->Outputs[i].OutputBuffer;
+      so->output[i].dst_offset = info->Outputs[i].DstOffset;
    }
 
-   set_uniform_initializers(ctx, prog);
-
-   if (ctx->Shader.Flags & GLSL_DUMP) {
-      if (!prog->LinkStatus) {
-         printf("GLSL shader program %d failed to link\n", prog->Name);
-      }
-
-      if (prog->InfoLog && prog->InfoLog[0] != 0) {
-         printf("GLSL shader program %d info log:\n", prog->Name);
-         printf("%s\n", prog->InfoLog);
-      }
+   for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+      so->stride[i] = info->BufferStride[i];
    }
+   so->num_outputs = info->NumOutputs;
 }
 
 } /* extern "C" */