#include "st_program.h"
#include "st_mesa_to_tgsi.h"
#include "st_format.h"
+#include "st_glsl_types.h"
+#include "st_nir.h"
+#include "st_shader_cache.h"
+#include <algorithm>
#define PROGRAM_ANY_CONST ((1 << PROGRAM_STATE_VAR) | \
(1 << PROGRAM_CONSTANT) | \
static int swizzle_for_size(int size);
+/* Return a Mesa SWIZZLE_* mask covering the vector width of `type`,
+ * shifted up by `component` channels.  A NULL type, or a type that is
+ * not scalar/vector/matrix, is treated as a full 4-component vector.
+ * Arrays are stripped first so an array of vec2 swizzles like a vec2.
+ */
+static int swizzle_for_type(const glsl_type *type, int component = 0)
+{
+ unsigned num_elements = 4;
+
+ if (type) {
+ type = type->without_array();
+ if (type->is_scalar() || type->is_vector() || type->is_matrix())
+ num_elements = type->vector_elements;
+ }
+
+ int swizzle = swizzle_for_size(num_elements);
+ assert(num_elements + component <= 4);
+
+ /* Adding component * MAKE_SWIZZLE4(1,1,1,1) bumps every channel of
+ * the packed swizzle by the same offset; the assert above guarantees
+ * no channel is pushed past W. */
+ swizzle += component * MAKE_SWIZZLE4(1, 1, 1, 1);
+ return swizzle;
+}
+
/**
* This struct is a corresponding struct to TGSI ureg_src.
*/
class st_src_reg {
public:
- st_src_reg(gl_register_file file, int index, const glsl_type *type)
+ st_src_reg(gl_register_file file, int index, const glsl_type *type,
+ int component = 0, unsigned array_id = 0)
{
+ assert(file != PROGRAM_ARRAY || array_id != 0);
this->file = file;
this->index = index;
- if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
- this->swizzle = swizzle_for_size(type->vector_elements);
- else
- this->swizzle = SWIZZLE_XYZW;
+ this->swizzle = swizzle_for_type(type, component);
this->negate = 0;
+ this->abs = 0;
this->index2D = 0;
this->type = type ? type->base_type : GLSL_TYPE_ERROR;
this->reladdr = NULL;
this->reladdr2 = NULL;
this->has_index2 = false;
this->double_reg2 = false;
- this->array_id = 0;
+ this->array_id = array_id;
this->is_double_vertex_input = false;
}
- st_src_reg(gl_register_file file, int index, int type)
+ st_src_reg(gl_register_file file, int index, enum glsl_base_type type)
{
+ assert(file != PROGRAM_ARRAY); /* need array_id > 0 */
this->type = type;
this->file = file;
this->index = index;
this->index2D = 0;
this->swizzle = SWIZZLE_XYZW;
this->negate = 0;
+ this->abs = 0;
this->reladdr = NULL;
this->reladdr2 = NULL;
this->has_index2 = false;
this->is_double_vertex_input = false;
}
- st_src_reg(gl_register_file file, int index, int type, int index2D)
+ st_src_reg(gl_register_file file, int index, enum glsl_base_type type, int index2D)
{
+ assert(file != PROGRAM_ARRAY); /* need array_id > 0 */
this->type = type;
this->file = file;
this->index = index;
this->index2D = index2D;
this->swizzle = SWIZZLE_XYZW;
this->negate = 0;
+ this->abs = 0;
this->reladdr = NULL;
this->reladdr2 = NULL;
this->has_index2 = false;
this->index2D = 0;
this->swizzle = 0;
this->negate = 0;
+ this->abs = 0;
this->reladdr = NULL;
this->reladdr2 = NULL;
this->has_index2 = false;
explicit st_src_reg(st_dst_reg reg);
- gl_register_file file; /**< PROGRAM_* from Mesa */
- int index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
- int index2D;
- GLuint swizzle; /**< SWIZZLE_XYZWONEZERO swizzles from Mesa. */
- int negate; /**< NEGATE_XYZW mask from mesa */
- int type; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
- /** Register index should be offset by the integer in this reg. */
- st_src_reg *reladdr;
- st_src_reg *reladdr2;
- bool has_index2;
+ int16_t index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
+ int16_t index2D;
+ uint16_t swizzle; /**< SWIZZLE_XYZWONEZERO swizzles from Mesa. */
+ int negate:4; /**< NEGATE_XYZW mask from mesa */
+ unsigned abs:1;
+ enum glsl_base_type type:5; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
+ unsigned has_index2:1;
+ gl_register_file file:5; /**< PROGRAM_* from Mesa */
/*
* Is this the second half of a double register pair?
* currently used for input mapping only.
*/
- bool double_reg2;
- unsigned array_id;
- bool is_double_vertex_input;
+ unsigned double_reg2:1;
+ unsigned is_double_vertex_input:1;
+ unsigned array_id:10;
+
+ /** Register index should be offset by the integer in this reg. */
+ st_src_reg *reladdr;
+ st_src_reg *reladdr2;
+
+ /* Return a copy of this source register with the absolute-value
+ * modifier set and any negation cleared (|x| ignores the sign). */
+ st_src_reg get_abs()
+ {
+ st_src_reg reg = *this;
+ reg.negate = 0;
+ reg.abs = 1;
+ return reg;
+ }
};
class st_dst_reg {
public:
- st_dst_reg(gl_register_file file, int writemask, int type, int index)
+ st_dst_reg(gl_register_file file, int writemask, enum glsl_base_type type, int index)
{
+ assert(file != PROGRAM_ARRAY); /* need array_id > 0 */
this->file = file;
this->index = index;
this->index2D = 0;
this->array_id = 0;
}
- st_dst_reg(gl_register_file file, int writemask, int type)
+ st_dst_reg(gl_register_file file, int writemask, enum glsl_base_type type)
{
+ assert(file != PROGRAM_ARRAY); /* need array_id > 0 */
this->file = file;
this->index = 0;
this->index2D = 0;
explicit st_dst_reg(st_src_reg reg);
- gl_register_file file; /**< PROGRAM_* from Mesa */
- int index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
- int index2D;
- int writemask; /**< Bitfield of WRITEMASK_[XYZW] */
- int type; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
+ int16_t index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
+ int16_t index2D;
+ gl_register_file file:5; /**< PROGRAM_* from Mesa */
+ unsigned writemask:4; /**< Bitfield of WRITEMASK_[XYZW] */
+ enum glsl_base_type type:5; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
+ unsigned has_index2:1;
+ unsigned array_id:10;
+
/** Register index should be offset by the integer in this reg. */
st_src_reg *reladdr;
st_src_reg *reladdr2;
- bool has_index2;
- unsigned array_id;
};
st_src_reg::st_src_reg(st_dst_reg reg)
this->index = reg.index;
this->swizzle = SWIZZLE_XYZW;
this->negate = 0;
+ this->abs = 0;
this->reladdr = reg.reladdr;
this->index2D = reg.index2D;
this->reladdr2 = reg.reladdr2;
public:
DECLARE_RALLOC_CXX_OPERATORS(glsl_to_tgsi_instruction)
- unsigned op;
st_dst_reg dst[2];
st_src_reg src[4];
+ st_src_reg resource; /**< sampler or buffer register */
+ st_src_reg *tex_offsets;
+
/** Pointer to the ir source this tree came from for debugging */
ir_instruction *ir;
- GLboolean cond_update;
- bool saturate;
- st_src_reg sampler; /**< sampler register */
- int sampler_base;
- int sampler_array_size; /**< 1-based size of sampler array, 1 if not array */
- int tex_target; /**< One of TEXTURE_*_INDEX */
- glsl_base_type tex_type;
- GLboolean tex_shadow;
- unsigned image_format;
-
- st_src_reg tex_offsets[MAX_GLSL_TEXTURE_OFFSET];
- unsigned tex_offset_num_offset;
- int dead_mask; /**< Used in dead code elimination */
-
- st_src_reg buffer; /**< buffer register */
- unsigned buffer_access; /**< buffer access type */
-
- class function_entry *function; /* Set on TGSI_OPCODE_CAL or TGSI_OPCODE_BGNSUB */
+
+ unsigned op:8; /**< TGSI opcode */
+ unsigned saturate:1;
+ unsigned is_64bit_expanded:1;
+ unsigned sampler_base:5;
+ unsigned sampler_array_size:6; /**< 1-based size of sampler array, 1 if not array */
+ unsigned tex_target:4; /**< One of TEXTURE_*_INDEX */
+ glsl_base_type tex_type:5;
+ unsigned tex_shadow:1;
+ unsigned image_format:9;
+ unsigned tex_offset_num_offset:3;
+ unsigned dead_mask:4; /**< Used in dead code elimination */
+ unsigned buffer_access:3; /**< buffer access type */
+
const struct tgsi_opcode_info *info;
};
public:
variable_storage(ir_variable *var, gl_register_file file, int index,
unsigned array_id = 0)
- : file(file), index(index), var(var), array_id(array_id)
+ : file(file), index(index), component(0), var(var), array_id(array_id)
{
- /* empty */
+ /* TGSI ArrayIDs are 1-based, so a PROGRAM_ARRAY register must
+ * always carry a non-zero array_id. */
+ assert(file != PROGRAM_ARRAY || array_id != 0);
}
gl_register_file file;
int index;
+
+ /* Explicit component location. This is given in terms of the GLSL-style
+ * swizzles where each double is a single component, i.e. for 64-bit types
+ * it can only be 0 or 1.
+ */
+ int component;
ir_variable *var; /* variable that maps to this, if any */
unsigned array_id;
};
int type; /**< GL_DOUBLE, GL_FLOAT, GL_INT, GL_BOOL, or GL_UNSIGNED_INT */
};
-class function_entry : public exec_node {
-public:
- ir_function_signature *sig;
-
- /**
- * identifier of this function signature used by the program.
- *
- * At the point that TGSI instructions for function calls are
- * generated, we don't know the address of the first instruction of
- * the function body. So we make the BranchTarget that is called a
- * small integer and rewrite them during set_branchtargets().
- */
- int sig_id;
-
- /**
- * Pointer to first instruction of the function body.
- *
- * Set during function body emits after main() is processed.
- */
- glsl_to_tgsi_instruction *bgn_inst;
-
- /**
- * Index of the first instruction of the function body in actual TGSI.
- *
- * Set after conversion from glsl_to_tgsi_instruction to TGSI.
- */
- int inst;
-
- /** Storage for the return value. */
- st_src_reg return_reg;
-};
-
-static st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
-static st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
+/* Canonical "no operand" registers, used as default arguments for the
+ * unused source/destination slots of emit_asm(). */
+static const st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
+static const st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
-struct array_decl {
+/* Description of one shader input or output declaration (replaces the
+ * old array-only array_decl; now also covers non-array in/outs). */
+struct inout_decl {
unsigned mesa_index;
- unsigned array_id;
- unsigned array_size;
- unsigned array_type;
+ unsigned array_id; /* TGSI ArrayID; 1-based: 0 means not an array */
+ unsigned size;
+ unsigned interp_loc; /* presumably a TGSI_INTERPOLATE_LOC_* value — confirm at the decl emission site */
+ unsigned gs_out_streams; /* NOTE(review): looks like a GS output stream mask — confirm against caller */
+ enum glsl_interp_mode interp;
+ enum glsl_base_type base_type;
+ ubyte usage_mask; /* GLSL-style usage-mask, i.e. single bit per double */
};
-static unsigned
-find_array_type(struct array_decl *arrays, unsigned count, unsigned array_id)
+/* Linear search of `decls` (length `count`) for the declaration whose
+ * TGSI ArrayID matches `array_id`; returns NULL when absent.  ArrayIDs
+ * are 1-based, so 0 is never a valid query. */
+static struct inout_decl *
+find_inout_array(struct inout_decl *decls, unsigned count, unsigned array_id)
{
- unsigned i;
+ assert(array_id != 0);
- for (i = 0; i < count; i++) {
- struct array_decl *decl = &arrays[i];
+ for (unsigned i = 0; i < count; i++) {
+ struct inout_decl *decl = &decls[i];
if (array_id == decl->array_id) {
- return decl->array_type;
+ return decl;
}
}
+
+ return NULL;
+}
+
+/* Convenience wrapper: base type of the in/out array with the given
+ * ArrayID, or GLSL_TYPE_ERROR when array_id is 0 (not an array) or no
+ * matching declaration exists.  The early-out keeps the assert inside
+ * find_inout_array() from firing on non-array registers. */
+static enum glsl_base_type
+find_array_type(struct inout_decl *decls, unsigned count, unsigned array_id)
+{
+ if (!array_id)
+ return GLSL_TYPE_ERROR;
+ struct inout_decl *decl = find_inout_array(decls, count, array_id);
+ if (decl)
+ return decl->base_type;
return GLSL_TYPE_ERROR;
}
glsl_to_tgsi_visitor();
~glsl_to_tgsi_visitor();
- function_entry *current_function;
-
struct gl_context *ctx;
struct gl_program *prog;
struct gl_shader_program *shader_program;
- struct gl_shader *shader;
+ struct gl_linked_shader *shader;
struct gl_shader_compiler_options *options;
int next_temp;
unsigned max_num_arrays;
unsigned next_array;
- struct array_decl input_arrays[PIPE_MAX_SHADER_INPUTS];
+ struct inout_decl inputs[4 * PIPE_MAX_SHADER_INPUTS];
+ unsigned num_inputs;
unsigned num_input_arrays;
- struct array_decl output_arrays[PIPE_MAX_SHADER_OUTPUTS];
+ struct inout_decl outputs[4 * PIPE_MAX_SHADER_OUTPUTS];
+ unsigned num_outputs;
unsigned num_output_arrays;
int num_address_regs;
bool have_sqrt;
bool have_fma;
bool use_shared_memory;
+ bool has_tex_txf_lz;
variable_storage *find_variable_storage(ir_variable *var);
int add_constant(gl_register_file file, gl_constant_value values[8],
- int size, int datatype, GLuint *swizzle_out);
-
- function_entry *get_function_signature(ir_function_signature *sig);
+ int size, int datatype, uint16_t *swizzle_out);
st_src_reg get_temp(const glsl_type *type);
void reladdr_to_temp(ir_instruction *ir, st_src_reg *reg, int *num_reladdr);
st_src_reg st_src_reg_for_double(double val);
st_src_reg st_src_reg_for_float(float val);
st_src_reg st_src_reg_for_int(int val);
- st_src_reg st_src_reg_for_type(int type, int val);
+ st_src_reg st_src_reg_for_type(enum glsl_base_type type, int val);
/**
* \name Visit methods
virtual void visit(ir_barrier *);
/*@}*/
+ void visit_expression(ir_expression *, st_src_reg *) ATTRIBUTE_NOINLINE;
+
void visit_atomic_counter_intrinsic(ir_call *);
void visit_ssbo_intrinsic(ir_call *);
void visit_membar_intrinsic(ir_call *);
void visit_shared_intrinsic(ir_call *);
void visit_image_intrinsic(ir_call *);
+ void visit_generic_intrinsic(ir_call *, unsigned op);
st_src_reg result;
exec_list immediates;
unsigned num_immediates;
- /** List of function_entry */
- exec_list function_signatures;
- int next_signature_id;
-
/** List of glsl_to_tgsi_instruction */
exec_list instructions;
st_src_reg src2 = undef_src,
st_src_reg src3 = undef_src);
- unsigned get_opcode(ir_instruction *ir, unsigned op,
+ unsigned get_opcode(unsigned op,
st_dst_reg dst,
st_src_reg src0, st_src_reg src1);
void get_deref_offsets(ir_dereference *ir,
unsigned *array_size,
unsigned *base,
- unsigned *index,
- st_src_reg *reladdr);
- void calc_deref_offsets(ir_dereference *head,
- ir_dereference *tail,
+ uint16_t *index,
+ st_src_reg *reladdr,
+ bool opaque);
+ void calc_deref_offsets(ir_dereference *tail,
unsigned *array_elements,
- unsigned *base,
- unsigned *index,
+ uint16_t *index,
st_src_reg *indirect,
unsigned *location);
+ st_src_reg canonicalize_gather_offset(st_src_reg offset);
bool try_emit_mad(ir_expression *ir,
int mul_operand);
void rename_temp_registers(int num_renames, struct rename_reg_pair *renames);
void get_first_temp_read(int *first_reads);
+ void get_first_temp_write(int *first_writes);
void get_last_temp_read_first_temp_write(int *last_reads, int *first_writes);
void get_last_temp_write(int *last_writes);
{
va_list args;
va_start(args, fmt);
- ralloc_vasprintf_append(&prog->InfoLog, fmt, args);
+ ralloc_vasprintf_append(&prog->data->InfoLog, fmt, args);
va_end(args);
- prog->LinkStatus = GL_FALSE;
+ prog->data->LinkStatus = linking_failure;
}
static int
{
glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
int num_reladdr = 0, i, j;
- bool dst_is_double[2];
+ bool dst_is_64bit[2];
- op = get_opcode(ir, op, dst, src0, src1);
+ op = get_opcode(op, dst, src0, src1);
/* If we have to do relative addressing, we want to load the ARL
* reg directly for one of the regs, and preload the other reladdr
}
assert(num_reladdr == 0);
+ /* inst->op has only 8 bits. */
+ STATIC_ASSERT(TGSI_OPCODE_LAST <= 255);
+
inst->op = op;
inst->info = tgsi_get_opcode_info(op);
inst->dst[0] = dst;
inst->src[1] = src1;
inst->src[2] = src2;
inst->src[3] = src3;
+ inst->is_64bit_expanded = false;
inst->ir = ir;
inst->dead_mask = 0;
+ inst->tex_offsets = NULL;
+ inst->tex_offset_num_offset = 0;
+ inst->saturate = 0;
+ inst->tex_shadow = 0;
/* default to float, for paths where this is not initialized
* (since 0==UINT which is likely wrong):
*/
inst->tex_type = GLSL_TYPE_FLOAT;
- inst->function = NULL;
-
/* Update indirect addressing status used by TGSI */
if (dst.reladdr || dst.reladdr2) {
switch(dst.file) {
* GLSL [0].w -> TGSI [1].zw
*/
for (j = 0; j < 2; j++) {
- dst_is_double[j] = false;
- if (inst->dst[j].type == GLSL_TYPE_DOUBLE)
- dst_is_double[j] = true;
- else if (inst->dst[j].file == PROGRAM_OUTPUT && inst->dst[j].type == GLSL_TYPE_ARRAY) {
- unsigned type = find_array_type(this->output_arrays, this->num_output_arrays, inst->dst[j].array_id);
- if (type == GLSL_TYPE_DOUBLE)
- dst_is_double[j] = true;
+ dst_is_64bit[j] = glsl_base_type_is_64bit(inst->dst[j].type);
+ if (!dst_is_64bit[j] && inst->dst[j].file == PROGRAM_OUTPUT && inst->dst[j].type == GLSL_TYPE_ARRAY) {
+ enum glsl_base_type type = find_array_type(this->outputs, this->num_outputs, inst->dst[j].array_id);
+ if (glsl_base_type_is_64bit(type))
+ dst_is_64bit[j] = true;
}
}
- if (dst_is_double[0] || dst_is_double[1] ||
- inst->src[0].type == GLSL_TYPE_DOUBLE) {
+ if (dst_is_64bit[0] || dst_is_64bit[1] ||
+ glsl_base_type_is_64bit(inst->src[0].type)) {
glsl_to_tgsi_instruction *dinst = NULL;
int initial_src_swz[4], initial_src_idx[4];
int initial_dst_idx[2], initial_dst_writemask[2];
int i = u_bit_scan(&writemask);
- /* before emitting the instruction, see if we have to adjust store
+ /* before emitting the instruction, see if we have to adjust load / store
* address */
- if (i > 1 && inst->op == TGSI_OPCODE_STORE &&
+ if (i > 1 && (inst->op == TGSI_OPCODE_LOAD || inst->op == TGSI_OPCODE_STORE) &&
addr.file == PROGRAM_UNDEFINED) {
/* We have to advance the buffer address by 16 */
addr = get_temp(glsl_type::uint_type);
inst->src[0], st_src_reg_for_int(16));
}
-
/* first time use previous instruction */
if (dinst == NULL) {
dinst = inst;
dinst->prev = NULL;
}
this->instructions.push_tail(dinst);
+ dinst->is_64bit_expanded = true;
/* modify the destination if we are splitting */
for (j = 0; j < 2; j++) {
- if (dst_is_double[j]) {
+ if (dst_is_64bit[j]) {
dinst->dst[j].writemask = (i & 1) ? WRITEMASK_ZW : WRITEMASK_XY;
dinst->dst[j].index = initial_dst_idx[j];
if (i > 1) {
- if (dinst->op == TGSI_OPCODE_STORE) {
+ if (dinst->op == TGSI_OPCODE_LOAD || dinst->op == TGSI_OPCODE_STORE)
dinst->src[0] = addr;
- } else {
+ if (dinst->op != TGSI_OPCODE_STORE)
dinst->dst[j].index++;
- }
}
} else {
/* if we aren't writing to a double, just get the bit of the initial writemask
for (j = 0; j < 4; j++) {
int swz = GET_SWZ(initial_src_swz[j], i);
- if (dinst->src[j].type == GLSL_TYPE_DOUBLE) {
+ if (glsl_base_type_is_64bit(dinst->src[j].type)) {
dinst->src[j].index = initial_src_idx[j];
if (swz > 1) {
dinst->src[j].double_reg2 = true;
} else {
/* some opcodes are special case in what they use as sources
- - F2D is a float src0, DLDEXP is integer src1 */
- if (op == TGSI_OPCODE_F2D ||
+ - [FUI]2D/[UI]2I64 is a float/[u]int src0, DLDEXP is integer src1 */
+ if (op == TGSI_OPCODE_F2D || op == TGSI_OPCODE_U2D || op == TGSI_OPCODE_I2D ||
+ op == TGSI_OPCODE_I2I64 || op == TGSI_OPCODE_U2I64 ||
op == TGSI_OPCODE_DLDEXP ||
- (op == TGSI_OPCODE_UCMP && dst_is_double[0])) {
+ (op == TGSI_OPCODE_UCMP && dst_is_64bit[0])) {
dinst->src[j].swizzle = MAKE_SWIZZLE4(swz, swz, swz, swz);
}
}
* based on the operands and input opcode, then emits the result.
*/
unsigned
-glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
+glsl_to_tgsi_visitor::get_opcode(unsigned op,
st_dst_reg dst,
st_src_reg src0, st_src_reg src1)
{
- int type = GLSL_TYPE_FLOAT;
+ enum glsl_base_type type = GLSL_TYPE_FLOAT;
if (op == TGSI_OPCODE_MOV)
return op;
if (is_resource_instruction(op))
type = src1.type;
+ else if (src0.type == GLSL_TYPE_INT64 || src1.type == GLSL_TYPE_INT64)
+ type = GLSL_TYPE_INT64;
+ else if (src0.type == GLSL_TYPE_UINT64 || src1.type == GLSL_TYPE_UINT64)
+ type = GLSL_TYPE_UINT64;
else if (src0.type == GLSL_TYPE_DOUBLE || src1.type == GLSL_TYPE_DOUBLE)
type = GLSL_TYPE_DOUBLE;
else if (src0.type == GLSL_TYPE_FLOAT || src1.type == GLSL_TYPE_FLOAT)
else if (native_integers)
type = src0.type == GLSL_TYPE_BOOL ? GLSL_TYPE_INT : src0.type;
+#define case7(c, f, i, u, d, i64, ui64) \
+ case TGSI_OPCODE_##c: \
+ if (type == GLSL_TYPE_UINT64) \
+ op = TGSI_OPCODE_##ui64; \
+ else if (type == GLSL_TYPE_INT64) \
+ op = TGSI_OPCODE_##i64; \
+ else if (type == GLSL_TYPE_DOUBLE) \
+ op = TGSI_OPCODE_##d; \
+ else if (type == GLSL_TYPE_INT) \
+ op = TGSI_OPCODE_##i; \
+ else if (type == GLSL_TYPE_UINT) \
+ op = TGSI_OPCODE_##u; \
+ else \
+ op = TGSI_OPCODE_##f; \
+ break;
#define case5(c, f, i, u, d) \
case TGSI_OPCODE_##c: \
if (type == GLSL_TYPE_DOUBLE) \
break;
#define case3(f, i, u) case4(f, f, i, u)
-#define case4d(f, i, u, d) case5(f, f, i, u, d)
+#define case6d(f, i, u, d, i64, u64) case7(f, f, i, u, d, i64, u64)
#define case3fid(f, i, d) case5(f, f, i, i, d)
+#define case3fid64(f, i, d, i64) case7(f, f, i, i, d, i64, i64)
#define case2fi(f, i) case4(f, f, i, i)
#define case2iu(i, u) case4(i, LAST, i, u)
-#define casecomp(c, f, i, u, d) \
+#define case2iu64(i, i64) case7(i, LAST, i, i, LAST, i64, i64)
+#define case4iu64(i, u, i64, u64) case7(i, LAST, i, u, LAST, i64, u64)
+
+#define casecomp(c, f, i, u, d, i64, ui64) \
case TGSI_OPCODE_##c: \
- if (type == GLSL_TYPE_DOUBLE) \
+ if (type == GLSL_TYPE_INT64) \
+ op = TGSI_OPCODE_##i64; \
+ else if (type == GLSL_TYPE_UINT64) \
+ op = TGSI_OPCODE_##ui64; \
+ else if (type == GLSL_TYPE_DOUBLE) \
op = TGSI_OPCODE_##d; \
else if (type == GLSL_TYPE_INT || type == GLSL_TYPE_SUBROUTINE) \
op = TGSI_OPCODE_##i; \
break;
switch(op) {
- case3fid(ADD, UADD, DADD);
- case3fid(MUL, UMUL, DMUL);
+ case3fid64(ADD, UADD, DADD, U64ADD);
+ case3fid64(MUL, UMUL, DMUL, U64MUL);
case3fid(MAD, UMAD, DMAD);
case3fid(FMA, UMAD, DFMA);
- case3(DIV, IDIV, UDIV);
- case4d(MAX, IMAX, UMAX, DMAX);
- case4d(MIN, IMIN, UMIN, DMIN);
- case2iu(MOD, UMOD);
+ case6d(DIV, IDIV, UDIV, DDIV, I64DIV, U64DIV);
+ case6d(MAX, IMAX, UMAX, DMAX, I64MAX, U64MAX);
+ case6d(MIN, IMIN, UMIN, DMIN, I64MIN, U64MIN);
+ case4iu64(MOD, UMOD, I64MOD, U64MOD);
- casecomp(SEQ, FSEQ, USEQ, USEQ, DSEQ);
- casecomp(SNE, FSNE, USNE, USNE, DSNE);
- casecomp(SGE, FSGE, ISGE, USGE, DSGE);
- casecomp(SLT, FSLT, ISLT, USLT, DSLT);
+ casecomp(SEQ, FSEQ, USEQ, USEQ, DSEQ, U64SEQ, U64SEQ);
+ casecomp(SNE, FSNE, USNE, USNE, DSNE, U64SNE, U64SNE);
+ casecomp(SGE, FSGE, ISGE, USGE, DSGE, I64SGE, U64SGE);
+ casecomp(SLT, FSLT, ISLT, USLT, DSLT, I64SLT, U64SLT);
- case2iu(ISHR, USHR);
+ case2iu64(SHL, U64SHL);
+ case4iu64(ISHR, USHR, I64SHR, U64SHR);
- case3fid(SSG, ISSG, DSSG);
- case3fid(ABS, IABS, DABS);
+ case3fid64(SSG, ISSG, DSSG, I64SSG);
case2iu(IBFE, UBFE);
case2iu(IMSB, UMSB);
int
glsl_to_tgsi_visitor::add_constant(gl_register_file file,
gl_constant_value values[8], int size, int datatype,
- GLuint *swizzle_out)
+ uint16_t *swizzle_out)
{
if (file == PROGRAM_CONSTANT) {
- return _mesa_add_typed_unnamed_constant(this->prog->Parameters, values,
- size, datatype, swizzle_out);
+ GLuint swizzle = swizzle_out ? *swizzle_out : 0;
+ int result = _mesa_add_typed_unnamed_constant(this->prog->Parameters, values,
+ size, datatype, &swizzle);
+ if (swizzle_out)
+ *swizzle_out = swizzle;
+ return result;
}
assert(file == PROGRAM_IMMEDIATE);
int index = 0;
immediate_storage *entry;
- int size32 = size * (datatype == GL_DOUBLE ? 2 : 1);
+ int size32 = size * ((datatype == GL_DOUBLE ||
+ datatype == GL_INT64_ARB ||
+ datatype == GL_UNSIGNED_INT64_ARB)? 2 : 1);
int i;
/* Search immediate storage to see if we already have an identical
st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_DOUBLE);
union gl_constant_value uval[2];
- uval[0].u = *(uint32_t *)&val;
- uval[1].u = *(((uint32_t *)&val) + 1);
+ memcpy(uval, &val, sizeof(uval));
src.index = add_constant(src.file, uval, 1, GL_DOUBLE, &src.swizzle);
-
+ src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
return src;
}
}
st_src_reg
-glsl_to_tgsi_visitor::st_src_reg_for_type(int type, int val)
+glsl_to_tgsi_visitor::st_src_reg_for_type(enum glsl_base_type type, int val)
{
if (native_integers)
return type == GLSL_TYPE_FLOAT ? st_src_reg_for_float(val) :
static int
attrib_type_size(const struct glsl_type *type, bool is_vs_input)
{
- unsigned int i;
- int size;
-
- switch (type->base_type) {
- case GLSL_TYPE_UINT:
- case GLSL_TYPE_INT:
- case GLSL_TYPE_FLOAT:
- case GLSL_TYPE_BOOL:
- if (type->is_matrix()) {
- return type->matrix_columns;
- } else {
- /* Regardless of size of vector, it gets a vec4. This is bad
- * packing for things like floats, but otherwise arrays become a
- * mess. Hopefully a later pass over the code can pack scalars
- * down if appropriate.
- */
- return 1;
- }
- break;
- case GLSL_TYPE_DOUBLE:
- if (type->is_matrix()) {
- if (type->vector_elements <= 2 || is_vs_input)
- return type->matrix_columns;
- else
- return type->matrix_columns * 2;
- } else {
- /* For doubles if we have a double or dvec2 they fit in one
- * vec4, else they need 2 vec4s.
- */
- if (type->vector_elements <= 2 || is_vs_input)
- return 1;
- else
- return 2;
- }
- break;
- case GLSL_TYPE_ARRAY:
- assert(type->length > 0);
- return attrib_type_size(type->fields.array, is_vs_input) * type->length;
- case GLSL_TYPE_STRUCT:
- size = 0;
- for (i = 0; i < type->length; i++) {
- size += attrib_type_size(type->fields.structure[i].type, is_vs_input);
- }
- return size;
- case GLSL_TYPE_SAMPLER:
- case GLSL_TYPE_IMAGE:
- case GLSL_TYPE_SUBROUTINE:
- /* Samplers take up one slot in UNIFORMS[], but they're baked in
- * at link time.
- */
- return 1;
- case GLSL_TYPE_ATOMIC_UINT:
- case GLSL_TYPE_INTERFACE:
- case GLSL_TYPE_VOID:
- case GLSL_TYPE_ERROR:
- case GLSL_TYPE_FUNCTION:
- assert(!"Invalid type in type_size");
- break;
- }
- return 0;
+ /* Slot counting moved to the shared st_glsl_types helper (see the new
+ * st_glsl_types.h include); keep this thin wrapper for local callers. */
+ return st_glsl_attrib_type_size(type, is_vs_input);
}
static int
type_size(const struct glsl_type *type)
{
- return attrib_type_size(type, false);
+ /* Delegates to the shared st_glsl_types helper instead of the local
+ * attrib_type_size(type, false) path. */
+ return st_glsl_type_size(type);
}
/**
src.type = native_integers ? type->base_type : GLSL_TYPE_FLOAT;
src.reladdr = NULL;
src.negate = 0;
+ src.abs = 0;
if (!options->EmitNoIndirectTemp && type_has_array_or_matrix(type)) {
if (next_array >= max_num_arrays) {
}
src.file = PROGRAM_ARRAY;
- src.index = next_array << 16 | 0x8000;
+ src.index = 0;
+ src.array_id = next_array + 1;
array_sizes[next_array] = type_size(type);
++next_array;
glsl_to_tgsi_visitor::visit(ir_variable *ir)
{
if (strcmp(ir->name, "gl_FragCoord") == 0) {
- struct gl_fragment_program *fp = (struct gl_fragment_program *)this->prog;
-
- fp->OriginUpperLeft = ir->data.origin_upper_left;
- fp->PixelCenterInteger = ir->data.pixel_center_integer;
+ this->prog->OriginUpperLeft = ir->data.origin_upper_left;
+ this->prog->PixelCenterInteger = ir->data.pixel_center_integer;
}
if (ir->data.mode == ir_var_uniform && strncmp(ir->name, "gl_", 3) == 0) {
dst = st_dst_reg(get_temp(ir->type));
- storage = new(mem_ctx) variable_storage(ir, dst.file, dst.index);
+ storage = new(mem_ctx) variable_storage(ir, dst.file, dst.index,
+ dst.array_id);
this->variables.push_tail(storage);
}
void
glsl_to_tgsi_visitor::visit(ir_expression *ir)
{
- unsigned int operand;
st_src_reg op[ARRAY_SIZE(ir->operands)];
- st_src_reg result_src;
- st_dst_reg result_dst;
/* Quick peephole: Emit MAD(a, b, c) instead of ADD(MUL(a, b), c)
*/
if (ir->operation == ir_quadop_vector)
assert(!"ir_quadop_vector should have been lowered");
- for (operand = 0; operand < ir->get_num_operands(); operand++) {
+ for (unsigned int operand = 0; operand < ir->get_num_operands(); operand++) {
this->result.file = PROGRAM_UNDEFINED;
ir->operands[operand]->accept(this);
if (this->result.file == PROGRAM_UNDEFINED) {
assert(!ir->operands[operand]->type->is_matrix());
}
+ visit_expression(ir, op);
+}
+
+/* The non-recursive part of the expression visitor lives in a separate
+ * function and should be prevented from being inlined, to avoid a stack
+ * explosion when deeply nested expressions are visited.
+ */
+void
+glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
+{
+ st_src_reg result_src;
+ st_dst_reg result_dst;
+
int vector_elements = ir->operands[0]->type->vector_elements;
if (ir->operands[1]) {
vector_elements = MAX2(vector_elements,
}
break;
case ir_unop_neg:
- if (result_dst.type == GLSL_TYPE_INT || result_dst.type == GLSL_TYPE_UINT)
+ if (result_dst.type == GLSL_TYPE_INT64 || result_dst.type == GLSL_TYPE_UINT64)
+ emit_asm(ir, TGSI_OPCODE_I64NEG, result_dst, op[0]);
+ else if (result_dst.type == GLSL_TYPE_INT || result_dst.type == GLSL_TYPE_UINT)
emit_asm(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
else if (result_dst.type == GLSL_TYPE_DOUBLE)
emit_asm(ir, TGSI_OPCODE_DNEG, result_dst, op[0]);
emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
break;
case ir_unop_abs:
- emit_asm(ir, TGSI_OPCODE_ABS, result_dst, op[0]);
+ if (result_dst.type == GLSL_TYPE_FLOAT)
+ emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0].get_abs());
+ else if (result_dst.type == GLSL_TYPE_DOUBLE)
+ emit_asm(ir, TGSI_OPCODE_DABS, result_dst, op[0]);
+ else if (result_dst.type == GLSL_TYPE_INT64 || result_dst.type == GLSL_TYPE_UINT64)
+ emit_asm(ir, TGSI_OPCODE_I64ABS, result_dst, op[0]);
+ else
+ emit_asm(ir, TGSI_OPCODE_IABS, result_dst, op[0]);
break;
case ir_unop_sign:
emit_asm(ir, TGSI_OPCODE_SSG, result_dst, op[0]);
emit_scalar(ir, TGSI_OPCODE_EX2, result_dst, op[0]);
break;
case ir_unop_exp:
+ assert(!"not reached: should be handled by exp_to_exp2");
+ break;
case ir_unop_log:
- assert(!"not reached: should be handled by ir_explog_to_explog2");
+ assert(!"not reached: should be handled by log_to_log2");
break;
case ir_unop_log2:
emit_scalar(ir, TGSI_OPCODE_LG2, result_dst, op[0]);
emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
break;
case ir_binop_sub:
- emit_asm(ir, TGSI_OPCODE_SUB, result_dst, op[0], op[1]);
+ op[1].negate = ~op[1].negate;
+ emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
break;
case ir_binop_mul:
emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
break;
case ir_binop_div:
- if (result_dst.type == GLSL_TYPE_FLOAT || result_dst.type == GLSL_TYPE_DOUBLE)
- assert(!"not reached: should be handled by ir_div_to_mul_rcp");
- else
- emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
+ emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
break;
case ir_binop_mod:
if (result_dst.type == GLSL_TYPE_FLOAT)
if (have_sqrt) {
emit_scalar(ir, TGSI_OPCODE_SQRT, result_dst, op[0]);
} else {
- /* sqrt(x) = x * rsq(x). */
- emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0]);
- emit_asm(ir, TGSI_OPCODE_MUL, result_dst, result_src, op[0]);
- /* For incoming channels <= 0, set the result to 0. */
- op[0].negate = ~op[0].negate;
- emit_asm(ir, TGSI_OPCODE_CMP, result_dst,
- op[0], result_src, st_src_reg_for_float(0.0));
+ /* This is the only instruction sequence that makes the game "Risen"
+ * render correctly. ABS is not required for the game, but since GLSL
+ * declares negative values as "undefined", allowing us to do whatever
+ * we want, I choose to use ABS to match DX9 and pre-GLSL RSQ
+ * behavior.
+ */
+ emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0].get_abs());
+ emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, result_src);
}
break;
case ir_unop_rsq:
/* fallthrough to next case otherwise */
case ir_unop_i2u:
case ir_unop_u2i:
+ case ir_unop_i642u64:
+ case ir_unop_u642i64:
/* Converting between signed and unsigned integers is a no-op. */
result_src = op[0];
result_src.type = result_dst.type;
emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
break;
case ir_unop_bitcast_f2i:
- result_src = op[0];
- result_src.type = GLSL_TYPE_INT;
- break;
case ir_unop_bitcast_f2u:
- result_src = op[0];
- result_src.type = GLSL_TYPE_UINT;
+ /* Make sure we don't propagate the negate modifier to integer opcodes. */
+ if (op[0].negate || op[0].abs)
+ emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
+ else
+ result_src = op[0];
+ result_src.type = ir->operation == ir_unop_bitcast_f2i ? GLSL_TYPE_INT :
+ GLSL_TYPE_UINT;
break;
case ir_unop_bitcast_i2f:
case ir_unop_bitcast_u2f:
else
emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
break;
+ case ir_unop_bitcast_u642d:
+ case ir_unop_bitcast_i642d:
+ result_src = op[0];
+ result_src.type = GLSL_TYPE_DOUBLE;
+ break;
+ case ir_unop_bitcast_d2i64:
+ result_src = op[0];
+ result_src.type = GLSL_TYPE_INT64;
+ break;
+ case ir_unop_bitcast_d2u64:
+ result_src = op[0];
+ result_src.type = GLSL_TYPE_UINT64;
+ break;
case ir_unop_trunc:
emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
break;
break;
}
case ir_binop_lshift:
- if (native_integers) {
- emit_asm(ir, TGSI_OPCODE_SHL, result_dst, op[0], op[1]);
- break;
- }
case ir_binop_rshift:
if (native_integers) {
- emit_asm(ir, TGSI_OPCODE_ISHR, result_dst, op[0], op[1]);
+ unsigned opcode = ir->operation == ir_binop_lshift ? TGSI_OPCODE_SHL
+ : TGSI_OPCODE_ISHR;
+ st_src_reg count;
+
+ if (glsl_base_type_is_64bit(op[0].type)) {
+ /* GLSL shift operations have 32-bit shift counts, but TGSI uses
+ * 64 bits.
+ */
+ count = get_temp(glsl_type::u64vec(ir->operands[1]->type->components()));
+ emit_asm(ir, TGSI_OPCODE_U2I64, st_dst_reg(count), op[1]);
+ } else {
+ count = op[1];
+ }
+
+ emit_asm(ir, opcode, result_dst, op[0], count);
break;
}
case ir_binop_bit_and:
cbuf.index = 0;
cbuf.reladdr = NULL;
cbuf.negate = 0;
+ cbuf.abs = 0;
assert(ir->type->is_vector() || ir->type->is_scalar());
cbuf.index = const_offset / 16;
}
else {
+ ir_expression *offset_expr = ir->operands[1]->as_expression();
+ st_src_reg offset = op[1];
+
+ /* The OpenGL spec is written in such a way that accesses with
+ * non-constant offset are almost always vec4-aligned. The only
+ * exception to this are members of structs in arrays of structs:
+ * each struct in an array of structs is at least vec4-aligned,
+ * but single-element and [ui]vec2 members of the struct may be at
+ * an offset that is not a multiple of 16 bytes.
+ *
+ * Here, we extract that offset, relying on previous passes to always
+ * generate offset expressions of the form (+ expr constant_offset).
+ *
+ * Note that the std430 layout, which allows more cases of alignment
+ * less than vec4 in arrays, is not supported for uniform blocks, so
+ * we do not have to deal with it here.
+ */
+ if (offset_expr && offset_expr->operation == ir_binop_add) {
+ const_offset_ir = offset_expr->operands[1]->as_constant();
+ if (const_offset_ir) {
+ const_offset = const_offset_ir->value.u[0];
+ cbuf.index = const_offset / 16;
+ offset_expr->operands[0]->accept(this);
+ offset = this->result;
+ }
+ }
+
/* Relative/variable index into constant buffer */
- emit_asm(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), op[1],
+ emit_asm(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), offset,
st_src_reg_for_int(4));
cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
memcpy(cbuf.reladdr, &index_reg, sizeof(index_reg));
}
cbuf.swizzle = swizzle_for_size(ir->type->vector_elements);
- if (cbuf.type == GLSL_TYPE_DOUBLE)
+ if (glsl_base_type_is_64bit(cbuf.type))
cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 8,
const_offset % 16 / 8,
const_offset % 16 / 8,
const_offset % 16 / 4,
const_offset % 16 / 4);
- if (ir->type->base_type == GLSL_TYPE_BOOL) {
+ if (ir->type->is_boolean()) {
emit_asm(ir, TGSI_OPCODE_USNE, result_dst, cbuf, st_src_reg_for_int(0));
} else {
emit_asm(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
case ir_unop_interpolate_at_centroid:
emit_asm(ir, TGSI_OPCODE_INTERP_CENTROID, result_dst, op[0]);
break;
- case ir_binop_interpolate_at_offset:
- emit_asm(ir, TGSI_OPCODE_INTERP_OFFSET, result_dst, op[0], op[1]);
+ case ir_binop_interpolate_at_offset: {
+ /* The y coordinate needs to be flipped for the default fb */
+ static const gl_state_index transform_y_state[STATE_LENGTH]
+ = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM };
+
+ unsigned transform_y_index =
+ _mesa_add_state_reference(this->prog->Parameters,
+ transform_y_state);
+
+ st_src_reg transform_y = st_src_reg(PROGRAM_STATE_VAR,
+ transform_y_index,
+ glsl_type::vec4_type);
+ transform_y.swizzle = SWIZZLE_XXXX;
+
+ st_src_reg temp = get_temp(glsl_type::vec2_type);
+ st_dst_reg temp_dst = st_dst_reg(temp);
+
+ emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[1]);
+ temp_dst.writemask = WRITEMASK_Y;
+ emit_asm(ir, TGSI_OPCODE_MUL, temp_dst, transform_y, op[1]);
+ emit_asm(ir, TGSI_OPCODE_INTERP_OFFSET, result_dst, op[0], temp);
break;
+ }
case ir_binop_interpolate_at_sample:
emit_asm(ir, TGSI_OPCODE_INTERP_SAMPLE, result_dst, op[0], op[1]);
break;
break;
case ir_unop_unpack_double_2x32:
case ir_unop_pack_double_2x32:
+ case ir_unop_unpack_int_2x32:
+ case ir_unop_pack_int_2x32:
+ case ir_unop_unpack_uint_2x32:
+ case ir_unop_pack_uint_2x32:
emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
break;
case ir_binop_ldexp:
- if (ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE) {
+ if (ir->operands[0]->type->is_double()) {
emit_asm(ir, TGSI_OPCODE_DLDEXP, result_dst, op[0], op[1]);
} else {
assert(!"Invalid ldexp for non-double opcode in glsl_to_tgsi_visitor::visit()");
GLSL_TYPE_UINT);
if (!const_offset) {
buffer.reladdr = ralloc(mem_ctx, st_src_reg);
- memcpy(buffer.reladdr, &sampler_reladdr, sizeof(sampler_reladdr));
+ *buffer.reladdr = op[0];
emit_arl(ir, sampler_reladdr, op[0]);
}
- emit_asm(ir, TGSI_OPCODE_RESQ, result_dst)->buffer = buffer;
+ emit_asm(ir, TGSI_OPCODE_RESQ, result_dst)->resource = buffer;
+ break;
+ }
+
+ case ir_unop_u2i64:
+ case ir_unop_u2u64:
+ case ir_unop_b2i64: {
+ st_src_reg temp = get_temp(glsl_type::uvec4_type);
+ st_dst_reg temp_dst = st_dst_reg(temp);
+ unsigned orig_swz = op[0].swizzle;
+ /*
+ * To convert unsigned to 64-bit:
+ * zero Y channel, copy X channel.
+ */
+ temp_dst.writemask = WRITEMASK_Y;
+ if (vector_elements > 1)
+ temp_dst.writemask |= WRITEMASK_W;
+ emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, st_src_reg_for_int(0));
+ temp_dst.writemask = WRITEMASK_X;
+ if (vector_elements > 1)
+ temp_dst.writemask |= WRITEMASK_Z;
+ op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(orig_swz, 0), GET_SWZ(orig_swz, 0),
+ GET_SWZ(orig_swz, 1), GET_SWZ(orig_swz, 1));
+ if (ir->operation == ir_unop_u2i64 || ir->operation == ir_unop_u2u64)
+ emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
+ else
+ emit_asm(ir, TGSI_OPCODE_AND, temp_dst, op[0], st_src_reg_for_int(1));
+ result_src = temp;
+ result_src.type = GLSL_TYPE_UINT64;
+ if (vector_elements > 2) {
+ /* Subtle: We rely on the fact that get_temp here returns the next
+ * TGSI temporary register directly after the temp register used for
+ * the first two components, so that the result gets picked up
+ * automatically.
+ */
+ st_src_reg temp = get_temp(glsl_type::uvec4_type);
+ st_dst_reg temp_dst = st_dst_reg(temp);
+ temp_dst.writemask = WRITEMASK_Y;
+ if (vector_elements > 3)
+ temp_dst.writemask |= WRITEMASK_W;
+ emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, st_src_reg_for_int(0));
+
+ temp_dst.writemask = WRITEMASK_X;
+ if (vector_elements > 3)
+ temp_dst.writemask |= WRITEMASK_Z;
+ op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(orig_swz, 2), GET_SWZ(orig_swz, 2),
+ GET_SWZ(orig_swz, 3), GET_SWZ(orig_swz, 3));
+ if (ir->operation == ir_unop_u2i64 || ir->operation == ir_unop_u2u64)
+ emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
+ else
+ emit_asm(ir, TGSI_OPCODE_AND, temp_dst, op[0], st_src_reg_for_int(1));
+ }
break;
}
+ case ir_unop_i642i:
+ case ir_unop_u642i:
+ case ir_unop_u642u:
+ case ir_unop_i642u: {
+ st_src_reg temp = get_temp(glsl_type::uvec4_type);
+ st_dst_reg temp_dst = st_dst_reg(temp);
+ unsigned orig_swz = op[0].swizzle;
+ unsigned orig_idx = op[0].index;
+ int el;
+ temp_dst.writemask = WRITEMASK_X;
+ for (el = 0; el < vector_elements; el++) {
+ unsigned swz = GET_SWZ(orig_swz, el);
+ if (swz & 1)
+ op[0].swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z);
+ else
+ op[0].swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
+ if (swz > 2)
+ op[0].index = orig_idx + 1;
+ op[0].type = GLSL_TYPE_UINT;
+ temp_dst.writemask = WRITEMASK_X << el;
+ emit_asm(ir, TGSI_OPCODE_MOV, temp_dst, op[0]);
+ }
+ result_src = temp;
+ if (ir->operation == ir_unop_u642u || ir->operation == ir_unop_i642u)
+ result_src.type = GLSL_TYPE_UINT;
+ else
+ result_src.type = GLSL_TYPE_INT;
+ break;
+ }
+ case ir_unop_i642b:
+ emit_asm(ir, TGSI_OPCODE_U64SNE, result_dst, op[0], st_src_reg_for_int(0));
+ break;
+ case ir_unop_i642f:
+ emit_asm(ir, TGSI_OPCODE_I642F, result_dst, op[0]);
+ break;
+ case ir_unop_u642f:
+ emit_asm(ir, TGSI_OPCODE_U642F, result_dst, op[0]);
+ break;
+ case ir_unop_i642d:
+ emit_asm(ir, TGSI_OPCODE_I642D, result_dst, op[0]);
+ break;
+ case ir_unop_u642d:
+ emit_asm(ir, TGSI_OPCODE_U642D, result_dst, op[0]);
+ break;
+ case ir_unop_i2i64:
+ emit_asm(ir, TGSI_OPCODE_I2I64, result_dst, op[0]);
+ break;
+ case ir_unop_f2i64:
+ emit_asm(ir, TGSI_OPCODE_F2I64, result_dst, op[0]);
+ break;
+ case ir_unop_d2i64:
+ emit_asm(ir, TGSI_OPCODE_D2I64, result_dst, op[0]);
+ break;
+ case ir_unop_i2u64:
+ emit_asm(ir, TGSI_OPCODE_I2I64, result_dst, op[0]);
+ break;
+ case ir_unop_f2u64:
+ emit_asm(ir, TGSI_OPCODE_F2U64, result_dst, op[0]);
+ break;
+ case ir_unop_d2u64:
+ emit_asm(ir, TGSI_OPCODE_D2U64, result_dst, op[0]);
+ break;
+ /* these might be needed */
case ir_unop_pack_snorm_2x16:
case ir_unop_pack_unorm_2x16:
case ir_unop_pack_snorm_4x8:
case ir_unop_unpack_snorm_4x8:
case ir_unop_unpack_unorm_4x8:
+ case ir_unop_unpack_sampler_2x32:
+ case ir_unop_pack_sampler_2x32:
+ case ir_unop_unpack_image_2x32:
+ case ir_unop_pack_image_2x32:
+
case ir_quadop_vector:
case ir_binop_vector_extract:
case ir_triop_vector_insert:
* for patch inputs), so only the array element type is considered.
*/
static bool
-is_inout_array(unsigned stage, ir_variable *var, bool *is_2d)
+is_inout_array(unsigned stage, ir_variable *var, bool *remove_array)
{
const glsl_type *type = var->type;
+ *remove_array = false;
+
if ((stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in) ||
(stage == MESA_SHADER_FRAGMENT && var->data.mode == ir_var_shader_out))
return false;
- *is_2d = false;
-
if (((stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) ||
(stage == MESA_SHADER_TESS_EVAL && var->data.mode == ir_var_shader_in) ||
stage == MESA_SHADER_TESS_CTRL) &&
return false; /* a system value probably */
type = var->type->fields.array;
- *is_2d = true;
+ *remove_array = true;
}
return type->is_array() || type->is_matrix();
}
+/* Map a fragment input's auxiliary interpolation qualifiers to the TGSI
+ * interpolation location. 'centroid' takes priority over 'sample'; with
+ * neither qualifier present we fall back to ordinary per-pixel (center)
+ * interpolation.
+ */
+static unsigned
+st_translate_interp_loc(ir_variable *var)
+{
+   if (var->data.centroid)
+      return TGSI_INTERPOLATE_LOC_CENTROID;
+   else if (var->data.sample)
+      return TGSI_INTERPOLATE_LOC_SAMPLE;
+   else
+      return TGSI_INTERPOLATE_LOC_CENTER;
+}
+
void
glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
{
variable_storage *entry = find_variable_storage(ir->var);
ir_variable *var = ir->var;
- bool is_2d;
+ bool remove_array;
if (!entry) {
switch (var->data.mode) {
var->data.param_index);
this->variables.push_tail(entry);
break;
- case ir_var_shader_in:
+ case ir_var_shader_in: {
/* The linker assigns locations for varyings and attributes,
* including deprecated builtins (like gl_Color), user-assign
* generic attributes (glBindVertexLocation), and
*/
assert(var->data.location != -1);
- if (is_inout_array(shader->Stage, var, &is_2d)) {
- struct array_decl *decl = &input_arrays[num_input_arrays];
+ const glsl_type *type_without_array = var->type->without_array();
+ struct inout_decl *decl = &inputs[num_inputs];
+ unsigned component = var->data.location_frac;
+ unsigned num_components;
+ num_inputs++;
+
+ if (type_without_array->is_64bit())
+ component = component / 2;
+ if (type_without_array->vector_elements)
+ num_components = type_without_array->vector_elements;
+ else
+ num_components = 4;
+
+ decl->mesa_index = var->data.location;
+ decl->interp = (glsl_interp_mode) var->data.interpolation;
+ decl->interp_loc = st_translate_interp_loc(var);
+ decl->base_type = type_without_array->base_type;
+ decl->usage_mask = u_bit_consecutive(component, num_components);
- decl->mesa_index = var->data.location;
+ if (is_inout_array(shader->Stage, var, &remove_array)) {
decl->array_id = num_input_arrays + 1;
- if (is_2d) {
- decl->array_size = type_size(var->type->fields.array);
- decl->array_type = var->type->fields.array->without_array()->base_type;
- } else {
- decl->array_size = type_size(var->type);
- decl->array_type = var->type->without_array()->base_type;
- }
num_input_arrays++;
-
- entry = new(mem_ctx) variable_storage(var,
- PROGRAM_INPUT,
- var->data.location,
- decl->array_id);
- }
- else {
- entry = new(mem_ctx) variable_storage(var,
- PROGRAM_INPUT,
- var->data.location);
+ } else {
+ decl->array_id = 0;
}
+
+ if (remove_array)
+ decl->size = type_size(var->type->fields.array);
+ else
+ decl->size = type_size(var->type);
+
+ entry = new(mem_ctx) variable_storage(var,
+ PROGRAM_INPUT,
+ decl->mesa_index,
+ decl->array_id);
+ entry->component = component;
+
this->variables.push_tail(entry);
break;
- case ir_var_shader_out:
+ }
+ case ir_var_shader_out: {
assert(var->data.location != -1);
- if (is_inout_array(shader->Stage, var, &is_2d)) {
- struct array_decl *decl = &output_arrays[num_output_arrays];
+ const glsl_type *type_without_array = var->type->without_array();
+ struct inout_decl *decl = &outputs[num_outputs];
+ unsigned component = var->data.location_frac;
+ unsigned num_components;
+ num_outputs++;
+
+ if (type_without_array->is_64bit())
+ component = component / 2;
+ if (type_without_array->vector_elements)
+ num_components = type_without_array->vector_elements;
+ else
+ num_components = 4;
+
+ decl->mesa_index = var->data.location + FRAG_RESULT_MAX * var->data.index;
+ decl->base_type = type_without_array->base_type;
+ decl->usage_mask = u_bit_consecutive(component, num_components);
+ if (var->data.stream & (1u << 31)) {
+ decl->gs_out_streams = var->data.stream & ~(1u << 31);
+ } else {
+ assert(var->data.stream < 4);
+ decl->gs_out_streams = 0;
+ for (unsigned i = 0; i < num_components; ++i)
+ decl->gs_out_streams |= var->data.stream << (2 * (component + i));
+ }
- decl->mesa_index = var->data.location;
+ if (is_inout_array(shader->Stage, var, &remove_array)) {
decl->array_id = num_output_arrays + 1;
- if (is_2d) {
- decl->array_size = type_size(var->type->fields.array);
- decl->array_type = var->type->fields.array->without_array()->base_type;
- } else {
- decl->array_size = type_size(var->type);
- decl->array_type = var->type->without_array()->base_type;
- }
num_output_arrays++;
+ } else {
+ decl->array_id = 0;
+ }
+ if (remove_array)
+ decl->size = type_size(var->type->fields.array);
+ else
+ decl->size = type_size(var->type);
+
+ if (var->data.fb_fetch_output) {
+ st_dst_reg dst = st_dst_reg(get_temp(var->type));
+ st_src_reg src = st_src_reg(PROGRAM_OUTPUT, decl->mesa_index,
+ var->type, component, decl->array_id);
+ emit_asm(NULL, TGSI_OPCODE_FBFETCH, dst, src);
+ entry = new(mem_ctx) variable_storage(var, dst.file, dst.index,
+ dst.array_id);
+ } else {
entry = new(mem_ctx) variable_storage(var,
PROGRAM_OUTPUT,
- var->data.location,
+ decl->mesa_index,
decl->array_id);
}
- else {
- entry = new(mem_ctx) variable_storage(var,
- PROGRAM_OUTPUT,
- var->data.location
- + var->data.index);
- }
+ entry->component = component;
+
this->variables.push_tail(entry);
break;
+ }
case ir_var_system_value:
entry = new(mem_ctx) variable_storage(var,
PROGRAM_SYSTEM_VALUE,
case ir_var_temporary:
st_src_reg src = get_temp(var->type);
- entry = new(mem_ctx) variable_storage(var, src.file, src.index);
+ entry = new(mem_ctx) variable_storage(var, src.file, src.index,
+ src.array_id);
this->variables.push_tail(entry);
break;
}
}
- this->result = st_src_reg(entry->file, entry->index, var->type);
- this->result.array_id = entry->array_id;
+ this->result = st_src_reg(entry->file, entry->index, var->type,
+ entry->component, entry->array_id);
if (this->shader->Stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in && var->type->is_double())
this->result.is_double_vertex_input = true;
if (!native_integers)
}
static void
-shrink_array_declarations(struct array_decl *arrays, unsigned count,
- GLbitfield64 usage_mask,
+shrink_array_declarations(struct inout_decl *decls, unsigned count,
+ GLbitfield64* usage_mask,
GLbitfield64 double_usage_mask,
- GLbitfield patch_usage_mask)
+ GLbitfield* patch_usage_mask)
{
- unsigned i, j;
+ unsigned i;
+ int j;
/* Fix array declarations by removing unused array elements at both ends
* of the arrays. For example, mat4[3] where only mat[1] is used.
*/
for (i = 0; i < count; i++) {
- struct array_decl *decl = &arrays[i];
+ struct inout_decl *decl = &decls[i];
+ if (!decl->array_id)
+ continue;
/* Shrink the beginning. */
- for (j = 0; j < decl->array_size; j++) {
+ for (j = 0; j < (int)decl->size; j++) {
if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
- if (patch_usage_mask &
+ if (*patch_usage_mask &
BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
break;
}
else {
- if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
+ if (*usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
break;
if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
break;
}
decl->mesa_index++;
- decl->array_size--;
+ decl->size--;
j--;
}
/* Shrink the end. */
- for (j = decl->array_size-1; j >= 0; j--) {
+ for (j = decl->size-1; j >= 0; j--) {
if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
- if (patch_usage_mask &
+ if (*patch_usage_mask &
BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
break;
}
else {
- if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
+ if (*usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
break;
if (double_usage_mask & BITFIELD64_BIT(decl->mesa_index+j-1))
break;
}
- decl->array_size--;
+ decl->size--;
+ }
+
+ /* When not all entries of an array are accessed, we mark them as used
+ * here anyway, to ensure that the input/output mapping logic doesn't get
+ * confused.
+ *
+ * TODO This happens when an array isn't used via indirect access, which
+ * some game ports do (at least eON-based). There is an optimization
+ * opportunity here by replacing the array declaration with non-array
+ * declarations of those slots that are actually used.
+ */
+ for (j = 1; j < (int)decl->size; ++j) {
+ if (decl->mesa_index >= VARYING_SLOT_PATCH0)
+ *patch_usage_mask |= BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j);
+ else
+ *usage_mask |= BITFIELD64_BIT(decl->mesa_index + j);
}
}
}
}
}
- /* If the type is smaller than a vec4, replicate the last channel out. */
- if (ir->type->is_scalar() || ir->type->is_vector())
- src.swizzle = swizzle_for_size(ir->type->vector_elements);
- else
- src.swizzle = SWIZZLE_NOOP;
-
/* Change the register type to the element type of the array. */
src.type = ir->type->base_type;
* ir_dereference handler.
*/
static st_dst_reg
-get_assignment_lhs(ir_dereference *ir, glsl_to_tgsi_visitor *v)
+get_assignment_lhs(ir_dereference *ir, glsl_to_tgsi_visitor *v, int *component)
{
/* The LHS must be a dereference. If the LHS is a variable indexed array
* access of a vector, it must be separated into a series conditional moves
assert(!deref_array->array->type->is_vector());
}
- /* Use the rvalue deref handler for the most part. We'll ignore
- * swizzles in it and write swizzles using writemask, though.
+ /* Use the rvalue deref handler for the most part. We write swizzles using
+ * the writemask, but we do extract the base component for enhanced layouts
+ * from the source swizzle.
*/
ir->accept(v);
+ *component = GET_SWZ(v->result.swizzle, 0);
return st_dst_reg(v->result);
}
st_dst_reg *l, st_src_reg *r,
st_src_reg *cond, bool cond_swap)
{
- if (type->base_type == GLSL_TYPE_STRUCT) {
+ if (type->is_record()) {
for (unsigned int i = 0; i < type->length; i++) {
emit_block_mov(ir, type->fields.structure[i].type, l, r,
cond, cond_swap);
assert(type->is_scalar() || type->is_vector());
+ l->type = type->base_type;
r->type = type->base_type;
if (cond) {
st_src_reg l_src = st_src_reg(*l);
}
l->index++;
r->index++;
- if (type->is_dual_slot_double()) {
+ if (type->is_dual_slot()) {
l->index++;
if (r->is_double_vertex_input == false)
r->index++;
void
glsl_to_tgsi_visitor::visit(ir_assignment *ir)
{
+ int dst_component;
st_dst_reg l;
st_src_reg r;
ir->rhs->accept(this);
r = this->result;
- l = get_assignment_lhs(ir->lhs, this);
+ l = get_assignment_lhs(ir->lhs, this, &dst_component);
- /* FINISHME: This should really set to the correct maximal writemask for each
- * FINISHME: component written (in the loops below). This case can only
- * FINISHME: occur for matrices, arrays, and structures.
- */
- if (ir->write_mask == 0) {
- assert(!ir->lhs->type->is_scalar() && !ir->lhs->type->is_vector());
-
- if (ir->lhs->type->is_array() || ir->lhs->type->without_array()->is_matrix()) {
- if (ir->lhs->type->without_array()->is_double()) {
- switch (ir->lhs->type->without_array()->vector_elements) {
- case 1:
- l.writemask = WRITEMASK_X;
- break;
- case 2:
- l.writemask = WRITEMASK_XY;
- break;
- case 3:
- l.writemask = WRITEMASK_XYZ;
- break;
- case 4:
- l.writemask = WRITEMASK_XYZW;
- break;
- }
- } else
- l.writemask = WRITEMASK_XYZW;
- }
- } else if (ir->lhs->type->is_scalar() &&
- !ir->lhs->type->is_double() &&
- ir->lhs->variable_referenced()->data.mode == ir_var_shader_out) {
- /* FINISHME: This hack makes writing to gl_FragDepth, which lives in the
- * FINISHME: W component of fragment shader output zero, work correctly.
- */
- l.writemask = WRITEMASK_XYZW;
- } else {
+ {
int swizzles[4];
int first_enabled_chan = 0;
int rhs_chan = 0;
+ ir_variable *variable = ir->lhs->variable_referenced();
+
+ if (shader->Stage == MESA_SHADER_FRAGMENT &&
+ variable->data.mode == ir_var_shader_out &&
+ (variable->data.location == FRAG_RESULT_DEPTH ||
+ variable->data.location == FRAG_RESULT_STENCIL)) {
+ assert(ir->lhs->type->is_scalar());
+ assert(ir->write_mask == WRITEMASK_X);
+
+ if (variable->data.location == FRAG_RESULT_DEPTH)
+ l.writemask = WRITEMASK_Z;
+ else {
+ assert(variable->data.location == FRAG_RESULT_STENCIL);
+ l.writemask = WRITEMASK_Y;
+ }
+ } else if (ir->write_mask == 0) {
+ assert(!ir->lhs->type->is_scalar() && !ir->lhs->type->is_vector());
- l.writemask = ir->write_mask;
+ unsigned num_elements = ir->lhs->type->without_array()->vector_elements;
+
+ if (num_elements) {
+ l.writemask = u_bit_consecutive(0, num_elements);
+ } else {
+ /* The type is a struct or an array of (array of) structs. */
+ l.writemask = WRITEMASK_XYZW;
+ }
+ } else {
+ l.writemask = ir->write_mask;
+ }
for (int i = 0; i < 4; i++) {
if (l.writemask & (1 << i)) {
}
}
+ l.writemask = l.writemask << dst_component;
+
/* Swizzle a small RHS vector into the channels being written.
*
* glsl ir treats write_mask as dictating how many channels are
} else if (ir->rhs->as_expression() &&
this->instructions.get_tail() &&
ir->rhs == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->ir &&
+ !((glsl_to_tgsi_instruction *)this->instructions.get_tail())->is_64bit_expanded &&
type_size(ir->lhs->type) == 1 &&
l.writemask == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->dst[0].writemask) {
/* To avoid emitting an extra MOV when assigning an expression to a
* aggregate constant and move each constant value into it. If we
* get lucky, copy propagation will eliminate the extra moves.
*/
- if (ir->type->base_type == GLSL_TYPE_STRUCT) {
+ if (ir->type->is_record()) {
st_src_reg temp_base = get_temp(ir->type);
st_dst_reg temp = st_dst_reg(temp_base);
case GLSL_TYPE_DOUBLE:
gl_type = GL_DOUBLE;
for (i = 0; i < ir->type->vector_elements; i++) {
- values[i * 2].i = *(uint32_t *)&ir->value.d[i];
- values[i * 2 + 1].i = *(((uint32_t *)&ir->value.d[i]) + 1);
+ memcpy(&values[i * 2], &ir->value.d[i], sizeof(double));
}
break;
- case GLSL_TYPE_UINT:
- gl_type = native_integers ? GL_UNSIGNED_INT : GL_FLOAT;
+ case GLSL_TYPE_INT64:
+ gl_type = GL_INT64_ARB;
for (i = 0; i < ir->type->vector_elements; i++) {
- if (native_integers)
+ memcpy(&values[i * 2], &ir->value.d[i], sizeof(int64_t));
+ }
+ break;
+ case GLSL_TYPE_UINT64:
+ gl_type = GL_UNSIGNED_INT64_ARB;
+ for (i = 0; i < ir->type->vector_elements; i++) {
+ memcpy(&values[i * 2], &ir->value.d[i], sizeof(uint64_t));
+ }
+ break;
+ case GLSL_TYPE_UINT:
+ gl_type = native_integers ? GL_UNSIGNED_INT : GL_FLOAT;
+ for (i = 0; i < ir->type->vector_elements; i++) {
+ if (native_integers)
values[i].u = ir->value.u[i];
else
values[i].f = ir->value.u[i];
&this->result.swizzle);
}
-function_entry *
-glsl_to_tgsi_visitor::get_function_signature(ir_function_signature *sig)
-{
- foreach_in_list_use_after(function_entry, entry, &this->function_signatures) {
- if (entry->sig == sig)
- return entry;
- }
-
- entry = ralloc(mem_ctx, function_entry);
- entry->sig = sig;
- entry->sig_id = this->next_signature_id++;
- entry->bgn_inst = NULL;
-
- /* Allocate storage for all the parameters. */
- foreach_in_list(ir_variable, param, &sig->parameters) {
- variable_storage *storage;
-
- storage = find_variable_storage(param);
- assert(!storage);
-
- st_src_reg src = get_temp(param->type);
-
- storage = new(mem_ctx) variable_storage(param, src.file, src.index);
- this->variables.push_tail(storage);
- }
-
- if (!sig->return_type->is_void()) {
- entry->return_reg = get_temp(sig->return_type);
- } else {
- entry->return_reg = undef_src;
- }
-
- this->function_signatures.push_tail(entry);
- return entry;
-}
-
void
glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
{
- const char *callee = ir->callee->function_name();
exec_node *param = ir->actual_parameters.get_head();
ir_dereference *deref = static_cast<ir_dereference *>(param);
ir_variable *location = deref->variable_referenced();
/* Calculate the surface offset */
st_src_reg offset;
- unsigned array_size = 0, base = 0, index = 0;
+ unsigned array_size = 0, base = 0;
+ uint16_t index = 0;
- get_deref_offsets(deref, &array_size, &base, &index, &offset);
+ get_deref_offsets(deref, &array_size, &base, &index, &offset, false);
if (offset.file != PROGRAM_UNDEFINED) {
emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(offset),
glsl_to_tgsi_instruction *inst;
- if (!strcmp("__intrinsic_atomic_read", callee)) {
+ if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_read) {
inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, offset);
- } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
+ } else if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_increment) {
inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset,
st_src_reg_for_int(1));
- } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
+ } else if (ir->callee->intrinsic_id == ir_intrinsic_atomic_counter_predecrement) {
inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset,
st_src_reg_for_int(-1));
emit_asm(ir, TGSI_OPCODE_ADD, dst, this->result, st_src_reg_for_int(-1));
st_src_reg data = this->result, data2 = undef_src;
unsigned opcode;
- if (!strcmp("__intrinsic_atomic_add", callee))
+ switch (ir->callee->intrinsic_id) {
+ case ir_intrinsic_atomic_counter_add:
opcode = TGSI_OPCODE_ATOMUADD;
- else if (!strcmp("__intrinsic_atomic_min", callee))
+ break;
+ case ir_intrinsic_atomic_counter_min:
opcode = TGSI_OPCODE_ATOMIMIN;
- else if (!strcmp("__intrinsic_atomic_max", callee))
+ break;
+ case ir_intrinsic_atomic_counter_max:
opcode = TGSI_OPCODE_ATOMIMAX;
- else if (!strcmp("__intrinsic_atomic_and", callee))
+ break;
+ case ir_intrinsic_atomic_counter_and:
opcode = TGSI_OPCODE_ATOMAND;
- else if (!strcmp("__intrinsic_atomic_or", callee))
+ break;
+ case ir_intrinsic_atomic_counter_or:
opcode = TGSI_OPCODE_ATOMOR;
- else if (!strcmp("__intrinsic_atomic_xor", callee))
+ break;
+ case ir_intrinsic_atomic_counter_xor:
opcode = TGSI_OPCODE_ATOMXOR;
- else if (!strcmp("__intrinsic_atomic_exchange", callee))
+ break;
+ case ir_intrinsic_atomic_counter_exchange:
opcode = TGSI_OPCODE_ATOMXCHG;
- else if (!strcmp("__intrinsic_atomic_comp_swap", callee)) {
+ break;
+ case ir_intrinsic_atomic_counter_comp_swap: {
opcode = TGSI_OPCODE_ATOMCAS;
param = param->get_next();
val = ((ir_instruction *)param)->as_rvalue();
val->accept(this);
data2 = this->result;
- } else if (!strcmp("__intrinsic_atomic_sub", callee)) {
- opcode = TGSI_OPCODE_ATOMUADD;
- st_src_reg res = get_temp(glsl_type::uvec4_type);
- st_dst_reg dstres = st_dst_reg(res);
- dstres.writemask = dst.writemask;
- emit_asm(ir, TGSI_OPCODE_INEG, dstres, data);
- data = res;
- } else {
+ break;
+ }
+ default:
assert(!"Unexpected intrinsic");
return;
}
inst = emit_asm(ir, opcode, dst, offset, data, data2);
}
- inst->buffer = buffer;
+ inst->resource = buffer;
}
void
glsl_to_tgsi_visitor::visit_ssbo_intrinsic(ir_call *ir)
{
- const char *callee = ir->callee->function_name();
exec_node *param = ir->actual_parameters.get_head();
ir_rvalue *block = ((ir_instruction *)param)->as_rvalue();
if (!const_block) {
block->accept(this);
- emit_arl(ir, sampler_reladdr, this->result);
buffer.reladdr = ralloc(mem_ctx, st_src_reg);
- memcpy(buffer.reladdr, &sampler_reladdr, sizeof(sampler_reladdr));
+ *buffer.reladdr = this->result;
+ emit_arl(ir, sampler_reladdr, this->result);
}
/* Calculate the surface offset */
glsl_to_tgsi_instruction *inst;
- if (!strcmp("__intrinsic_load_ssbo", callee)) {
+ if (ir->callee->intrinsic_id == ir_intrinsic_ssbo_load) {
inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, off);
if (dst.type == GLSL_TYPE_BOOL)
emit_asm(ir, TGSI_OPCODE_USNE, dst, st_src_reg(dst), st_src_reg_for_int(0));
- } else if (!strcmp("__intrinsic_store_ssbo", callee)) {
+ } else if (ir->callee->intrinsic_id == ir_intrinsic_ssbo_store) {
param = param->get_next();
ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
val->accept(this);
st_src_reg data = this->result, data2 = undef_src;
unsigned opcode;
- if (!strcmp("__intrinsic_atomic_add_ssbo", callee))
+ switch (ir->callee->intrinsic_id) {
+ case ir_intrinsic_ssbo_atomic_add:
opcode = TGSI_OPCODE_ATOMUADD;
- else if (!strcmp("__intrinsic_atomic_min_ssbo", callee))
+ break;
+ case ir_intrinsic_ssbo_atomic_min:
opcode = TGSI_OPCODE_ATOMIMIN;
- else if (!strcmp("__intrinsic_atomic_max_ssbo", callee))
+ break;
+ case ir_intrinsic_ssbo_atomic_max:
opcode = TGSI_OPCODE_ATOMIMAX;
- else if (!strcmp("__intrinsic_atomic_and_ssbo", callee))
+ break;
+ case ir_intrinsic_ssbo_atomic_and:
opcode = TGSI_OPCODE_ATOMAND;
- else if (!strcmp("__intrinsic_atomic_or_ssbo", callee))
+ break;
+ case ir_intrinsic_ssbo_atomic_or:
opcode = TGSI_OPCODE_ATOMOR;
- else if (!strcmp("__intrinsic_atomic_xor_ssbo", callee))
+ break;
+ case ir_intrinsic_ssbo_atomic_xor:
opcode = TGSI_OPCODE_ATOMXOR;
- else if (!strcmp("__intrinsic_atomic_exchange_ssbo", callee))
+ break;
+ case ir_intrinsic_ssbo_atomic_exchange:
opcode = TGSI_OPCODE_ATOMXCHG;
- else if (!strcmp("__intrinsic_atomic_comp_swap_ssbo", callee)) {
+ break;
+ case ir_intrinsic_ssbo_atomic_comp_swap:
opcode = TGSI_OPCODE_ATOMCAS;
param = param->get_next();
val = ((ir_instruction *)param)->as_rvalue();
val->accept(this);
data2 = this->result;
- } else {
+ break;
+ default:
assert(!"Unexpected intrinsic");
return;
}
*/
unsigned op = inst->op;
do {
- inst->buffer = buffer;
+ inst->resource = buffer;
if (access)
inst->buffer_access = access->value.u[0];
+
+ if (inst == this->instructions.get_head_raw())
+ break;
inst = (glsl_to_tgsi_instruction *)inst->get_prev();
- if (inst->op == TGSI_OPCODE_UADD)
+
+ if (inst->op == TGSI_OPCODE_UADD) {
+ if (inst == this->instructions.get_head_raw())
+ break;
inst = (glsl_to_tgsi_instruction *)inst->get_prev();
- } while (inst && inst->buffer.file == PROGRAM_UNDEFINED && inst->op == op);
+ }
+ } while (inst->op == op && inst->resource.file == PROGRAM_UNDEFINED);
}
void
glsl_to_tgsi_visitor::visit_membar_intrinsic(ir_call *ir)
{
- const char *callee = ir->callee->function_name();
-
- if (!strcmp("__intrinsic_memory_barrier", callee))
+ switch (ir->callee->intrinsic_id) {
+ case ir_intrinsic_memory_barrier:
emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER |
TGSI_MEMBAR_ATOMIC_BUFFER |
TGSI_MEMBAR_SHADER_IMAGE |
TGSI_MEMBAR_SHARED));
- else if (!strcmp("__intrinsic_memory_barrier_atomic_counter", callee))
+ break;
+ case ir_intrinsic_memory_barrier_atomic_counter:
emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
st_src_reg_for_int(TGSI_MEMBAR_ATOMIC_BUFFER));
- else if (!strcmp("__intrinsic_memory_barrier_buffer", callee))
+ break;
+ case ir_intrinsic_memory_barrier_buffer:
emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER));
- else if (!strcmp("__intrinsic_memory_barrier_image", callee))
+ break;
+ case ir_intrinsic_memory_barrier_image:
emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
st_src_reg_for_int(TGSI_MEMBAR_SHADER_IMAGE));
- else if (!strcmp("__intrinsic_memory_barrier_shared", callee))
+ break;
+ case ir_intrinsic_memory_barrier_shared:
emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
st_src_reg_for_int(TGSI_MEMBAR_SHARED));
- else if (!strcmp("__intrinsic_group_memory_barrier", callee))
+ break;
+ case ir_intrinsic_group_memory_barrier:
emit_asm(ir, TGSI_OPCODE_MEMBAR, undef_dst,
st_src_reg_for_int(TGSI_MEMBAR_SHADER_BUFFER |
TGSI_MEMBAR_ATOMIC_BUFFER |
TGSI_MEMBAR_SHADER_IMAGE |
TGSI_MEMBAR_SHARED |
TGSI_MEMBAR_THREAD_GROUP));
- else
+ break;
+ default:
assert(!"Unexpected memory barrier intrinsic");
+ }
}
void
glsl_to_tgsi_visitor::visit_shared_intrinsic(ir_call *ir)
{
- const char *callee = ir->callee->function_name();
exec_node *param = ir->actual_parameters.get_head();
ir_rvalue *offset = ((ir_instruction *)param)->as_rvalue();
glsl_to_tgsi_instruction *inst;
- if (!strcmp("__intrinsic_load_shared", callee)) {
+ if (ir->callee->intrinsic_id == ir_intrinsic_shared_load) {
inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, off);
- inst->buffer = buffer;
- } else if (!strcmp("__intrinsic_store_shared", callee)) {
+ inst->resource = buffer;
+ } else if (ir->callee->intrinsic_id == ir_intrinsic_shared_store) {
param = param->get_next();
ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
val->accept(this);
dst.type = this->result.type;
inst = emit_asm(ir, TGSI_OPCODE_STORE, dst, off, this->result);
- inst->buffer = buffer;
+ inst->resource = buffer;
} else {
param = param->get_next();
ir_rvalue *val = ((ir_instruction *)param)->as_rvalue();
st_src_reg data = this->result, data2 = undef_src;
unsigned opcode;
- if (!strcmp("__intrinsic_atomic_add_shared", callee))
+ switch (ir->callee->intrinsic_id) {
+ case ir_intrinsic_shared_atomic_add:
opcode = TGSI_OPCODE_ATOMUADD;
- else if (!strcmp("__intrinsic_atomic_min_shared", callee))
+ break;
+ case ir_intrinsic_shared_atomic_min:
opcode = TGSI_OPCODE_ATOMIMIN;
- else if (!strcmp("__intrinsic_atomic_max_shared", callee))
+ break;
+ case ir_intrinsic_shared_atomic_max:
opcode = TGSI_OPCODE_ATOMIMAX;
- else if (!strcmp("__intrinsic_atomic_and_shared", callee))
+ break;
+ case ir_intrinsic_shared_atomic_and:
opcode = TGSI_OPCODE_ATOMAND;
- else if (!strcmp("__intrinsic_atomic_or_shared", callee))
+ break;
+ case ir_intrinsic_shared_atomic_or:
opcode = TGSI_OPCODE_ATOMOR;
- else if (!strcmp("__intrinsic_atomic_xor_shared", callee))
+ break;
+ case ir_intrinsic_shared_atomic_xor:
opcode = TGSI_OPCODE_ATOMXOR;
- else if (!strcmp("__intrinsic_atomic_exchange_shared", callee))
+ break;
+ case ir_intrinsic_shared_atomic_exchange:
opcode = TGSI_OPCODE_ATOMXCHG;
- else if (!strcmp("__intrinsic_atomic_comp_swap_shared", callee)) {
+ break;
+ case ir_intrinsic_shared_atomic_comp_swap:
opcode = TGSI_OPCODE_ATOMCAS;
param = param->get_next();
val = ((ir_instruction *)param)->as_rvalue();
val->accept(this);
data2 = this->result;
- } else {
+ break;
+ default:
assert(!"Unexpected intrinsic");
return;
}
inst = emit_asm(ir, opcode, dst, off, data, data2);
- inst->buffer = buffer;
+ inst->resource = buffer;
}
}
void
glsl_to_tgsi_visitor::visit_image_intrinsic(ir_call *ir)
{
- const char *callee = ir->callee->function_name();
exec_node *param = ir->actual_parameters.get_head();
ir_dereference *img = (ir_dereference *)param;
st_src_reg image(PROGRAM_IMAGE, 0, GLSL_TYPE_UINT);
get_deref_offsets(img, &sampler_array_size, &sampler_base,
- (unsigned int *)&image.index, &reladdr);
+ (uint16_t*)&image.index, &reladdr, true);
+
if (reladdr.file != PROGRAM_UNDEFINED) {
- emit_arl(ir, sampler_reladdr, reladdr);
image.reladdr = ralloc(mem_ctx, st_src_reg);
- memcpy(image.reladdr, &sampler_reladdr, sizeof(reladdr));
+ *image.reladdr = reladdr;
+ emit_arl(ir, sampler_reladdr, reladdr);
}
st_dst_reg dst = undef_dst;
glsl_to_tgsi_instruction *inst;
- if (!strcmp("__intrinsic_image_size", callee)) {
+ if (ir->callee->intrinsic_id == ir_intrinsic_image_size) {
dst.writemask = WRITEMASK_XYZ;
inst = emit_asm(ir, TGSI_OPCODE_RESQ, dst);
- } else if (!strcmp("__intrinsic_image_samples", callee)) {
+ } else if (ir->callee->intrinsic_id == ir_intrinsic_image_samples) {
st_src_reg res = get_temp(glsl_type::ivec4_type);
st_dst_reg dstres = st_dst_reg(res);
dstres.writemask = WRITEMASK_W;
- emit_asm(ir, TGSI_OPCODE_RESQ, dstres);
+ inst = emit_asm(ir, TGSI_OPCODE_RESQ, dstres);
res.swizzle = SWIZZLE_WWWW;
- inst = emit_asm(ir, TGSI_OPCODE_MOV, dst, res);
+ emit_asm(ir, TGSI_OPCODE_MOV, dst, res);
} else {
st_src_reg arg1 = undef_src, arg2 = undef_src;
st_src_reg coord;
assert(param->is_tail_sentinel());
unsigned opcode;
- if (!strcmp("__intrinsic_image_load", callee))
+ switch (ir->callee->intrinsic_id) {
+ case ir_intrinsic_image_load:
opcode = TGSI_OPCODE_LOAD;
- else if (!strcmp("__intrinsic_image_store", callee))
+ break;
+ case ir_intrinsic_image_store:
opcode = TGSI_OPCODE_STORE;
- else if (!strcmp("__intrinsic_image_atomic_add", callee))
+ break;
+ case ir_intrinsic_image_atomic_add:
opcode = TGSI_OPCODE_ATOMUADD;
- else if (!strcmp("__intrinsic_image_atomic_min", callee))
+ break;
+ case ir_intrinsic_image_atomic_min:
opcode = TGSI_OPCODE_ATOMIMIN;
- else if (!strcmp("__intrinsic_image_atomic_max", callee))
+ break;
+ case ir_intrinsic_image_atomic_max:
opcode = TGSI_OPCODE_ATOMIMAX;
- else if (!strcmp("__intrinsic_image_atomic_and", callee))
+ break;
+ case ir_intrinsic_image_atomic_and:
opcode = TGSI_OPCODE_ATOMAND;
- else if (!strcmp("__intrinsic_image_atomic_or", callee))
+ break;
+ case ir_intrinsic_image_atomic_or:
opcode = TGSI_OPCODE_ATOMOR;
- else if (!strcmp("__intrinsic_image_atomic_xor", callee))
+ break;
+ case ir_intrinsic_image_atomic_xor:
opcode = TGSI_OPCODE_ATOMXOR;
- else if (!strcmp("__intrinsic_image_atomic_exchange", callee))
+ break;
+ case ir_intrinsic_image_atomic_exchange:
opcode = TGSI_OPCODE_ATOMXCHG;
- else if (!strcmp("__intrinsic_image_atomic_comp_swap", callee))
+ break;
+ case ir_intrinsic_image_atomic_comp_swap:
opcode = TGSI_OPCODE_ATOMCAS;
- else {
+ break;
+ default:
assert(!"Unexpected intrinsic");
return;
}
inst->dst[0].writemask = WRITEMASK_XYZW;
}
- inst->buffer = image;
+ inst->resource = image;
inst->sampler_array_size = sampler_array_size;
inst->sampler_base = sampler_base;
- switch (type->sampler_dimensionality) {
- case GLSL_SAMPLER_DIM_1D:
- inst->tex_target = (type->sampler_array)
- ? TEXTURE_1D_ARRAY_INDEX : TEXTURE_1D_INDEX;
- break;
- case GLSL_SAMPLER_DIM_2D:
- inst->tex_target = (type->sampler_array)
- ? TEXTURE_2D_ARRAY_INDEX : TEXTURE_2D_INDEX;
- break;
- case GLSL_SAMPLER_DIM_3D:
- inst->tex_target = TEXTURE_3D_INDEX;
- break;
- case GLSL_SAMPLER_DIM_CUBE:
- inst->tex_target = (type->sampler_array)
- ? TEXTURE_CUBE_ARRAY_INDEX : TEXTURE_CUBE_INDEX;
- break;
- case GLSL_SAMPLER_DIM_RECT:
- inst->tex_target = TEXTURE_RECT_INDEX;
- break;
- case GLSL_SAMPLER_DIM_BUF:
- inst->tex_target = TEXTURE_BUFFER_INDEX;
- break;
- case GLSL_SAMPLER_DIM_EXTERNAL:
- inst->tex_target = TEXTURE_EXTERNAL_INDEX;
- break;
- case GLSL_SAMPLER_DIM_MS:
- inst->tex_target = (type->sampler_array)
- ? TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX : TEXTURE_2D_MULTISAMPLE_INDEX;
- break;
- default:
- assert(!"Should not get here.");
- }
-
+ inst->tex_target = type->sampler_index();
inst->image_format = st_mesa_format_to_pipe_format(st_context(ctx),
_mesa_get_shader_image_format(imgvar->data.image_format));
- if (imgvar->data.image_coherent)
+ if (imgvar->data.memory_coherent)
inst->buffer_access |= TGSI_MEMORY_COHERENT;
- if (imgvar->data.image_restrict)
+ if (imgvar->data.memory_restrict)
inst->buffer_access |= TGSI_MEMORY_RESTRICT;
- if (imgvar->data.image_volatile)
+ if (imgvar->data.memory_volatile)
inst->buffer_access |= TGSI_MEMORY_VOLATILE;
}
+/* Handle an intrinsic call that maps 1:1 onto a single TGSI opcode
+ * (e.g. shader clock, vote/ballot, read_invocation).  The call's
+ * return deref becomes the destination and up to four actual
+ * parameters become the instruction's sources; unused source slots
+ * stay undef_src.
+ */
+void
+glsl_to_tgsi_visitor::visit_generic_intrinsic(ir_call *ir, unsigned op)
+{
+ ir->return_deref->accept(this);
+ st_dst_reg dst = st_dst_reg(this->result);
+
+ st_src_reg src[4] = { undef_src, undef_src, undef_src, undef_src };
+ unsigned num_src = 0;
+ foreach_in_list(ir_rvalue, param, &ir->actual_parameters) {
+ assert(num_src < ARRAY_SIZE(src));
+
+ /* Clear result.file first so the assert below catches a visit
+ * that failed to produce a value for this parameter.
+ */
+ this->result.file = PROGRAM_UNDEFINED;
+ param->accept(this);
+ assert(this->result.file != PROGRAM_UNDEFINED);
+
+ src[num_src] = this->result;
+ num_src++;
+ }
+
+ emit_asm(ir, op, dst, src[0], src[1], src[2], src[3]);
+}
+
void
glsl_to_tgsi_visitor::visit(ir_call *ir)
{
- glsl_to_tgsi_instruction *call_inst;
ir_function_signature *sig = ir->callee;
- const char *callee = sig->function_name();
- function_entry *entry;
- int i;
/* Filter out intrinsics */
- if (!strcmp("__intrinsic_atomic_read", callee) ||
- !strcmp("__intrinsic_atomic_increment", callee) ||
- !strcmp("__intrinsic_atomic_predecrement", callee) ||
- !strcmp("__intrinsic_atomic_add", callee) ||
- !strcmp("__intrinsic_atomic_sub", callee) ||
- !strcmp("__intrinsic_atomic_min", callee) ||
- !strcmp("__intrinsic_atomic_max", callee) ||
- !strcmp("__intrinsic_atomic_and", callee) ||
- !strcmp("__intrinsic_atomic_or", callee) ||
- !strcmp("__intrinsic_atomic_xor", callee) ||
- !strcmp("__intrinsic_atomic_exchange", callee) ||
- !strcmp("__intrinsic_atomic_comp_swap", callee)) {
+ switch (sig->intrinsic_id) {
+ case ir_intrinsic_atomic_counter_read:
+ case ir_intrinsic_atomic_counter_increment:
+ case ir_intrinsic_atomic_counter_predecrement:
+ case ir_intrinsic_atomic_counter_add:
+ case ir_intrinsic_atomic_counter_min:
+ case ir_intrinsic_atomic_counter_max:
+ case ir_intrinsic_atomic_counter_and:
+ case ir_intrinsic_atomic_counter_or:
+ case ir_intrinsic_atomic_counter_xor:
+ case ir_intrinsic_atomic_counter_exchange:
+ case ir_intrinsic_atomic_counter_comp_swap:
visit_atomic_counter_intrinsic(ir);
return;
- }
- if (!strcmp("__intrinsic_load_ssbo", callee) ||
- !strcmp("__intrinsic_store_ssbo", callee) ||
- !strcmp("__intrinsic_atomic_add_ssbo", callee) ||
- !strcmp("__intrinsic_atomic_min_ssbo", callee) ||
- !strcmp("__intrinsic_atomic_max_ssbo", callee) ||
- !strcmp("__intrinsic_atomic_and_ssbo", callee) ||
- !strcmp("__intrinsic_atomic_or_ssbo", callee) ||
- !strcmp("__intrinsic_atomic_xor_ssbo", callee) ||
- !strcmp("__intrinsic_atomic_exchange_ssbo", callee) ||
- !strcmp("__intrinsic_atomic_comp_swap_ssbo", callee)) {
+ case ir_intrinsic_ssbo_load:
+ case ir_intrinsic_ssbo_store:
+ case ir_intrinsic_ssbo_atomic_add:
+ case ir_intrinsic_ssbo_atomic_min:
+ case ir_intrinsic_ssbo_atomic_max:
+ case ir_intrinsic_ssbo_atomic_and:
+ case ir_intrinsic_ssbo_atomic_or:
+ case ir_intrinsic_ssbo_atomic_xor:
+ case ir_intrinsic_ssbo_atomic_exchange:
+ case ir_intrinsic_ssbo_atomic_comp_swap:
visit_ssbo_intrinsic(ir);
return;
- }
- if (!strcmp("__intrinsic_memory_barrier", callee) ||
- !strcmp("__intrinsic_memory_barrier_atomic_counter", callee) ||
- !strcmp("__intrinsic_memory_barrier_buffer", callee) ||
- !strcmp("__intrinsic_memory_barrier_image", callee) ||
- !strcmp("__intrinsic_memory_barrier_shared", callee) ||
- !strcmp("__intrinsic_group_memory_barrier", callee)) {
+ case ir_intrinsic_memory_barrier:
+ case ir_intrinsic_memory_barrier_atomic_counter:
+ case ir_intrinsic_memory_barrier_buffer:
+ case ir_intrinsic_memory_barrier_image:
+ case ir_intrinsic_memory_barrier_shared:
+ case ir_intrinsic_group_memory_barrier:
visit_membar_intrinsic(ir);
return;
- }
- if (!strcmp("__intrinsic_load_shared", callee) ||
- !strcmp("__intrinsic_store_shared", callee) ||
- !strcmp("__intrinsic_atomic_add_shared", callee) ||
- !strcmp("__intrinsic_atomic_min_shared", callee) ||
- !strcmp("__intrinsic_atomic_max_shared", callee) ||
- !strcmp("__intrinsic_atomic_and_shared", callee) ||
- !strcmp("__intrinsic_atomic_or_shared", callee) ||
- !strcmp("__intrinsic_atomic_xor_shared", callee) ||
- !strcmp("__intrinsic_atomic_exchange_shared", callee) ||
- !strcmp("__intrinsic_atomic_comp_swap_shared", callee)) {
+ case ir_intrinsic_shared_load:
+ case ir_intrinsic_shared_store:
+ case ir_intrinsic_shared_atomic_add:
+ case ir_intrinsic_shared_atomic_min:
+ case ir_intrinsic_shared_atomic_max:
+ case ir_intrinsic_shared_atomic_and:
+ case ir_intrinsic_shared_atomic_or:
+ case ir_intrinsic_shared_atomic_xor:
+ case ir_intrinsic_shared_atomic_exchange:
+ case ir_intrinsic_shared_atomic_comp_swap:
visit_shared_intrinsic(ir);
return;
- }
- if (!strcmp("__intrinsic_image_load", callee) ||
- !strcmp("__intrinsic_image_store", callee) ||
- !strcmp("__intrinsic_image_atomic_add", callee) ||
- !strcmp("__intrinsic_image_atomic_min", callee) ||
- !strcmp("__intrinsic_image_atomic_max", callee) ||
- !strcmp("__intrinsic_image_atomic_and", callee) ||
- !strcmp("__intrinsic_image_atomic_or", callee) ||
- !strcmp("__intrinsic_image_atomic_xor", callee) ||
- !strcmp("__intrinsic_image_atomic_exchange", callee) ||
- !strcmp("__intrinsic_image_atomic_comp_swap", callee) ||
- !strcmp("__intrinsic_image_size", callee) ||
- !strcmp("__intrinsic_image_samples", callee)) {
+ case ir_intrinsic_image_load:
+ case ir_intrinsic_image_store:
+ case ir_intrinsic_image_atomic_add:
+ case ir_intrinsic_image_atomic_min:
+ case ir_intrinsic_image_atomic_max:
+ case ir_intrinsic_image_atomic_and:
+ case ir_intrinsic_image_atomic_or:
+ case ir_intrinsic_image_atomic_xor:
+ case ir_intrinsic_image_atomic_exchange:
+ case ir_intrinsic_image_atomic_comp_swap:
+ case ir_intrinsic_image_size:
+ case ir_intrinsic_image_samples:
visit_image_intrinsic(ir);
return;
- }
-
- entry = get_function_signature(sig);
- /* Process in parameters. */
- foreach_two_lists(formal_node, &sig->parameters,
- actual_node, &ir->actual_parameters) {
- ir_rvalue *param_rval = (ir_rvalue *) actual_node;
- ir_variable *param = (ir_variable *) formal_node;
-
- if (param->data.mode == ir_var_function_in ||
- param->data.mode == ir_var_function_inout) {
- variable_storage *storage = find_variable_storage(param);
- assert(storage);
- param_rval->accept(this);
- st_src_reg r = this->result;
-
- st_dst_reg l;
- l.file = storage->file;
- l.index = storage->index;
- l.reladdr = NULL;
- l.writemask = WRITEMASK_XYZW;
-
- for (i = 0; i < type_size(param->type); i++) {
- emit_asm(ir, TGSI_OPCODE_MOV, l, r);
- l.index++;
- r.index++;
- }
- }
- }
-
- /* Emit call instruction */
- call_inst = emit_asm(ir, TGSI_OPCODE_CAL);
- call_inst->function = entry;
-
- /* Process out parameters. */
- foreach_two_lists(formal_node, &sig->parameters,
- actual_node, &ir->actual_parameters) {
- ir_rvalue *param_rval = (ir_rvalue *) actual_node;
- ir_variable *param = (ir_variable *) formal_node;
-
- if (param->data.mode == ir_var_function_out ||
- param->data.mode == ir_var_function_inout) {
- variable_storage *storage = find_variable_storage(param);
- assert(storage);
-
- st_src_reg r;
- r.file = storage->file;
- r.index = storage->index;
- r.reladdr = NULL;
- r.swizzle = SWIZZLE_NOOP;
- r.negate = 0;
+ case ir_intrinsic_shader_clock:
+ visit_generic_intrinsic(ir, TGSI_OPCODE_CLOCK);
+ return;
- param_rval->accept(this);
- st_dst_reg l = st_dst_reg(this->result);
+ case ir_intrinsic_vote_all:
+ visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_ALL);
+ return;
+ case ir_intrinsic_vote_any:
+ visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_ANY);
+ return;
+ case ir_intrinsic_vote_eq:
+ visit_generic_intrinsic(ir, TGSI_OPCODE_VOTE_EQ);
+ return;
+ case ir_intrinsic_ballot:
+ visit_generic_intrinsic(ir, TGSI_OPCODE_BALLOT);
+ return;
+ case ir_intrinsic_read_first_invocation:
+ visit_generic_intrinsic(ir, TGSI_OPCODE_READ_FIRST);
+ return;
+ case ir_intrinsic_read_invocation:
+ visit_generic_intrinsic(ir, TGSI_OPCODE_READ_INVOC);
+ return;
- for (i = 0; i < type_size(param->type); i++) {
- emit_asm(ir, TGSI_OPCODE_MOV, l, r);
- l.index++;
- r.index++;
- }
- }
+ case ir_intrinsic_invalid:
+ case ir_intrinsic_generic_load:
+ case ir_intrinsic_generic_store:
+ case ir_intrinsic_generic_atomic_add:
+ case ir_intrinsic_generic_atomic_and:
+ case ir_intrinsic_generic_atomic_or:
+ case ir_intrinsic_generic_atomic_xor:
+ case ir_intrinsic_generic_atomic_min:
+ case ir_intrinsic_generic_atomic_max:
+ case ir_intrinsic_generic_atomic_exchange:
+ case ir_intrinsic_generic_atomic_comp_swap:
+ unreachable("Invalid intrinsic");
}
-
- /* Process return value. */
- this->result = entry->return_reg;
}
void
-glsl_to_tgsi_visitor::calc_deref_offsets(ir_dereference *head,
- ir_dereference *tail,
+glsl_to_tgsi_visitor::calc_deref_offsets(ir_dereference *tail,
unsigned *array_elements,
- unsigned *base,
- unsigned *index,
+ uint16_t *index,
st_src_reg *indirect,
unsigned *location)
{
const glsl_type *struct_type = deref_record->record->type;
int field_index = deref_record->record->type->field_index(deref_record->field);
- calc_deref_offsets(head, deref_record->record->as_dereference(), array_elements, base, index, indirect, location);
+ calc_deref_offsets(deref_record->record->as_dereference(), array_elements, index, indirect, location);
assert(field_index >= 0);
*location += struct_type->record_location_offset(field_index);
*array_elements *= deref_arr->array->type->length;
- calc_deref_offsets(head, deref_arr->array->as_dereference(), array_elements, base, index, indirect, location);
+ calc_deref_offsets(deref_arr->array->as_dereference(), array_elements, index, indirect, location);
break;
}
default:
glsl_to_tgsi_visitor::get_deref_offsets(ir_dereference *ir,
unsigned *array_size,
unsigned *base,
- unsigned *index,
- st_src_reg *reladdr)
+ uint16_t *index,
+ st_src_reg *reladdr,
+ bool opaque)
{
GLuint shader = _mesa_program_enum_to_shader_stage(this->prog->Target);
unsigned location = 0;
assert(var);
location = var->data.location;
- calc_deref_offsets(ir, ir, array_size, base, index, reladdr, &location);
+ calc_deref_offsets(ir, array_size, index, reladdr, &location);
/*
* If we end up with no indirect then adjust the base to the index,
*array_size = 1;
}
- if (location != 0xffffffff) {
- *base += this->shader_program->UniformStorage[location].opaque[shader].index;
- *index += this->shader_program->UniformStorage[location].opaque[shader].index;
+ if (opaque) {
+ assert(location != 0xffffffff);
+ *base += this->shader_program->data->UniformStorage[location].opaque[shader].index;
+ *index += this->shader_program->data->UniformStorage[location].opaque[shader].index;
+ }
+}
+
+/* Return a gather (TG4) texel offset that is safe to use as a texture
+ * offset operand: if the incoming offset uses relative addressing
+ * (reladdr/reladdr2), copy its XY components into a fresh ivec2 temp
+ * via a MOV and return the temp instead; otherwise return the offset
+ * unchanged.
+ */
+st_src_reg
+glsl_to_tgsi_visitor::canonicalize_gather_offset(st_src_reg offset)
+{
+ if (offset.reladdr || offset.reladdr2) {
+ st_src_reg tmp = get_temp(glsl_type::ivec2_type);
+ st_dst_reg tmp_dst = st_dst_reg(tmp);
+ tmp_dst.writemask = WRITEMASK_XY;
+ emit_asm(NULL, TGSI_OPCODE_MOV, tmp_dst, offset);
+ return tmp;
}
+
+ return offset;
}
void
glsl_to_tgsi_instruction *inst = NULL;
unsigned opcode = TGSI_OPCODE_NOP;
const glsl_type *sampler_type = ir->sampler->type;
- unsigned sampler_array_size = 1, sampler_index = 0, sampler_base = 0;
- bool is_cube_array = false;
+ unsigned sampler_array_size = 1, sampler_base = 0;
+ uint16_t sampler_index = 0;
+ bool is_cube_array = false, is_cube_shadow = false;
unsigned i;
- /* if we are a cube array sampler */
- if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
- sampler_type->sampler_array)) {
- is_cube_array = true;
+ /* if we are a cube array sampler or a cube shadow */
+ if (sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE) {
+ is_cube_array = sampler_type->sampler_array;
+ is_cube_shadow = sampler_type->sampler_shadow;
}
if (ir->coordinate) {
ir->coordinate->accept(this);
/* Put our coords in a temp. We'll need to modify them for shadow,
- * projection, or LOD, so the only case we'd use it as is is if
+ * projection, or LOD, so the only case we'd use it as-is is if
* we're doing plain old texturing. The optimization passes on
* glsl_to_tgsi_visitor should handle cleaning up our mess in that case.
*/
*/
result_src = get_temp(ir->type);
result_dst = st_dst_reg(result_src);
+ result_dst.writemask = (1 << ir->type->vector_elements) - 1;
switch (ir->op) {
case ir_tex:
- opcode = (is_cube_array && ir->shadow_comparitor) ? TGSI_OPCODE_TEX2 : TGSI_OPCODE_TEX;
+ opcode = (is_cube_array && ir->shadow_comparator) ? TGSI_OPCODE_TEX2 : TGSI_OPCODE_TEX;
if (ir->offset) {
ir->offset->accept(this);
offset[0] = this->result;
}
break;
case ir_txb:
- if (is_cube_array ||
- sampler_type == glsl_type::samplerCubeShadow_type) {
+ if (is_cube_array || is_cube_shadow) {
opcode = TGSI_OPCODE_TXB2;
}
else {
}
break;
case ir_txl:
- opcode = is_cube_array ? TGSI_OPCODE_TXL2 : TGSI_OPCODE_TXL;
- ir->lod_info.lod->accept(this);
- lod_info = this->result;
+ if (this->has_tex_txf_lz && ir->lod_info.lod->is_zero()) {
+ opcode = TGSI_OPCODE_TEX_LZ;
+ } else {
+ opcode = is_cube_array ? TGSI_OPCODE_TXL2 : TGSI_OPCODE_TXL;
+ ir->lod_info.lod->accept(this);
+ lod_info = this->result;
+ }
if (ir->offset) {
ir->offset->accept(this);
offset[0] = this->result;
levels_src = get_temp(ir->type);
break;
case ir_txf:
- opcode = TGSI_OPCODE_TXF;
- ir->lod_info.lod->accept(this);
- lod_info = this->result;
+ if (this->has_tex_txf_lz && ir->lod_info.lod->is_zero()) {
+ opcode = TGSI_OPCODE_TXF_LZ;
+ } else {
+ opcode = TGSI_OPCODE_TXF;
+ ir->lod_info.lod->accept(this);
+ lod_info = this->result;
+ }
if (ir->offset) {
ir->offset->accept(this);
offset[0] = this->result;
component = this->result;
if (ir->offset) {
ir->offset->accept(this);
- if (ir->offset->type->base_type == GLSL_TYPE_ARRAY) {
+ if (ir->offset->type->is_array()) {
const glsl_type *elt_type = ir->offset->type->fields.array;
for (i = 0; i < ir->offset->type->length; i++) {
offset[i] = this->result;
offset[i].index += i * type_size(elt_type);
offset[i].type = elt_type->base_type;
offset[i].swizzle = swizzle_for_size(elt_type->vector_elements);
+ offset[i] = canonicalize_gather_offset(offset[i]);
}
} else {
- offset[0] = this->result;
+ offset[0] = canonicalize_gather_offset(this->result);
}
}
break;
* the shadow comparator value must also be projected.
*/
st_src_reg tmp_src = coord;
- if (ir->shadow_comparitor) {
+ if (ir->shadow_comparator) {
/* Slot the shadow value in as the second to last component of the
* coord.
*/
- ir->shadow_comparitor->accept(this);
+ ir->shadow_comparator->accept(this);
tmp_src = get_temp(glsl_type::vec4_type);
st_dst_reg tmp_dst = st_dst_reg(tmp_src);
* comparator was put in the correct place (and projected) by the code,
* above, that handles by-hand projection.
*/
- if (ir->shadow_comparitor && (!ir->projector || opcode == TGSI_OPCODE_TXP)) {
+ if (ir->shadow_comparator && (!ir->projector || opcode == TGSI_OPCODE_TXP)) {
/* Slot the shadow value in as the second to last component of the
* coord.
*/
- ir->shadow_comparitor->accept(this);
+ ir->shadow_comparator->accept(this);
if (is_cube_array) {
cube_sc = get_temp(glsl_type::float_type);
}
get_deref_offsets(ir->sampler, &sampler_array_size, &sampler_base,
- &sampler_index, &reladdr);
+ &sampler_index, &reladdr, true);
if (reladdr.file != PROGRAM_UNDEFINED)
emit_arl(ir, sampler_reladdr, reladdr);
inst = emit_asm(ir, opcode, result_dst, lod_info);
} else if (opcode == TGSI_OPCODE_TXQS) {
inst = emit_asm(ir, opcode, result_dst);
- } else if (opcode == TGSI_OPCODE_TXF) {
- inst = emit_asm(ir, opcode, result_dst, coord);
} else if (opcode == TGSI_OPCODE_TXL2 || opcode == TGSI_OPCODE_TXB2) {
inst = emit_asm(ir, opcode, result_dst, coord, lod_info);
} else if (opcode == TGSI_OPCODE_TEX2) {
inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
} else if (opcode == TGSI_OPCODE_TG4) {
- if (is_cube_array && ir->shadow_comparitor) {
+ if (is_cube_array && ir->shadow_comparator) {
inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
} else {
inst = emit_asm(ir, opcode, result_dst, coord, component);
} else
inst = emit_asm(ir, opcode, result_dst, coord);
- if (ir->shadow_comparitor)
+ if (ir->shadow_comparator)
inst->tex_shadow = GL_TRUE;
- inst->sampler.index = sampler_index;
+ inst->resource.index = sampler_index;
inst->sampler_array_size = sampler_array_size;
inst->sampler_base = sampler_base;
if (reladdr.file != PROGRAM_UNDEFINED) {
- inst->sampler.reladdr = ralloc(mem_ctx, st_src_reg);
- memcpy(inst->sampler.reladdr, &reladdr, sizeof(reladdr));
+ inst->resource.reladdr = ralloc(mem_ctx, st_src_reg);
+ memcpy(inst->resource.reladdr, &reladdr, sizeof(reladdr));
}
if (ir->offset) {
+ if (!inst->tex_offsets)
+ inst->tex_offsets = rzalloc_array(inst, st_src_reg, MAX_GLSL_TEXTURE_OFFSET);
+
for (i = 0; i < MAX_GLSL_TEXTURE_OFFSET && offset[i].file != PROGRAM_UNDEFINED; i++)
inst->tex_offsets[i] = offset[i];
inst->tex_offset_num_offset = i;
}
- switch (sampler_type->sampler_dimensionality) {
- case GLSL_SAMPLER_DIM_1D:
- inst->tex_target = (sampler_type->sampler_array)
- ? TEXTURE_1D_ARRAY_INDEX : TEXTURE_1D_INDEX;
- break;
- case GLSL_SAMPLER_DIM_2D:
- inst->tex_target = (sampler_type->sampler_array)
- ? TEXTURE_2D_ARRAY_INDEX : TEXTURE_2D_INDEX;
- break;
- case GLSL_SAMPLER_DIM_3D:
- inst->tex_target = TEXTURE_3D_INDEX;
- break;
- case GLSL_SAMPLER_DIM_CUBE:
- inst->tex_target = (sampler_type->sampler_array)
- ? TEXTURE_CUBE_ARRAY_INDEX : TEXTURE_CUBE_INDEX;
- break;
- case GLSL_SAMPLER_DIM_RECT:
- inst->tex_target = TEXTURE_RECT_INDEX;
- break;
- case GLSL_SAMPLER_DIM_BUF:
- inst->tex_target = TEXTURE_BUFFER_INDEX;
- break;
- case GLSL_SAMPLER_DIM_EXTERNAL:
- inst->tex_target = TEXTURE_EXTERNAL_INDEX;
- break;
- case GLSL_SAMPLER_DIM_MS:
- inst->tex_target = (sampler_type->sampler_array)
- ? TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX : TEXTURE_2D_MULTISAMPLE_INDEX;
- break;
- default:
- assert(!"Should not get here.");
- }
-
+ inst->tex_target = sampler_type->sampler_index();
inst->tex_type = ir->type->base_type;
this->result = result_src;
void
glsl_to_tgsi_visitor::visit(ir_return *ir)
{
- if (ir->get_value()) {
- st_dst_reg l;
- int i;
-
- assert(current_function);
-
- ir->get_value()->accept(this);
- st_src_reg r = this->result;
-
- l = st_dst_reg(current_function->return_reg);
-
- for (i = 0; i < type_size(current_function->sig->return_type); i++) {
- emit_asm(ir, TGSI_OPCODE_MOV, l, r);
- l.index++;
- r.index++;
- }
- }
+ assert(!ir->get_value());
emit_asm(ir, TGSI_OPCODE_RET);
}
array_sizes = NULL;
max_num_arrays = 0;
next_array = 0;
+ num_inputs = 0;
+ num_outputs = 0;
num_input_arrays = 0;
num_output_arrays = 0;
- next_signature_id = 1;
num_immediates = 0;
- current_function = NULL;
num_address_regs = 0;
samplers_used = 0;
buffers_used = 0;
have_sqrt = false;
have_fma = false;
use_shared_memory = false;
+ has_tex_txf_lz = false;
}
glsl_to_tgsi_visitor::~glsl_to_tgsi_visitor()
st_translate_texture_target(inst->tex_target, inst->tex_shadow);
if (inst->tex_shadow) {
- prog->ShadowSamplers |= 1 << (inst->sampler.index + i);
+ prog->ShadowSamplers |= 1 << (inst->resource.index + i);
}
}
}
- if (inst->buffer.file != PROGRAM_UNDEFINED && (
+
+ if (inst->tex_target == TEXTURE_EXTERNAL_INDEX)
+ prog->ExternalSamplersUsed |= 1 << inst->resource.index;
+
+ if (inst->resource.file != PROGRAM_UNDEFINED && (
is_resource_instruction(inst->op) ||
inst->op == TGSI_OPCODE_STORE)) {
- if (inst->buffer.file == PROGRAM_BUFFER) {
- v->buffers_used |= 1 << inst->buffer.index;
- } else if (inst->buffer.file == PROGRAM_MEMORY) {
+ if (inst->resource.file == PROGRAM_BUFFER) {
+ v->buffers_used |= 1 << inst->resource.index;
+ } else if (inst->resource.file == PROGRAM_MEMORY) {
v->use_shared_memory = true;
} else {
- assert(inst->buffer.file == PROGRAM_IMAGE);
+ assert(inst->resource.file == PROGRAM_IMAGE);
for (int i = 0; i < inst->sampler_array_size; i++) {
unsigned idx = inst->sampler_base + i;
v->images_used |= 1 << idx;
/* Give up if we encounter relative addressing or flow control. */
if (inst->dst[0].reladdr || inst->dst[0].reladdr2 ||
inst->dst[1].reladdr || inst->dst[1].reladdr2 ||
- tgsi_get_opcode_info(inst->op)->is_branch ||
- inst->op == TGSI_OPCODE_BGNSUB ||
+ inst->info->is_branch ||
inst->op == TGSI_OPCODE_CONT ||
inst->op == TGSI_OPCODE_END ||
- inst->op == TGSI_OPCODE_ENDSUB ||
inst->op == TGSI_OPCODE_RET) {
break;
}
&& inst->dst[0].writemask == get_src_arg_mask(inst->dst[0], inst->src[2])) {
inst->op = TGSI_OPCODE_MOV;
+ inst->info = tgsi_get_opcode_info(inst->op);
inst->src[0] = inst->src[1];
}
}
}
}
+void
+glsl_to_tgsi_visitor::get_first_temp_write(int *first_writes)
+{
+ int depth = 0; /* loop depth */
+ int loop_start = -1; /* index of the first active BGNLOOP (if any) */
+ unsigned i = 0, j;
+
+ foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
+ for (j = 0; j < num_inst_dst_regs(inst); j++) {
+ if (inst->dst[j].file == PROGRAM_TEMPORARY) {
+ if (first_writes[inst->dst[j].index] == -1)
+ first_writes[inst->dst[j].index] = (depth == 0) ? i : loop_start;
+ }
+ }
+
+ if (inst->op == TGSI_OPCODE_BGNLOOP) {
+ if(depth++ == 0)
+ loop_start = i;
+ } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
+ if (--depth == 0)
+ loop_start = -1;
+ }
+ assert(depth >= 0);
+ i++;
+ }
+}
+
void
glsl_to_tgsi_visitor::get_first_temp_read(int *first_reads)
{
inst->src[0].file != PROGRAM_ARRAY &&
!inst->src[0].reladdr &&
!inst->src[0].reladdr2 &&
- !inst->src[0].negate) {
+ !inst->src[0].negate &&
+ !inst->src[0].abs) {
for (int i = 0; i < 4; i++) {
if (inst->dst[0].writemask & (1 << i)) {
acp[4 * inst->dst[0].index + i] = inst;
delete inst;
removed++;
} else {
- if (inst->dst[0].type == GLSL_TYPE_DOUBLE) {
+ if (glsl_base_type_is_64bit(inst->dst[0].type)) {
if (inst->dead_mask == WRITEMASK_XY ||
inst->dead_mask == WRITEMASK_ZW)
inst->dst[0].writemask &= ~(inst->dead_mask);
void
glsl_to_tgsi_visitor::merge_registers(void)
{
- int *last_reads = rzalloc_array(mem_ctx, int, this->next_temp);
- int *first_writes = rzalloc_array(mem_ctx, int, this->next_temp);
+ int *last_reads = ralloc_array(mem_ctx, int, this->next_temp);
+ int *first_writes = ralloc_array(mem_ctx, int, this->next_temp);
struct rename_reg_pair *renames = rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
int i, j;
int num_renames = 0;
{
int i = 0;
int new_index = 0;
- int *first_reads = rzalloc_array(mem_ctx, int, this->next_temp);
+ int *first_writes = ralloc_array(mem_ctx, int, this->next_temp);
struct rename_reg_pair *renames = rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
int num_renames = 0;
+
for (i = 0; i < this->next_temp; i++) {
- first_reads[i] = -1;
+ first_writes[i] = -1;
}
- get_first_temp_read(first_reads);
+ get_first_temp_write(first_writes);
for (i = 0; i < this->next_temp; i++) {
- if (first_reads[i] < 0) continue;
+ if (first_writes[i] < 0) continue;
if (i != new_index) {
renames[num_renames].old_reg = i;
renames[num_renames].new_reg = new_index;
rename_temp_registers(num_renames, renames);
this->next_temp = new_index;
ralloc_free(renames);
- ralloc_free(first_reads);
+ ralloc_free(first_writes);
}
/* ------------------------- TGSI conversion stuff -------------------------- */
-struct label {
- unsigned branch_target;
- unsigned token;
-};
/**
* Intermediate state used during shader translation.
struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
struct ureg_src systemValues[SYSTEM_VALUE_MAX];
struct ureg_src shared_memory;
- struct tgsi_texture_offset tex_offsets[MAX_GLSL_TEXTURE_OFFSET];
unsigned *array_sizes;
- struct array_decl *input_arrays;
- struct array_decl *output_arrays;
+ struct inout_decl *input_decls;
+ unsigned num_input_decls;
+ struct inout_decl *output_decls;
+ unsigned num_output_decls;
- const GLuint *inputMapping;
- const GLuint *outputMapping;
+ const ubyte *inputMapping;
+ const ubyte *outputMapping;
- /* For every instruction that contains a label (eg CALL), keep
- * details so that we can go back afterwards and emit the correct
- * tgsi instruction number for each label.
- */
- struct label *labels;
- unsigned labels_size;
- unsigned labels_count;
-
- /* Keep a record of the tgsi instruction number that each mesa
- * instruction starts at, will be used to fix up labels after
- * translation.
- */
- unsigned *insn;
- unsigned insn_size;
- unsigned insn_count;
-
- unsigned procType; /**< TGSI_PROCESSOR_VERTEX/FRAGMENT */
-
- boolean error;
+ unsigned procType; /**< PIPE_SHADER_VERTEX/FRAGMENT */
};
/** Map Mesa's SYSTEM_VALUE_x to TGSI_SEMANTIC_x */
-const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
- /* Vertex shader
- */
- TGSI_SEMANTIC_VERTEXID,
- TGSI_SEMANTIC_INSTANCEID,
- TGSI_SEMANTIC_VERTEXID_NOBASE,
- TGSI_SEMANTIC_BASEVERTEX,
- TGSI_SEMANTIC_BASEINSTANCE,
- TGSI_SEMANTIC_DRAWID,
-
- /* Geometry shader
- */
- TGSI_SEMANTIC_INVOCATIONID,
-
- /* Fragment shader
- */
- TGSI_SEMANTIC_POSITION,
- TGSI_SEMANTIC_FACE,
- TGSI_SEMANTIC_SAMPLEID,
- TGSI_SEMANTIC_SAMPLEPOS,
- TGSI_SEMANTIC_SAMPLEMASK,
- TGSI_SEMANTIC_HELPER_INVOCATION,
-
- /* Tessellation shaders
- */
- TGSI_SEMANTIC_TESSCOORD,
- TGSI_SEMANTIC_VERTICESIN,
- TGSI_SEMANTIC_PRIMID,
- TGSI_SEMANTIC_TESSOUTER,
- TGSI_SEMANTIC_TESSINNER,
-
- /* Compute shaders
- */
- TGSI_SEMANTIC_THREAD_ID,
- TGSI_SEMANTIC_BLOCK_ID,
- TGSI_SEMANTIC_GRID_SIZE,
-};
-
-/**
- * Make note of a branch to a label in the TGSI code.
- * After we've emitted all instructions, we'll go over the list
- * of labels built here and patch the TGSI code with the actual
- * location of each label.
- */
-static unsigned *get_label(struct st_translate *t, unsigned branch_target)
-{
- unsigned i;
-
- if (t->labels_count + 1 >= t->labels_size) {
- t->labels_size = 1 << (util_logbase2(t->labels_size) + 1);
- t->labels = (struct label *)realloc(t->labels,
- t->labels_size * sizeof(struct label));
- if (t->labels == NULL) {
- static unsigned dummy;
- t->error = TRUE;
- return &dummy;
- }
- }
-
- i = t->labels_count++;
- t->labels[i].branch_target = branch_target;
- return &t->labels[i].token;
-}
-
-/**
- * Called prior to emitting the TGSI code for each instruction.
- * Allocate additional space for instructions if needed.
- * Update the insn[] array so the next glsl_to_tgsi_instruction points to
- * the next TGSI instruction.
- */
-static void set_insn_start(struct st_translate *t, unsigned start)
+unsigned
+_mesa_sysval_to_semantic(unsigned sysval)
{
- if (t->insn_count + 1 >= t->insn_size) {
- t->insn_size = 1 << (util_logbase2(t->insn_size) + 1);
- t->insn = (unsigned *)realloc(t->insn, t->insn_size * sizeof(t->insn[0]));
- if (t->insn == NULL) {
- t->error = TRUE;
- return;
- }
+ switch (sysval) {
+ /* Vertex shader */
+ case SYSTEM_VALUE_VERTEX_ID:
+ return TGSI_SEMANTIC_VERTEXID;
+ case SYSTEM_VALUE_INSTANCE_ID:
+ return TGSI_SEMANTIC_INSTANCEID;
+ case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
+ return TGSI_SEMANTIC_VERTEXID_NOBASE;
+ case SYSTEM_VALUE_BASE_VERTEX:
+ return TGSI_SEMANTIC_BASEVERTEX;
+ case SYSTEM_VALUE_BASE_INSTANCE:
+ return TGSI_SEMANTIC_BASEINSTANCE;
+ case SYSTEM_VALUE_DRAW_ID:
+ return TGSI_SEMANTIC_DRAWID;
+
+ /* Geometry shader */
+ case SYSTEM_VALUE_INVOCATION_ID:
+ return TGSI_SEMANTIC_INVOCATIONID;
+
+ /* Fragment shader */
+ case SYSTEM_VALUE_FRAG_COORD:
+ return TGSI_SEMANTIC_POSITION;
+ case SYSTEM_VALUE_FRONT_FACE:
+ return TGSI_SEMANTIC_FACE;
+ case SYSTEM_VALUE_SAMPLE_ID:
+ return TGSI_SEMANTIC_SAMPLEID;
+ case SYSTEM_VALUE_SAMPLE_POS:
+ return TGSI_SEMANTIC_SAMPLEPOS;
+ case SYSTEM_VALUE_SAMPLE_MASK_IN:
+ return TGSI_SEMANTIC_SAMPLEMASK;
+ case SYSTEM_VALUE_HELPER_INVOCATION:
+ return TGSI_SEMANTIC_HELPER_INVOCATION;
+
+ /* Tessellation shader */
+ case SYSTEM_VALUE_TESS_COORD:
+ return TGSI_SEMANTIC_TESSCOORD;
+ case SYSTEM_VALUE_VERTICES_IN:
+ return TGSI_SEMANTIC_VERTICESIN;
+ case SYSTEM_VALUE_PRIMITIVE_ID:
+ return TGSI_SEMANTIC_PRIMID;
+ case SYSTEM_VALUE_TESS_LEVEL_OUTER:
+ return TGSI_SEMANTIC_TESSOUTER;
+ case SYSTEM_VALUE_TESS_LEVEL_INNER:
+ return TGSI_SEMANTIC_TESSINNER;
+
+ /* Compute shader */
+ case SYSTEM_VALUE_LOCAL_INVOCATION_ID:
+ return TGSI_SEMANTIC_THREAD_ID;
+ case SYSTEM_VALUE_WORK_GROUP_ID:
+ return TGSI_SEMANTIC_BLOCK_ID;
+ case SYSTEM_VALUE_NUM_WORK_GROUPS:
+ return TGSI_SEMANTIC_GRID_SIZE;
+ case SYSTEM_VALUE_LOCAL_GROUP_SIZE:
+ return TGSI_SEMANTIC_BLOCK_SIZE;
+
+ /* ARB_shader_ballot */
+ case SYSTEM_VALUE_SUBGROUP_SIZE:
+ return TGSI_SEMANTIC_SUBGROUP_SIZE;
+ case SYSTEM_VALUE_SUBGROUP_INVOCATION:
+ return TGSI_SEMANTIC_SUBGROUP_INVOCATION;
+ case SYSTEM_VALUE_SUBGROUP_EQ_MASK:
+ return TGSI_SEMANTIC_SUBGROUP_EQ_MASK;
+ case SYSTEM_VALUE_SUBGROUP_GE_MASK:
+ return TGSI_SEMANTIC_SUBGROUP_GE_MASK;
+ case SYSTEM_VALUE_SUBGROUP_GT_MASK:
+ return TGSI_SEMANTIC_SUBGROUP_GT_MASK;
+ case SYSTEM_VALUE_SUBGROUP_LE_MASK:
+ return TGSI_SEMANTIC_SUBGROUP_LE_MASK;
+ case SYSTEM_VALUE_SUBGROUP_LT_MASK:
+ return TGSI_SEMANTIC_SUBGROUP_LT_MASK;
+
+ /* Unhandled */
+ case SYSTEM_VALUE_LOCAL_INVOCATION_INDEX:
+ case SYSTEM_VALUE_GLOBAL_INVOCATION_ID:
+ case SYSTEM_VALUE_VERTEX_CNT:
+ default:
+ assert(!"Unexpected SYSTEM_VALUE_ enum");
+ return TGSI_SEMANTIC_COUNT;
}
-
- t->insn[t->insn_count++] = start;
}
/**
return ureg_DECL_immediate(ureg, &values[0].f, size);
case GL_DOUBLE:
return ureg_DECL_immediate_f64(ureg, (double *)&values[0].f, size);
+ case GL_INT64_ARB:
+ return ureg_DECL_immediate_int64(ureg, (int64_t *)&values[0].f, size);
+ case GL_UNSIGNED_INT64_ARB:
+ return ureg_DECL_immediate_uint64(ureg, (uint64_t *)&values[0].f, size);
case GL_INT:
return ureg_DECL_immediate_int(ureg, &values[0].i, size);
case GL_UNSIGNED_INT:
case PROGRAM_TEMPORARY:
/* Allocate space for temporaries on demand. */
if (index >= t->temps_size) {
- const int inc = 4096;
+ const int inc = align(index - t->temps_size + 1, 4096);
t->temps = (struct ureg_dst*)
realloc(t->temps,
return t->temps[index];
case PROGRAM_ARRAY:
- array = index >> 16;
-
- assert(array < t->num_temp_arrays);
+ assert(array_id && array_id <= t->num_temp_arrays);
+ array = array_id - 1;
if (ureg_dst_is_undef(t->arrays[array]))
t->arrays[array] = ureg_DECL_array_temporary(
t->ureg, t->array_sizes[array], TRUE);
- return ureg_dst_array_offset(t->arrays[array],
- (int)(index & 0xFFFF) - 0x8000);
+ return ureg_dst_array_offset(t->arrays[array], index);
case PROGRAM_OUTPUT:
if (!array_id) {
- if (t->procType == TGSI_PROCESSOR_FRAGMENT)
- assert(index < FRAG_RESULT_MAX);
- else if (t->procType == TGSI_PROCESSOR_TESS_CTRL ||
- t->procType == TGSI_PROCESSOR_TESS_EVAL)
+ if (t->procType == PIPE_SHADER_FRAGMENT)
+ assert(index < 2 * FRAG_RESULT_MAX);
+ else if (t->procType == PIPE_SHADER_TESS_CTRL ||
+ t->procType == PIPE_SHADER_TESS_EVAL)
assert(index < VARYING_SLOT_TESS_MAX);
else
assert(index < VARYING_SLOT_MAX);
return t->outputs[t->outputMapping[index]];
}
else {
- struct array_decl *decl = &t->output_arrays[array_id-1];
+ struct inout_decl *decl = find_inout_array(t->output_decls, t->num_output_decls, array_id);
unsigned mesa_index = decl->mesa_index;
int slot = t->outputMapping[mesa_index];
assert(slot != -1 && t->outputs[slot].File == TGSI_FILE_OUTPUT);
- assert(t->outputs[slot].ArrayID == array_id);
- return ureg_dst_array_offset(t->outputs[slot], index - mesa_index);
+
+ struct ureg_dst dst = t->outputs[slot];
+ dst.ArrayID = array_id;
+ return ureg_dst_array_offset(dst, index - mesa_index);
}
case PROGRAM_ADDRESS:
case PROGRAM_TEMPORARY:
case PROGRAM_ARRAY:
- case PROGRAM_OUTPUT:
return ureg_src(dst_register(t, reg->file, reg->index, reg->array_id));
+ case PROGRAM_OUTPUT: {
+ struct ureg_dst dst = dst_register(t, reg->file, reg->index, reg->array_id);
+ assert(dst.WriteMask != 0);
+ unsigned shift = ffs(dst.WriteMask) - 1;
+ return ureg_swizzle(ureg_src(dst),
+ shift,
+ MIN2(shift + 1, 3),
+ MIN2(shift + 2, 3),
+ MIN2(shift + 3, 3));
+ }
+
case PROGRAM_UNIFORM:
assert(reg->index >= 0);
return reg->index < t->num_constants ?
return t->inputs[t->inputMapping[index] + double_reg2];
}
else {
- struct array_decl *decl = &t->input_arrays[reg->array_id-1];
+ struct inout_decl *decl = find_inout_array(t->input_decls, t->num_input_decls, reg->array_id);
unsigned mesa_index = decl->mesa_index;
int slot = t->inputMapping[mesa_index];
assert(slot != -1 && t->inputs[slot].File == TGSI_FILE_INPUT);
- assert(t->inputs[slot].ArrayID == reg->array_id);
- return ureg_src_array_offset(t->inputs[slot], index + double_reg2 - mesa_index);
+
+ struct ureg_src src = t->inputs[slot];
+ src.ArrayID = reg->array_id;
+ return ureg_src_array_offset(src, index + double_reg2 - mesa_index);
}
case PROGRAM_ADDRESS:
GET_SWZ(src_reg->swizzle, 2) & 0x3,
GET_SWZ(src_reg->swizzle, 3) & 0x3);
+ if (src_reg->abs)
+ src = ureg_abs(src);
+
if ((src_reg->negate & 0xf) == NEGATE_XYZW)
src = ureg_negate(src);
static struct tgsi_texture_offset
translate_tex_offset(struct st_translate *t,
- const st_src_reg *in_offset, int idx)
+ const st_src_reg *in_offset)
{
struct tgsi_texture_offset offset;
- struct ureg_src imm_src;
- struct ureg_dst dst;
- int array;
-
- switch (in_offset->file) {
- case PROGRAM_IMMEDIATE:
- assert(in_offset->index >= 0 && in_offset->index < t->num_immediates);
- imm_src = t->immediates[in_offset->index];
+ struct ureg_src src = translate_src(t, in_offset);
- offset.File = imm_src.File;
- offset.Index = imm_src.Index;
- offset.SwizzleX = imm_src.SwizzleX;
- offset.SwizzleY = imm_src.SwizzleY;
- offset.SwizzleZ = imm_src.SwizzleZ;
- offset.Padding = 0;
- break;
- case PROGRAM_TEMPORARY:
- imm_src = ureg_src(t->temps[in_offset->index]);
- offset.File = imm_src.File;
- offset.Index = imm_src.Index;
- offset.SwizzleX = GET_SWZ(in_offset->swizzle, 0);
- offset.SwizzleY = GET_SWZ(in_offset->swizzle, 1);
- offset.SwizzleZ = GET_SWZ(in_offset->swizzle, 2);
- offset.Padding = 0;
- break;
- case PROGRAM_ARRAY:
- array = in_offset->index >> 16;
+ offset.File = src.File;
+ offset.Index = src.Index;
+ offset.SwizzleX = src.SwizzleX;
+ offset.SwizzleY = src.SwizzleY;
+ offset.SwizzleZ = src.SwizzleZ;
+ offset.Padding = 0;
- assert(array >= 0);
- assert(array < (int)t->num_temp_arrays);
+ assert(!src.Indirect);
+ assert(!src.DimIndirect);
+ assert(!src.Dimension);
+ assert(!src.Absolute); /* those shouldn't be used with integers anyway */
+ assert(!src.Negate);
- dst = t->arrays[array];
- offset.File = dst.File;
- offset.Index = dst.Index + (in_offset->index & 0xFFFF) - 0x8000;
- offset.SwizzleX = GET_SWZ(in_offset->swizzle, 0);
- offset.SwizzleY = GET_SWZ(in_offset->swizzle, 1);
- offset.SwizzleZ = GET_SWZ(in_offset->swizzle, 2);
- offset.Padding = 0;
- break;
- default:
- break;
- }
return offset;
}
switch(inst->op) {
case TGSI_OPCODE_BGNLOOP:
- case TGSI_OPCODE_CAL:
case TGSI_OPCODE_ELSE:
case TGSI_OPCODE_ENDLOOP:
case TGSI_OPCODE_IF:
case TGSI_OPCODE_UIF:
assert(num_dst == 0);
- ureg_label_insn(ureg,
- inst->op,
- src, num_src,
- get_label(t,
- inst->op == TGSI_OPCODE_CAL ? inst->function->sig_id : 0));
+ ureg_insn(ureg, inst->op, NULL, 0, src, num_src);
return;
case TGSI_OPCODE_TEX:
+ case TGSI_OPCODE_TEX_LZ:
case TGSI_OPCODE_TXB:
case TGSI_OPCODE_TXD:
case TGSI_OPCODE_TXL:
case TGSI_OPCODE_TXQ:
case TGSI_OPCODE_TXQS:
case TGSI_OPCODE_TXF:
+ case TGSI_OPCODE_TXF_LZ:
case TGSI_OPCODE_TEX2:
case TGSI_OPCODE_TXB2:
case TGSI_OPCODE_TXL2:
case TGSI_OPCODE_TG4:
case TGSI_OPCODE_LODQ:
- src[num_src] = t->samplers[inst->sampler.index];
+ src[num_src] = t->samplers[inst->resource.index];
assert(src[num_src].File != TGSI_FILE_NULL);
- if (inst->sampler.reladdr)
+ if (inst->resource.reladdr)
src[num_src] =
ureg_src_indirect(src[num_src], ureg_src(t->address[2]));
num_src++;
for (i = 0; i < (int)inst->tex_offset_num_offset; i++) {
- texoffsets[i] = translate_tex_offset(t, &inst->tex_offsets[i], i);
+ texoffsets[i] = translate_tex_offset(t, &inst->tex_offsets[i]);
}
tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
for (i = num_src - 1; i >= 0; i--)
src[i + 1] = src[i];
num_src++;
- if (inst->buffer.file == PROGRAM_MEMORY) {
+ if (inst->resource.file == PROGRAM_MEMORY) {
src[0] = t->shared_memory;
- } else if (inst->buffer.file == PROGRAM_BUFFER) {
- src[0] = t->buffers[inst->buffer.index];
+ } else if (inst->resource.file == PROGRAM_BUFFER) {
+ src[0] = t->buffers[inst->resource.index];
} else {
- src[0] = t->images[inst->buffer.index];
+ src[0] = t->images[inst->resource.index];
tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
}
- if (inst->buffer.reladdr)
+ if (inst->resource.reladdr)
src[0] = ureg_src_indirect(src[0], ureg_src(t->address[2]));
assert(src[0].File != TGSI_FILE_NULL);
ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
break;
case TGSI_OPCODE_STORE:
- if (inst->buffer.file == PROGRAM_MEMORY) {
+ if (inst->resource.file == PROGRAM_MEMORY) {
dst[0] = ureg_dst(t->shared_memory);
- } else if (inst->buffer.file == PROGRAM_BUFFER) {
- dst[0] = ureg_dst(t->buffers[inst->buffer.index]);
+ } else if (inst->resource.file == PROGRAM_BUFFER) {
+ dst[0] = ureg_dst(t->buffers[inst->resource.index]);
} else {
- dst[0] = ureg_dst(t->images[inst->buffer.index]);
+ dst[0] = ureg_dst(t->images[inst->resource.index]);
tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
}
dst[0] = ureg_writemask(dst[0], inst->dst[0].writemask);
- if (inst->buffer.reladdr)
+ if (inst->resource.reladdr)
dst[0] = ureg_dst_indirect(dst[0], ureg_src(t->address[2]));
assert(dst[0].File != TGSI_FILE_NULL);
ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
struct ureg_program *ureg,
int wpos_transform_const)
{
- const struct gl_fragment_program *fp =
- (const struct gl_fragment_program *) program;
struct pipe_screen *pscreen = st->pipe->screen;
GLfloat adjX = 0.0f;
GLfloat adjY[2] = { 0.0f, 0.0f };
*
* The bias of the y-coordinate depends on whether y-inversion takes place
* (adjY[1]) or not (adjY[0]), which is in turn dependent on whether we are
- * drawing to an FBO (causes additional inversion), and whether the the pipe
+ * drawing to an FBO (causes additional inversion), and whether the pipe
* driver origin and the requested origin differ (the latter condition is
* stored in the 'invert' variable).
*
* u,i -> l,h: (99.0 + 0.5) * -1 + 100 = 0.5
* u,h -> l,i: (99.5 + 0.5) * -1 + 100 = 0
*/
- if (fp->OriginUpperLeft) {
+ if (program->OriginUpperLeft) {
/* Fragment shader wants origin in upper-left */
if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT)) {
/* the driver supports upper-left origin */
assert(0);
}
- if (fp->PixelCenterInteger) {
+ if (program->PixelCenterInteger) {
/* Fragment shader wants pixel center integer */
if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER)) {
/* the driver supports pixel center integer */
t->inputs[t->inputMapping[VARYING_SLOT_FACE]] = ureg_src(face_temp);
}
-static bool
-find_array(unsigned attr, struct array_decl *arrays, unsigned count,
- unsigned *array_id, unsigned *array_size)
-{
- unsigned i;
-
- for (i = 0; i < count; i++) {
- struct array_decl *decl = &arrays[i];
-
- if (attr == decl->mesa_index) {
- *array_id = decl->array_id;
- *array_size = decl->array_size;
- assert(*array_size);
- return true;
- }
- }
- return false;
-}
-
static void
-emit_compute_block_size(const struct gl_program *program,
+emit_compute_block_size(const struct gl_program *prog,
struct ureg_program *ureg) {
- const struct gl_compute_program *cp =
- (const struct gl_compute_program *)program;
-
ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH,
- cp->LocalSize[0]);
+ prog->info.cs.local_size[0]);
ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT,
- cp->LocalSize[1]);
+ prog->info.cs.local_size[1]);
ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH,
- cp->LocalSize[2]);
+ prog->info.cs.local_size[2]);
+}
+
+struct sort_inout_decls {
+ bool operator()(const struct inout_decl &a, const struct inout_decl &b) const {
+ return mapping[a.mesa_index] < mapping[b.mesa_index];
+ }
+
+ const ubyte *mapping;
+};
+
+/* Sort the given array of decls by the corresponding slot (TGSI file index).
+ *
+ * This is for the benefit of older drivers which are broken when the
+ * declarations aren't sorted in this way.
+ */
+static void
+sort_inout_decls_by_slot(struct inout_decl *decls,
+ unsigned count,
+ const ubyte mapping[])
+{
+ sort_inout_decls sorter;
+ sorter.mapping = mapping;
+ std::sort(decls, decls + count, sorter);
+}
+
+static unsigned
+st_translate_interp(enum glsl_interp_mode glsl_qual, GLuint varying)
+{
+ switch (glsl_qual) {
+ case INTERP_MODE_NONE:
+ if (varying == VARYING_SLOT_COL0 || varying == VARYING_SLOT_COL1)
+ return TGSI_INTERPOLATE_COLOR;
+ return TGSI_INTERPOLATE_PERSPECTIVE;
+ case INTERP_MODE_SMOOTH:
+ return TGSI_INTERPOLATE_PERSPECTIVE;
+ case INTERP_MODE_FLAT:
+ return TGSI_INTERPOLATE_CONSTANT;
+ case INTERP_MODE_NOPERSPECTIVE:
+ return TGSI_INTERPOLATE_LINEAR;
+ default:
+ assert(0 && "unexpected interp mode in st_translate_interp()");
+ return TGSI_INTERPOLATE_PERSPECTIVE;
+ }
}
/**
* \param inputSemanticIndex the semantic index (ex: which texcoord) for
* each input
* \param interpMode the TGSI_INTERPOLATE_LINEAR/PERSP mode for each input
- * \param interpLocation the TGSI_INTERPOLATE_LOC_* location for each input
* \param numOutputs number of output registers used
* \param outputMapping maps Mesa fragment program outputs to TGSI
* generic outputs
glsl_to_tgsi_visitor *program,
const struct gl_program *proginfo,
GLuint numInputs,
- const GLuint inputMapping[],
- const GLuint inputSlotToAttr[],
+ const ubyte inputMapping[],
+ const ubyte inputSlotToAttr[],
const ubyte inputSemanticName[],
const ubyte inputSemanticIndex[],
- const GLuint interpMode[],
- const GLuint interpLocation[],
+ const ubyte interpMode[],
GLuint numOutputs,
- const GLuint outputMapping[],
- const GLuint outputSlotToAttr[],
+ const ubyte outputMapping[],
const ubyte outputSemanticName[],
const ubyte outputSemanticIndex[])
{
assert(numInputs <= ARRAY_SIZE(t->inputs));
assert(numOutputs <= ARRAY_SIZE(t->outputs));
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_FRONT_FACE] ==
- TGSI_SEMANTIC_FACE);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_VERTEX_ID] ==
- TGSI_SEMANTIC_VERTEXID);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_INSTANCE_ID] ==
- TGSI_SEMANTIC_INSTANCEID);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_SAMPLE_ID] ==
- TGSI_SEMANTIC_SAMPLEID);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_SAMPLE_POS] ==
- TGSI_SEMANTIC_SAMPLEPOS);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_SAMPLE_MASK_IN] ==
- TGSI_SEMANTIC_SAMPLEMASK);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_INVOCATION_ID] ==
- TGSI_SEMANTIC_INVOCATIONID);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE] ==
- TGSI_SEMANTIC_VERTEXID_NOBASE);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_BASE_VERTEX] ==
- TGSI_SEMANTIC_BASEVERTEX);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_TESS_COORD] ==
- TGSI_SEMANTIC_TESSCOORD);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_HELPER_INVOCATION] ==
- TGSI_SEMANTIC_HELPER_INVOCATION);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_LOCAL_INVOCATION_ID] ==
- TGSI_SEMANTIC_THREAD_ID);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_WORK_GROUP_ID] ==
- TGSI_SEMANTIC_BLOCK_ID);
- assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_NUM_WORK_GROUPS] ==
- TGSI_SEMANTIC_GRID_SIZE);
-
t = CALLOC_STRUCT(st_translate);
if (!t) {
ret = PIPE_ERROR_OUT_OF_MEMORY;
t->num_temp_arrays = program->next_array;
if (t->num_temp_arrays)
t->arrays = (struct ureg_dst*)
- calloc(1, sizeof(t->arrays[0]) * t->num_temp_arrays);
+ calloc(t->num_temp_arrays, sizeof(t->arrays[0]));
/*
* Declare input attributes.
*/
switch (procType) {
- case TGSI_PROCESSOR_FRAGMENT:
- for (i = 0; i < numInputs; i++) {
- unsigned array_id = 0;
- unsigned array_size;
-
- if (find_array(inputSlotToAttr[i], program->input_arrays,
- program->num_input_arrays, &array_id, &array_size)) {
- /* We've found an array. Declare it so. */
- t->inputs[i] = ureg_DECL_fs_input_cyl_centroid(ureg,
- inputSemanticName[i], inputSemanticIndex[i],
- interpMode[i], 0, interpLocation[i],
- array_id, array_size);
- i += array_size - 1;
- }
- else {
- t->inputs[i] = ureg_DECL_fs_input_cyl_centroid(ureg,
- inputSemanticName[i], inputSemanticIndex[i],
- interpMode[i], 0, interpLocation[i], 0, 1);
+ case PIPE_SHADER_FRAGMENT:
+ case PIPE_SHADER_GEOMETRY:
+ case PIPE_SHADER_TESS_EVAL:
+ case PIPE_SHADER_TESS_CTRL:
+ sort_inout_decls_by_slot(program->inputs, program->num_inputs, inputMapping);
+
+ for (i = 0; i < program->num_inputs; ++i) {
+ struct inout_decl *decl = &program->inputs[i];
+ unsigned slot = inputMapping[decl->mesa_index];
+ struct ureg_src src;
+ ubyte tgsi_usage_mask = decl->usage_mask;
+
+ if (glsl_base_type_is_64bit(decl->base_type)) {
+ if (tgsi_usage_mask == 1)
+ tgsi_usage_mask = TGSI_WRITEMASK_XY;
+ else if (tgsi_usage_mask == 2)
+ tgsi_usage_mask = TGSI_WRITEMASK_ZW;
+ else
+ tgsi_usage_mask = TGSI_WRITEMASK_XYZW;
}
- }
- break;
- case TGSI_PROCESSOR_GEOMETRY:
- case TGSI_PROCESSOR_TESS_EVAL:
- case TGSI_PROCESSOR_TESS_CTRL:
- for (i = 0; i < numInputs; i++) {
- unsigned array_id = 0;
- unsigned array_size;
-
- if (find_array(inputSlotToAttr[i], program->input_arrays,
- program->num_input_arrays, &array_id, &array_size)) {
- /* We've found an array. Declare it so. */
- t->inputs[i] = ureg_DECL_input(ureg, inputSemanticName[i],
- inputSemanticIndex[i],
- array_id, array_size);
- i += array_size - 1;
+
+ unsigned interp_mode = 0;
+ unsigned interp_location = 0;
+ if (procType == PIPE_SHADER_FRAGMENT) {
+ assert(interpMode);
+ interp_mode = interpMode[slot] != TGSI_INTERPOLATE_COUNT ?
+ interpMode[slot] :
+ st_translate_interp(decl->interp, inputSlotToAttr[slot]);
+
+ interp_location = decl->interp_loc;
}
- else {
- t->inputs[i] = ureg_DECL_input(ureg, inputSemanticName[i],
- inputSemanticIndex[i], 0, 1);
+
+ src = ureg_DECL_fs_input_cyl_centroid_layout(ureg,
+ inputSemanticName[slot], inputSemanticIndex[slot],
+ interp_mode, 0, interp_location, slot, tgsi_usage_mask,
+ decl->array_id, decl->size);
+
+ for (unsigned j = 0; j < decl->size; ++j) {
+ if (t->inputs[slot + j].File != TGSI_FILE_INPUT) {
+ /* The ArrayID is set up in dst_register */
+ t->inputs[slot + j] = src;
+ t->inputs[slot + j].ArrayID = 0;
+ t->inputs[slot + j].Index += j;
+ }
}
}
break;
- case TGSI_PROCESSOR_VERTEX:
+ case PIPE_SHADER_VERTEX:
for (i = 0; i < numInputs; i++) {
t->inputs[i] = ureg_DECL_vs_input(ureg, i);
}
break;
- case TGSI_PROCESSOR_COMPUTE:
+ case PIPE_SHADER_COMPUTE:
break;
default:
assert(0);
* Declare output attributes.
*/
switch (procType) {
- case TGSI_PROCESSOR_FRAGMENT:
- case TGSI_PROCESSOR_COMPUTE:
- break;
- case TGSI_PROCESSOR_GEOMETRY:
- case TGSI_PROCESSOR_TESS_EVAL:
- case TGSI_PROCESSOR_TESS_CTRL:
- case TGSI_PROCESSOR_VERTEX:
- for (i = 0; i < numOutputs; i++) {
- unsigned array_id = 0;
- unsigned array_size;
-
- if (find_array(outputSlotToAttr[i], program->output_arrays,
- program->num_output_arrays, &array_id, &array_size)) {
- /* We've found an array. Declare it so. */
- t->outputs[i] = ureg_DECL_output_array(ureg,
- outputSemanticName[i],
- outputSemanticIndex[i],
- array_id, array_size);
- i += array_size - 1;
+ case PIPE_SHADER_FRAGMENT:
+ case PIPE_SHADER_COMPUTE:
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ case PIPE_SHADER_TESS_EVAL:
+ case PIPE_SHADER_TESS_CTRL:
+ case PIPE_SHADER_VERTEX:
+ sort_inout_decls_by_slot(program->outputs, program->num_outputs, outputMapping);
+
+ for (i = 0; i < program->num_outputs; ++i) {
+ struct inout_decl *decl = &program->outputs[i];
+ unsigned slot = outputMapping[decl->mesa_index];
+ struct ureg_dst dst;
+ ubyte tgsi_usage_mask = decl->usage_mask;
+
+ if (glsl_base_type_is_64bit(decl->base_type)) {
+ if (tgsi_usage_mask == 1)
+ tgsi_usage_mask = TGSI_WRITEMASK_XY;
+ else if (tgsi_usage_mask == 2)
+ tgsi_usage_mask = TGSI_WRITEMASK_ZW;
+ else
+ tgsi_usage_mask = TGSI_WRITEMASK_XYZW;
}
- else {
- t->outputs[i] = ureg_DECL_output(ureg,
- outputSemanticName[i],
- outputSemanticIndex[i]);
+
+ dst = ureg_DECL_output_layout(ureg,
+ outputSemanticName[slot], outputSemanticIndex[slot],
+ decl->gs_out_streams,
+ slot, tgsi_usage_mask, decl->array_id, decl->size);
+
+ for (unsigned j = 0; j < decl->size; ++j) {
+ if (t->outputs[slot + j].File != TGSI_FILE_OUTPUT) {
+ /* The ArrayID is set up in dst_register */
+ t->outputs[slot + j] = dst;
+ t->outputs[slot + j].ArrayID = 0;
+ t->outputs[slot + j].Index += j;
+ }
}
}
break;
assert(0);
}
- if (procType == TGSI_PROCESSOR_FRAGMENT) {
- if (program->shader->EarlyFragmentTests)
+ if (procType == PIPE_SHADER_FRAGMENT) {
+ if (program->shader->Program->info.fs.early_fragment_tests)
ureg_property(ureg, TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL, 1);
- if (proginfo->InputsRead & VARYING_BIT_POS) {
+ if (proginfo->info.inputs_read & VARYING_BIT_POS) {
/* Must do this after setting up t->inputs. */
emit_wpos(st_context(ctx), t, proginfo, ureg,
program->wpos_transform_const);
}
- if (proginfo->InputsRead & VARYING_BIT_FACE)
+ if (proginfo->info.inputs_read & VARYING_BIT_FACE)
emit_face_var(ctx, t);
for (i = 0; i < numOutputs; i++) {
}
}
}
- else if (procType == TGSI_PROCESSOR_VERTEX) {
+ else if (procType == PIPE_SHADER_VERTEX) {
for (i = 0; i < numOutputs; i++) {
if (outputSemanticName[i] == TGSI_SEMANTIC_FOG) {
/* force register to contain a fog coordinate in the form (F, 0, 0, 1). */
}
}
- if (procType == TGSI_PROCESSOR_COMPUTE) {
+ if (procType == PIPE_SHADER_COMPUTE) {
emit_compute_block_size(proginfo, ureg);
}
/* Declare misc input registers
*/
{
- GLbitfield sysInputs = proginfo->SystemValuesRead;
+ GLbitfield sysInputs = proginfo->info.system_values_read;
for (i = 0; sysInputs; i++) {
if (sysInputs & (1 << i)) {
- unsigned semName = _mesa_sysval_to_semantic[i];
+ unsigned semName = _mesa_sysval_to_semantic(i);
t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
*/
struct st_context *st = st_context(ctx);
struct pipe_screen *pscreen = st->pipe->screen;
- assert(procType == TGSI_PROCESSOR_VERTEX);
+ assert(procType == PIPE_SHADER_VERTEX);
assert(pscreen->get_shader_param(pscreen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_INTEGERS));
(void) pscreen;
if (!ctx->Const.NativeIntegers) {
}
}
- if (procType == TGSI_PROCESSOR_FRAGMENT &&
+ if (procType == PIPE_SHADER_FRAGMENT &&
semName == TGSI_SEMANTIC_POSITION)
emit_wpos(st_context(ctx), t, proginfo, ureg,
program->wpos_transform_const);
}
t->array_sizes = program->array_sizes;
- t->input_arrays = program->input_arrays;
- t->output_arrays = program->output_arrays;
+ t->input_decls = program->inputs;
+ t->num_input_decls = program->num_inputs;
+ t->output_decls = program->outputs;
+ t->num_output_decls = program->num_outputs;
/* Emit constants and uniforms. TGSI uses a single index space for these,
* so we put all the translated regs in t->constants.
}
}
- if (program->shader) {
- unsigned num_ubos = program->shader->NumUniformBlocks;
-
- for (i = 0; i < num_ubos; i++) {
- unsigned size = program->shader->UniformBlocks[i]->UniformBufferSize;
- unsigned num_const_vecs = (size + 15) / 16;
- unsigned first, last;
- assert(num_const_vecs > 0);
- first = 0;
- last = num_const_vecs > 0 ? num_const_vecs - 1 : 0;
- ureg_DECL_constant2D(t->ureg, first, last, i + 1);
- }
+ for (i = 0; i < proginfo->info.num_ubos; i++) {
+ unsigned size = proginfo->sh.UniformBlocks[i]->UniformBufferSize;
+ unsigned num_const_vecs = (size + 15) / 16;
+ unsigned first, last;
+ assert(num_const_vecs > 0);
+ first = 0;
+ last = num_const_vecs > 0 ? num_const_vecs - 1 : 0;
+ ureg_DECL_constant2D(t->ureg, first, last, i + 1);
}
/* Emit immediate values.
if (program->use_shared_memory)
t->shared_memory = ureg_DECL_memory(ureg, TGSI_MEMORY_TYPE_SHARED);
- for (i = 0; i < program->shader->NumImages; i++) {
+ for (i = 0; i < program->shader->Program->info.num_images; i++) {
if (program->images_used & (1 << i)) {
t->images[i] = ureg_DECL_image(ureg, i,
program->image_targets[i],
/* Emit each instruction in turn:
*/
- foreach_in_list(glsl_to_tgsi_instruction, inst, &program->instructions) {
- set_insn_start(t, ureg_get_instruction_number(ureg));
+ foreach_in_list(glsl_to_tgsi_instruction, inst, &program->instructions)
compile_tgsi_instruction(t, inst);
- }
-
- /* Fix up all emitted labels:
- */
- for (i = 0; i < t->labels_count; i++) {
- ureg_fixup_label(ureg, t->labels[i].token,
- t->insn[t->labels[i].branch_target]);
- }
/* Set the next shader stage hint for VS and TES. */
switch (procType) {
- case TGSI_PROCESSOR_VERTEX:
- case TGSI_PROCESSOR_TESS_EVAL:
+ case PIPE_SHADER_VERTEX:
+ case PIPE_SHADER_TESS_EVAL:
if (program->shader_program->SeparateShader)
break;
switch (i) {
case MESA_SHADER_TESS_CTRL:
- next = TGSI_PROCESSOR_TESS_CTRL;
+ next = PIPE_SHADER_TESS_CTRL;
break;
case MESA_SHADER_TESS_EVAL:
- next = TGSI_PROCESSOR_TESS_EVAL;
+ next = PIPE_SHADER_TESS_EVAL;
break;
case MESA_SHADER_GEOMETRY:
- next = TGSI_PROCESSOR_GEOMETRY;
+ next = PIPE_SHADER_GEOMETRY;
break;
case MESA_SHADER_FRAGMENT:
- next = TGSI_PROCESSOR_FRAGMENT;
+ next = PIPE_SHADER_FRAGMENT;
break;
default:
assert(0);
if (t) {
free(t->arrays);
free(t->temps);
- free(t->insn);
- free(t->labels);
free(t->constants);
t->num_constants = 0;
free(t->immediates);
t->num_immediates = 0;
-
- if (t->error) {
- debug_printf("%s: translate error flag set\n", __func__);
- }
-
FREE(t);
}
* generating Mesa IR.
*/
static struct gl_program *
-get_mesa_program(struct gl_context *ctx,
- struct gl_shader_program *shader_program,
- struct gl_shader *shader)
+get_mesa_program_tgsi(struct gl_context *ctx,
+ struct gl_shader_program *shader_program,
+ struct gl_linked_shader *shader)
{
glsl_to_tgsi_visitor* v;
struct gl_program *prog;
- GLenum target = _mesa_shader_stage_to_program(shader->Stage);
- bool progress;
struct gl_shader_compiler_options *options =
- &ctx->Const.ShaderCompilerOptions[_mesa_shader_enum_to_shader_stage(shader->Type)];
+ &ctx->Const.ShaderCompilerOptions[shader->Stage];
struct pipe_screen *pscreen = ctx->st->pipe->screen;
- unsigned ptarget = st_shader_stage_to_ptarget(shader->Stage);
+ enum pipe_shader_type ptarget = st_shader_stage_to_ptarget(shader->Stage);
+ unsigned skip_merge_registers;
validate_ir_tree(shader->ir);
- prog = ctx->Driver.NewProgram(ctx, target, shader_program->Name);
- if (!prog)
- return NULL;
+ prog = shader->Program;
+
prog->Parameters = _mesa_new_parameter_list();
v = new glsl_to_tgsi_visitor();
v->ctx = ctx;
PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
v->have_fma = pscreen->get_shader_param(pscreen, ptarget,
PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED);
+ v->has_tex_txf_lz = pscreen->get_param(pscreen,
+ PIPE_CAP_TGSI_TEX_TXF_LZ);
+ skip_merge_registers =
+ pscreen->get_shader_param(pscreen, ptarget,
+ PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS);
- _mesa_copy_linked_program_data(shader->Stage, shader_program, prog);
_mesa_generate_parameters_list_for_uniforms(shader_program, shader,
prog->Parameters);
/* Remove reads from output registers. */
- lower_output_reads(shader->Stage, shader->ir);
+ if (!pscreen->get_param(pscreen, PIPE_CAP_TGSI_CAN_READ_OUTPUTS))
+ lower_output_reads(shader->Stage, shader->ir);
/* Emit intermediate IR for main(). */
visit_exec_list(shader->ir, v);
- /* Now emit bodies for any functions that were used. */
- do {
- progress = GL_FALSE;
-
- foreach_in_list(function_entry, entry, &v->function_signatures) {
- if (!entry->bgn_inst) {
- v->current_function = entry;
-
- entry->bgn_inst = v->emit_asm(NULL, TGSI_OPCODE_BGNSUB);
- entry->bgn_inst->function = entry;
-
- visit_exec_list(&entry->sig->body, v);
-
- glsl_to_tgsi_instruction *last;
- last = (glsl_to_tgsi_instruction *)v->instructions.get_tail();
- if (last->op != TGSI_OPCODE_RET)
- v->emit_asm(NULL, TGSI_OPCODE_RET);
-
- glsl_to_tgsi_instruction *end;
- end = v->emit_asm(NULL, TGSI_OPCODE_ENDSUB);
- end->function = entry;
-
- progress = GL_TRUE;
- }
- }
- } while (progress);
-
#if 0
/* Print out some information (for debugging purposes) used by the
* optimization passes. */
{
int i;
- int *first_writes = rzalloc_array(v->mem_ctx, int, v->next_temp);
- int *first_reads = rzalloc_array(v->mem_ctx, int, v->next_temp);
- int *last_writes = rzalloc_array(v->mem_ctx, int, v->next_temp);
- int *last_reads = rzalloc_array(v->mem_ctx, int, v->next_temp);
+ int *first_writes = ralloc_array(v->mem_ctx, int, v->next_temp);
+ int *first_reads = ralloc_array(v->mem_ctx, int, v->next_temp);
+ int *last_writes = ralloc_array(v->mem_ctx, int, v->next_temp);
+ int *last_reads = ralloc_array(v->mem_ctx, int, v->next_temp);
for (i = 0; i < v->next_temp; i++) {
first_writes[i] = -1;
/* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. */
v->simplify_cmp();
- if (shader->Type != GL_TESS_CONTROL_SHADER &&
- shader->Type != GL_TESS_EVALUATION_SHADER)
+ if (shader->Stage != MESA_SHADER_TESS_CTRL &&
+ shader->Stage != MESA_SHADER_TESS_EVAL)
v->copy_propagate();
while (v->eliminate_dead_code());
v->merge_two_dsts();
- v->merge_registers();
+ if (!skip_merge_registers)
+ v->merge_registers();
v->renumber_registers();
/* Write the END instruction. */
_mesa_log("\n\n");
}
- prog->Instructions = NULL;
- prog->NumInstructions = 0;
-
do_set_program_inouts(shader->ir, prog, shader->Stage);
- shrink_array_declarations(v->input_arrays, v->num_input_arrays,
- prog->InputsRead, prog->DoubleInputsRead, prog->PatchInputsRead);
- shrink_array_declarations(v->output_arrays, v->num_output_arrays,
- prog->OutputsWritten, 0ULL, prog->PatchOutputsWritten);
+ _mesa_copy_linked_program_data(shader_program, shader);
+ shrink_array_declarations(v->inputs, v->num_inputs,
+ &prog->info.inputs_read,
+ prog->info.double_inputs_read,
+ &prog->info.patch_inputs_read);
+ shrink_array_declarations(v->outputs, v->num_outputs,
+ &prog->info.outputs_written, 0ULL,
+ &prog->info.patch_outputs_written);
count_resources(v, prog);
/* The GLSL IR won't be needed anymore. */
shader->ir = NULL;
/* This must be done before the uniform storage is associated. */
- if (shader->Type == GL_FRAGMENT_SHADER &&
- (prog->InputsRead & VARYING_BIT_POS ||
- prog->SystemValuesRead & (1 << SYSTEM_VALUE_FRAG_COORD))) {
+ if (shader->Stage == MESA_SHADER_FRAGMENT &&
+ (prog->info.inputs_read & VARYING_BIT_POS ||
+ prog->info.system_values_read & (1 << SYSTEM_VALUE_FRAG_COORD))) {
static const gl_state_index wposTransformState[STATE_LENGTH] = {
STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM
};
wposTransformState);
}
- _mesa_reference_program(ctx, &shader->Program, prog);
-
/* Avoid reallocation of the program parameter list, because the uniform
* storage is only associated with the original parameter list.
* This should be enough for Bitmap and DrawPixels constants.
* prog->ParameterValues to get reallocated (e.g., anything that adds a
* program constant) has to happen before creating this linkage.
*/
- _mesa_associate_uniform_storage(ctx, shader_program, prog->Parameters);
- if (!shader_program->LinkStatus) {
+ _mesa_associate_uniform_storage(ctx, shader_program, prog->Parameters,
+ true);
+ if (!shader_program->data->LinkStatus) {
free_glsl_to_tgsi_visitor(v);
+ _mesa_reference_program(ctx, &shader->Program, NULL);
return NULL;
}
struct st_vertex_program *stvp;
struct st_fragment_program *stfp;
- struct st_geometry_program *stgp;
- struct st_tessctrl_program *sttcp;
- struct st_tesseval_program *sttep;
+ struct st_common_program *stgp;
+ struct st_common_program *sttcp;
+ struct st_common_program *sttep;
struct st_compute_program *stcp;
- switch (shader->Type) {
- case GL_VERTEX_SHADER:
+ switch (shader->Stage) {
+ case MESA_SHADER_VERTEX:
stvp = (struct st_vertex_program *)prog;
stvp->glsl_to_tgsi = v;
break;
- case GL_FRAGMENT_SHADER:
+ case MESA_SHADER_FRAGMENT:
stfp = (struct st_fragment_program *)prog;
stfp->glsl_to_tgsi = v;
break;
- case GL_GEOMETRY_SHADER:
- stgp = (struct st_geometry_program *)prog;
+ case MESA_SHADER_GEOMETRY:
+ stgp = (struct st_common_program *)prog;
stgp->glsl_to_tgsi = v;
break;
- case GL_TESS_CONTROL_SHADER:
- sttcp = (struct st_tessctrl_program *)prog;
+ case MESA_SHADER_TESS_CTRL:
+ sttcp = (struct st_common_program *)prog;
sttcp->glsl_to_tgsi = v;
break;
- case GL_TESS_EVALUATION_SHADER:
- sttep = (struct st_tesseval_program *)prog;
+ case MESA_SHADER_TESS_EVAL:
+ sttep = (struct st_common_program *)prog;
sttep->glsl_to_tgsi = v;
break;
- case GL_COMPUTE_SHADER:
+ case MESA_SHADER_COMPUTE:
stcp = (struct st_compute_program *)prog;
stcp->glsl_to_tgsi = v;
break;
return prog;
}
-extern "C" {
-
-static void
-st_dump_program_for_shader_db(struct gl_context *ctx,
- struct gl_shader_program *prog)
-{
- /* Dump only successfully compiled and linked shaders to the specified
- * file. This is for shader-db.
- *
- * These options allow some pre-processing of shaders while dumping,
- * because some apps have ill-formed shaders.
- */
- const char *dump_filename = os_get_option("ST_DUMP_SHADERS");
- const char *insert_directives = os_get_option("ST_DUMP_INSERT");
-
- if (dump_filename && prog->Name != 0) {
- FILE *f = fopen(dump_filename, "a");
-
- if (f) {
- for (unsigned i = 0; i < prog->NumShaders; i++) {
- const struct gl_shader *sh = prog->Shaders[i];
- const char *source;
- bool skip_version = false;
-
- if (!sh)
- continue;
-
- source = sh->Source;
-
- /* This string mustn't be changed. shader-db uses it to find
- * where the shader begins.
- */
- fprintf(f, "GLSL %s shader %d source for linked program %d:\n",
- _mesa_shader_stage_to_string(sh->Stage),
- i, prog->Name);
-
- /* Dump the forced version if set. */
- if (ctx->Const.ForceGLSLVersion) {
- fprintf(f, "#version %i\n", ctx->Const.ForceGLSLVersion);
- skip_version = true;
- }
+/* See if there are unsupported control flow statements. */
+class ir_control_flow_info_visitor : public ir_hierarchical_visitor {
+private:
+ const struct gl_shader_compiler_options *options;
+public:
+ ir_control_flow_info_visitor(const struct gl_shader_compiler_options *options)
+ : options(options),
+ unsupported(false)
+ {
+ }
- /* Insert directives (optional). */
- if (insert_directives) {
- if (!ctx->Const.ForceGLSLVersion && prog->Version)
- fprintf(f, "#version %i\n", prog->Version);
- fprintf(f, "%s\n", insert_directives);
- skip_version = true;
- }
+ virtual ir_visitor_status visit_enter(ir_function *ir)
+ {
+ /* Other functions are skipped (same as glsl_to_tgsi). */
+ if (strcmp(ir->name, "main") == 0)
+ return visit_continue;
- if (skip_version && strncmp(source, "#version ", 9) == 0) {
- const char *next_line = strstr(source, "\n");
+ return visit_continue_with_parent;
+ }
- if (next_line)
- source = next_line + 1;
- else
- continue;
- }
+ virtual ir_visitor_status visit_enter(ir_call *ir)
+ {
+ if (!ir->callee->is_intrinsic()) {
+ unsupported = true; /* it's a function call */
+ return visit_stop;
+ }
+ return visit_continue;
+ }
- fprintf(f, "%s", source);
- fprintf(f, "\n");
- }
- fclose(f);
+ virtual ir_visitor_status visit_enter(ir_return *ir)
+ {
+ if (options->EmitNoMainReturn) {
+ unsupported = true;
+ return visit_stop;
}
+ return visit_continue;
}
+
+ bool unsupported;
+};
+
+static bool
+has_unsupported_control_flow(exec_list *ir,
+ const struct gl_shader_compiler_options *options)
+{
+ ir_control_flow_info_visitor visitor(options);
+ visit_list_elements(&visitor, ir);
+ return visitor.unsupported;
}
+extern "C" {
+
/**
* Link a shader.
* Called via ctx->Driver.LinkShader()
GLboolean
st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
+ /* Return early if we are loading the shader from on-disk cache */
+ if (st_load_tgsi_from_disk_cache(ctx, prog)) {
+ return GL_TRUE;
+ }
+
struct pipe_screen *pscreen = ctx->st->pipe->screen;
- assert(prog->LinkStatus);
+ assert(prog->data->LinkStatus);
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
if (prog->_LinkedShaders[i] == NULL)
continue;
- bool progress;
- exec_list *ir = prog->_LinkedShaders[i]->ir;
- gl_shader_stage stage = _mesa_shader_enum_to_shader_stage(prog->_LinkedShaders[i]->Type);
+ struct gl_linked_shader *shader = prog->_LinkedShaders[i];
+ exec_list *ir = shader->ir;
+ gl_shader_stage stage = shader->Stage;
const struct gl_shader_compiler_options *options =
&ctx->Const.ShaderCompilerOptions[stage];
- unsigned ptarget = st_shader_stage_to_ptarget(stage);
+ enum pipe_shader_type ptarget = st_shader_stage_to_ptarget(stage);
bool have_dround = pscreen->get_shader_param(pscreen, ptarget,
PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED);
bool have_dfrexp = pscreen->get_shader_param(pscreen, ptarget,
PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED);
+ unsigned if_threshold = pscreen->get_shader_param(pscreen, ptarget,
+ PIPE_SHADER_CAP_LOWER_IF_THRESHOLD);
/* If there are forms of indirect addressing that the driver
* cannot handle, perform the lowering pass.
*/
if (options->EmitNoIndirectInput || options->EmitNoIndirectOutput ||
options->EmitNoIndirectTemp || options->EmitNoIndirectUniform) {
- lower_variable_index_to_cond_assign(prog->_LinkedShaders[i]->Stage, ir,
+ lower_variable_index_to_cond_assign(stage, ir,
options->EmitNoIndirectInput,
options->EmitNoIndirectOutput,
options->EmitNoIndirectTemp,
options->EmitNoIndirectUniform);
}
+ if (!pscreen->get_param(pscreen, PIPE_CAP_INT64_DIVMOD))
+ lower_64bit_integer_instructions(ir, DIV64 | MOD64);
+
if (ctx->Extensions.ARB_shading_language_packing) {
unsigned lower_inst = LOWER_PACK_SNORM_2x16 |
LOWER_UNPACK_SNORM_2x16 |
if (!pscreen->get_param(pscreen, PIPE_CAP_TEXTURE_GATHER_OFFSETS))
lower_offset_arrays(ir);
do_mat_op_to_vec(ir);
+
+ if (stage == MESA_SHADER_FRAGMENT)
+ lower_blend_equation_advanced(shader);
+
lower_instructions(ir,
MOD_TO_FLOOR |
- DIV_TO_MUL_RCP |
+ FDIV_TO_MUL_RCP |
EXP_TO_EXP2 |
LOG_TO_LOG2 |
LDEXP_TO_ARITH |
(have_dround ? 0 : DOPS_TO_DFRAC) |
(options->EmitNoPow ? POW_TO_EXP2 : 0) |
(!ctx->Const.NativeIntegers ? INT_DIV_TO_MUL_RCP : 0) |
- (options->EmitNoSat ? SAT_TO_CLAMP : 0));
+ (options->EmitNoSat ? SAT_TO_CLAMP : 0) |
+ (ctx->Const.ForceGLSLAbsSqrt ? SQRT_TO_ABS_SQRT : 0) |
+ /* Assume that if ARB_gpu_shader5 is not supported
+ * then all of the extended integer functions need
+ * lowering. It may be necessary to add some caps
+ * for individual instructions.
+ */
+ (!ctx->Extensions.ARB_gpu_shader5
+ ? BIT_COUNT_TO_MATH |
+ EXTRACT_TO_SHIFTS |
+ INSERT_TO_SHIFTS |
+ REVERSE_TO_SHIFTS |
+ FIND_LSB_TO_FLOAT_CAST |
+ FIND_MSB_TO_FLOAT_CAST |
+ IMUL_HIGH_TO_MUL
+ : 0));
do_vec_index_to_cond_assign(ir);
lower_vector_insert(ir, true);
lower_discard(ir);
}
- do {
- progress = false;
-
- progress = do_lower_jumps(ir, true, true, options->EmitNoMainReturn, options->EmitNoCont, options->EmitNoLoops) || progress;
-
- progress = do_common_optimization(ir, true, true, options,
- ctx->Const.NativeIntegers)
- || progress;
-
- progress = lower_if_to_cond_assign(ir, options->MaxIfDepth) || progress;
-
- } while (progress);
+ if (ctx->Const.GLSLOptimizeConservatively) {
+ /* Do it once and repeat only if there's unsupported control flow. */
+ do {
+ do_common_optimization(ir, true, true, options,
+ ctx->Const.NativeIntegers);
+ lower_if_to_cond_assign((gl_shader_stage)i, ir,
+ options->MaxIfDepth, if_threshold);
+ } while (has_unsupported_control_flow(ir, options));
+ } else {
+ /* Repeat it until it stops making changes. */
+ bool progress;
+ do {
+ progress = do_common_optimization(ir, true, true, options,
+ ctx->Const.NativeIntegers);
+ progress |= lower_if_to_cond_assign((gl_shader_stage)i, ir,
+ options->MaxIfDepth, if_threshold);
+ } while (progress);
+ }
validate_ir_tree(ir);
}
build_program_resource_list(ctx, prog);
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
- struct gl_program *linked_prog;
-
- if (prog->_LinkedShaders[i] == NULL)
+ struct gl_linked_shader *shader = prog->_LinkedShaders[i];
+ if (shader == NULL)
continue;
- linked_prog = get_mesa_program(ctx, prog, prog->_LinkedShaders[i]);
+ enum pipe_shader_type ptarget =
+ st_shader_stage_to_ptarget(shader->Stage);
+ enum pipe_shader_ir preferred_ir = (enum pipe_shader_ir)
+ pscreen->get_shader_param(pscreen, ptarget,
+ PIPE_SHADER_CAP_PREFERRED_IR);
+
+ struct gl_program *linked_prog = NULL;
+ if (preferred_ir == PIPE_SHADER_IR_NIR) {
+ /* TODO only for GLSL VS/FS/CS for now: */
+ switch (shader->Stage) {
+ case MESA_SHADER_VERTEX:
+ case MESA_SHADER_FRAGMENT:
+ case MESA_SHADER_COMPUTE:
+ linked_prog = st_nir_get_mesa_program(ctx, prog, shader);
+ default:
+ break;
+ }
+ } else {
+ linked_prog = get_mesa_program_tgsi(ctx, prog, shader);
+ }
if (linked_prog) {
- _mesa_reference_program(ctx, &prog->_LinkedShaders[i]->Program,
- linked_prog);
+ st_set_prog_affected_state_flags(linked_prog);
if (!ctx->Driver.ProgramStringNotify(ctx,
_mesa_shader_stage_to_program(i),
linked_prog)) {
- _mesa_reference_program(ctx, &prog->_LinkedShaders[i]->Program,
- NULL);
- _mesa_reference_program(ctx, &linked_prog, NULL);
+ _mesa_reference_program(ctx, &shader->Program, NULL);
return GL_FALSE;
}
}
-
- _mesa_reference_program(ctx, &linked_prog, NULL);
}
- st_dump_program_for_shader_db(ctx, prog);
return GL_TRUE;
}
void
st_translate_stream_output_info(glsl_to_tgsi_visitor *glsl_to_tgsi,
- const GLuint outputMapping[],
+ const ubyte outputMapping[],
struct pipe_stream_output_info *so)
{
- unsigned i;
+ if (!glsl_to_tgsi->shader_program->last_vert_prog)
+ return;
+
struct gl_transform_feedback_info *info =
- &glsl_to_tgsi->shader_program->LinkedTransformFeedback;
+ glsl_to_tgsi->shader_program->last_vert_prog->sh.LinkedTransformFeedback;
+ st_translate_stream_output_info2(info, outputMapping, so);
+}
+
+void
+st_translate_stream_output_info2(struct gl_transform_feedback_info *info,
+ const ubyte outputMapping[],
+ struct pipe_stream_output_info *so)
+{
+ unsigned i;
for (i = 0; i < info->NumOutputs; i++) {
so->output[i].register_index =