st/mesa: s/int/GLenum/ in st_glsl_to_tgsi.cpp

[mesa.git] / src / mesa / state_tracker / st_glsl_to_tgsi.cpp
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp

index 7ccb6e25d689438999ee4e6e2619b35d3c3c58a8..a1e37cfa8541b247925bec3d58b25f28622be830 100644 (file)
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -55,7 +55,7 @@
  #include "st_format.h"
  #include "st_nir.h"
  #include "st_shader_cache.h"
-#include "st_glsl_to_tgsi_private.h"
+#include "st_glsl_to_tgsi_temprename.h"
  
  #include "util/hash_table.h"
  #include <algorithm>
@@ -98,7 +98,7 @@ public:
  
  class immediate_storage : public exec_node {
  public:
-   immediate_storage(gl_constant_value *values, int size32, int type)
+   immediate_storage(gl_constant_value *values, int size32, GLenum type)
     {
        memcpy(this->values, values, size32 * sizeof(gl_constant_value));
        this->size32 = size32;
@@ -108,7 +108,7 @@ public:
     /* doubles are stored across 2 gl_constant_values */
     gl_constant_value values[4];
     int size32; /**< Number of 32-bit components (1-4) */
-   int type; /**< GL_DOUBLE, GL_FLOAT, GL_INT, GL_BOOL, or GL_UNSIGNED_INT */
+   GLenum type; /**< GL_DOUBLE, GL_FLOAT, GL_INT, GL_BOOL, or GL_UNSIGNED_INT */
  };
  
  static const st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
@@ -152,6 +152,13 @@ find_array_type(struct inout_decl *decls, unsigned count, unsigned array_id)
     return GLSL_TYPE_ERROR;
  }
  
+struct hwatomic_decl { /* One record per atomic counter variable; filled into glsl_to_tgsi_visitor::atomic_info[] by visit_atomic_counter_intrinsic() on the has_hw_atomics path. */
+   unsigned location; /**< Copied from the counter variable's data.location */
+   unsigned binding; /**< Copied from the counter variable's data.binding */
+   unsigned size; /**< arrays_of_arrays_size() of the variable's type, forced to 1 when that is 0 */
+   unsigned array_id; /**< 0 when first recorded; set to the incremented num_atomic_arrays (so >= 1) the first time an access produces a register offset */
+};
+
  struct glsl_to_tgsi_visitor : public ir_visitor {
  public:
     glsl_to_tgsi_visitor();
@@ -176,28 +183,31 @@ public:
     unsigned num_outputs;
     unsigned num_output_arrays;
  
+   struct hwatomic_decl atomic_info[PIPE_MAX_HW_ATOMIC_BUFFERS];
+   unsigned num_atomics;
+   unsigned num_atomic_arrays;
     int num_address_regs;
     uint32_t samplers_used;
     glsl_base_type sampler_types[PIPE_MAX_SAMPLERS];
-   int sampler_targets[PIPE_MAX_SAMPLERS];   /**< One of TGSI_TEXTURE_* */
+   enum tgsi_texture_type sampler_targets[PIPE_MAX_SAMPLERS];
     int images_used;
     int image_targets[PIPE_MAX_SHADER_IMAGES];
-   unsigned image_formats[PIPE_MAX_SHADER_IMAGES];
+   enum pipe_format image_formats[PIPE_MAX_SHADER_IMAGES];
     bool indirect_addr_consts;
     int wpos_transform_const;
  
-   int glsl_version;
     bool native_integers;
     bool have_sqrt;
     bool have_fma;
     bool use_shared_memory;
     bool has_tex_txf_lz;
     bool precise;
+   bool need_uarl;
  
     variable_storage *find_variable_storage(ir_variable *var);
  
     int add_constant(gl_register_file file, gl_constant_value values[8],
-                    int size, int datatype, uint16_t *swizzle_out);
+                    int size, GLenum datatype, uint16_t *swizzle_out);
  
     st_src_reg get_temp(const glsl_type *type);
     void reladdr_to_temp(ir_instruction *ir, st_src_reg *reg, int *num_reladdr);
@@ -205,6 +215,7 @@ public:
     st_src_reg st_src_reg_for_double(double val);
     st_src_reg st_src_reg_for_float(float val);
     st_src_reg st_src_reg_for_int(int val);
+   st_src_reg st_src_reg_for_int64(int64_t val);
     st_src_reg st_src_reg_for_type(enum glsl_base_type type, int val);
  
     /**
@@ -352,7 +363,7 @@ fail_link(struct gl_shader_program *prog, const char *fmt, ...)
     ralloc_vasprintf_append(&prog->data->InfoLog, fmt, args);
     va_end(args);
  
-   prog->data->LinkStatus = linking_failure;
+   prog->data->LinkStatus = LINKING_FAILURE;
  }
  
  int
@@ -387,7 +398,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
      * sources into temps.
      */
     num_reladdr += dst.reladdr != NULL || dst.reladdr2;
-   num_reladdr += dst1.reladdr != NULL || dst1.reladdr2;
+   assert(!dst1.reladdr); /* should be lowered in earlier passes */
     num_reladdr += src0.reladdr != NULL || src0.reladdr2 != NULL;
     num_reladdr += src1.reladdr != NULL || src1.reladdr2 != NULL;
     num_reladdr += src2.reladdr != NULL || src2.reladdr2 != NULL;
@@ -405,10 +416,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
           emit_arl(ir, address_reg2, *dst.reladdr2);
        num_reladdr--;
     }
-   if (dst1.reladdr) {
-      emit_arl(ir, address_reg, *dst1.reladdr);
-      num_reladdr--;
-   }
+
     assert(num_reladdr == 0);
  
     /* inst->op has only 8 bits. */
@@ -574,7 +582,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
                 if (swz > 1) {
                    dinst->src[j].double_reg2 = true;
                    dinst->src[j].index++;
-              }
+               }
  
                 if (swz & 1)
                    dinst->src[j].swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W);
@@ -583,10 +591,10 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
  
              } else {
                 /* some opcodes are special case in what they use as sources
-                  - [FUI]2D/[UI]2I64 is a float/[u]int src0, DLDEXP is integer src1 */
+                  - [FUI]2D/[UI]2I64 is a float/[u]int src0, (D)LDEXP is integer src1 */
                 if (op == TGSI_OPCODE_F2D || op == TGSI_OPCODE_U2D || op == TGSI_OPCODE_I2D ||
                     op == TGSI_OPCODE_I2I64 || op == TGSI_OPCODE_U2I64 ||
-                   op == TGSI_OPCODE_DLDEXP ||
+                   op == TGSI_OPCODE_DLDEXP || op == TGSI_OPCODE_LDEXP ||
                     (op == TGSI_OPCODE_UCMP && dst_is_64bit[0])) {
                    dinst->src[j].swizzle = MAKE_SWIZZLE4(swz, swz, swz, swz);
                 }
@@ -802,8 +810,12 @@ glsl_to_tgsi_visitor::emit_arl(ir_instruction *ir,
  {
     int op = TGSI_OPCODE_ARL;
  
-   if (src0.type == GLSL_TYPE_INT || src0.type == GLSL_TYPE_UINT)
+   if (src0.type == GLSL_TYPE_INT || src0.type == GLSL_TYPE_UINT) {
+      if (!this->need_uarl && src0.is_legal_tgsi_address_operand())
+         return;
+
        op = TGSI_OPCODE_UARL;
+   }
  
     assert(dst.file == PROGRAM_ADDRESS);
     if (dst.index >= this->num_address_regs)
@@ -814,7 +826,8 @@ glsl_to_tgsi_visitor::emit_arl(ir_instruction *ir,
  
  int
  glsl_to_tgsi_visitor::add_constant(gl_register_file file,
-                                   gl_constant_value values[8], int size, int datatype,
+                                   gl_constant_value values[8], int size,
+                                   GLenum datatype,
                                     uint16_t *swizzle_out)
  {
     if (file == PROGRAM_CONSTANT) {
@@ -908,6 +921,19 @@ glsl_to_tgsi_visitor::st_src_reg_for_int(int val)
     return src;
  }
  
+st_src_reg
+glsl_to_tgsi_visitor::st_src_reg_for_int64(int64_t val)
+{ /* Returns a PROGRAM_IMMEDIATE source register of type GLSL_TYPE_INT64 whose constant is val. */
+   st_src_reg src(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_INT64); /* index -1 is a placeholder; it is overwritten by add_constant() below */
+   union gl_constant_value uval[2]; /* two slots for one 64-bit value -- assumes gl_constant_value is 32 bits wide; TODO confirm */
+
+   memcpy(uval, &val, sizeof(uval)); /* copy the raw bytes of val into the two slots */
+   src.index = add_constant(src.file, uval, 1, GL_DOUBLE, &src.swizzle); /* registered with size 1 and datatype GL_DOUBLE -- NOTE(review): presumably so it is stored as a two-slot 64-bit immediate; confirm in add_constant() */
+   src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y); /* replaces the swizzle add_constant() wrote with a fixed XYXY */
+
+   return src;
+}
+
  st_src_reg
  glsl_to_tgsi_visitor::st_src_reg_for_type(enum glsl_base_type type, int val)
  {
@@ -930,6 +956,32 @@ type_size(const struct glsl_type *type)
     return type->count_attribute_slots(false);
  }
  
+static void
+add_buffer_to_load_and_stores(glsl_to_tgsi_instruction *inst, st_src_reg *buf,
+                              exec_list *instructions, ir_constant *access)
+{ /* Copies *buf into inst->resource (and access->value.u[0] into inst->buffer_access when access is non-NULL), then does the same for matching instructions that precede inst in the list. */
+   /**
+    * emit_asm() might have actually split the op into pieces, e.g. for
+    * double stores. We have to go back and fix up all the generated ops.
+    */
+   unsigned op = inst->op; /* opcode of the starting instruction; earlier instructions must match it to be patched */
+   do {
+      inst->resource = *buf;
+      if (access) /* access may be NULL; buffer_access is then left untouched */
+         inst->buffer_access = access->value.u[0];
+
+      if (inst == instructions->get_head_raw()) /* reached the first instruction of the list: nothing earlier to visit */
+         break;
+      inst = (glsl_to_tgsi_instruction *)inst->get_prev(); /* step to the previous instruction */
+
+      if (inst->op == TGSI_OPCODE_UADD) { /* a UADD met while walking back is stepped over -- presumably the offset increment emitted between split pieces; TODO confirm */
+         if (inst == instructions->get_head_raw())
+            break;
+         inst = (glsl_to_tgsi_instruction *)inst->get_prev();
+      }
+   } while (inst->op == op && inst->resource.file == PROGRAM_UNDEFINED); /* continue only while the earlier instruction has the same opcode and no resource assigned yet */
+}
+
  /**
   * If the given GLSL type is an array or matrix or a structure containing
   * an array/matrix member, return true.  Else return false.
@@ -1224,7 +1276,7 @@ glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
     if (reg->reladdr2) emit_arl(ir, address_reg2, *reg->reladdr2);
  
     if (*num_reladdr != 1) {
-      st_src_reg temp = get_temp(reg->type == GLSL_TYPE_DOUBLE ? glsl_type::dvec4_type : glsl_type::vec4_type);
+      st_src_reg temp = get_temp(glsl_type::get_instance(reg->type, 4, 1));
  
        emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
        *reg = temp;
@@ -1290,10 +1342,33 @@ glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
     st_dst_reg result_dst;
  
     int vector_elements = ir->operands[0]->type->vector_elements;
-   if (ir->operands[1]) {
+   if (ir->operands[1] &&
+       ir->operation != ir_binop_interpolate_at_offset &&
+       ir->operation != ir_binop_interpolate_at_sample) {
+      st_src_reg *swz_op = NULL;
+      if (vector_elements > ir->operands[1]->type->vector_elements) {
+         assert(ir->operands[1]->type->vector_elements == 1);
+         swz_op = &op[1];
+      } else if (vector_elements < ir->operands[1]->type->vector_elements) {
+         assert(ir->operands[0]->type->vector_elements == 1);
+         swz_op = &op[0];
+      }
+      if (swz_op) {
+         uint16_t swizzle_x = GET_SWZ(swz_op->swizzle, 0);
+         swz_op->swizzle = MAKE_SWIZZLE4(swizzle_x, swizzle_x,
+                                         swizzle_x, swizzle_x);
+      }
        vector_elements = MAX2(vector_elements,
                               ir->operands[1]->type->vector_elements);
     }
+   if (ir->operands[2] &&
+       ir->operands[2]->type->vector_elements != vector_elements) {
+      /* This can happen with ir_triop_lrp, i.e. glsl mix */
+      assert(ir->operands[2]->type->vector_elements == 1);
+      uint16_t swizzle_x = GET_SWZ(op[2].swizzle, 0);
+      op[2].swizzle = MAKE_SWIZZLE4(swizzle_x, swizzle_x,
+                                    swizzle_x, swizzle_x);
+   }
  
     this->result.file = PROGRAM_UNDEFINED;
  
@@ -1458,12 +1533,6 @@ glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
     case ir_binop_less:
        emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
        break;
-   case ir_binop_greater:
-      emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[1], op[0]);
-      break;
-   case ir_binop_lequal:
-      emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[1], op[0]);
-      break;
     case ir_binop_gequal:
        emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
        break;
@@ -1841,90 +1910,125 @@ glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
        break;
  
     case ir_binop_ubo_load: {
-      ir_constant *const_uniform_block = ir->operands[0]->as_constant();
-      ir_constant *const_offset_ir = ir->operands[1]->as_constant();
-      unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
-      unsigned const_block = const_uniform_block ? const_uniform_block->value.u[0] + 1 : 1;
-      st_src_reg index_reg = get_temp(glsl_type::uint_type);
-      st_src_reg cbuf;
-
-      cbuf.type = ir->type->base_type;
-      cbuf.file = PROGRAM_CONSTANT;
-      cbuf.index = 0;
-      cbuf.reladdr = NULL;
-      cbuf.negate = 0;
-      cbuf.abs = 0;
-      cbuf.index2D = const_block;
-
-      assert(ir->type->is_vector() || ir->type->is_scalar());
-
-      if (const_offset_ir) {
-         /* Constant index into constant buffer */
-         cbuf.reladdr = NULL;
-         cbuf.index = const_offset / 16;
-      }
-      else {
-         ir_expression *offset_expr = ir->operands[1]->as_expression();
-         st_src_reg offset = op[1];
-
-         /* The OpenGL spec is written in such a way that accesses with
-          * non-constant offset are almost always vec4-aligned. The only
-          * exception to this are members of structs in arrays of structs:
-          * each struct in an array of structs is at least vec4-aligned,
-          * but single-element and [ui]vec2 members of the struct may be at
-          * an offset that is not a multiple of 16 bytes.
-          *
-          * Here, we extract that offset, relying on previous passes to always
-          * generate offset expressions of the form (+ expr constant_offset).
-          *
-          * Note that the std430 layout, which allows more cases of alignment
-          * less than vec4 in arrays, is not supported for uniform blocks, so
-          * we do not have to deal with it here.
-          */
-         if (offset_expr && offset_expr->operation == ir_binop_add) {
-            const_offset_ir = offset_expr->operands[1]->as_constant();
-            if (const_offset_ir) {
-               const_offset = const_offset_ir->value.u[0];
-               cbuf.index = const_offset / 16;
-               offset_expr->operands[0]->accept(this);
-               offset = this->result;
-            }
+      if (ctx->Const.UseSTD430AsDefaultPacking) {
+         ir_rvalue *block = ir->operands[0];
+         ir_rvalue *offset = ir->operands[1];
+         ir_constant *const_block = block->as_constant();
+
+         st_src_reg cbuf(PROGRAM_CONSTANT,
+            (const_block ? const_block->value.u[0] + 1 : 1),
+            ir->type->base_type);
+
+         cbuf.has_index2 = true;
+
+         if (!const_block) {
+            block->accept(this);
+            cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
+            *cbuf.reladdr = this->result;
+            emit_arl(ir, sampler_reladdr, this->result);
           }
  
-         /* Relative/variable index into constant buffer */
-         emit_asm(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), offset,
-              st_src_reg_for_int(4));
-         cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
-         memcpy(cbuf.reladdr, &index_reg, sizeof(index_reg));
-      }
+         /* Calculate the surface offset */
+         offset->accept(this);
+         st_src_reg off = this->result;
  
-      if (const_uniform_block) {
-         /* Constant constant buffer */
-         cbuf.reladdr2 = NULL;
-      }
-      else {
-         /* Relative/variable constant buffer */
-         cbuf.reladdr2 = ralloc(mem_ctx, st_src_reg);
-         memcpy(cbuf.reladdr2, &op[0], sizeof(st_src_reg));
-      }
-      cbuf.has_index2 = true;
-
-      cbuf.swizzle = swizzle_for_size(ir->type->vector_elements);
-      if (glsl_base_type_is_64bit(cbuf.type))
-         cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 8,
-                                       const_offset % 16 / 8,
-                                       const_offset % 16 / 8,
-                                       const_offset % 16 / 8);
-      else
-         cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 4,
-                                       const_offset % 16 / 4,
-                                       const_offset % 16 / 4,
-                                       const_offset % 16 / 4);
+         glsl_to_tgsi_instruction *inst =
+            emit_asm(ir, TGSI_OPCODE_LOAD, result_dst, off);
+
+         if (result_dst.type == GLSL_TYPE_BOOL)
+            emit_asm(ir, TGSI_OPCODE_USNE, result_dst, st_src_reg(result_dst),
+                     st_src_reg_for_int(0));
  
-      if (ir->type->is_boolean()) {
-         emit_asm(ir, TGSI_OPCODE_USNE, result_dst, cbuf, st_src_reg_for_int(0));
+         add_buffer_to_load_and_stores(inst, &cbuf, &this->instructions,
+                                       NULL);
        } else {
-         emit_asm(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
+         ir_constant *const_uniform_block = ir->operands[0]->as_constant();
+         ir_constant *const_offset_ir = ir->operands[1]->as_constant();
+         unsigned const_offset = const_offset_ir ?
+            const_offset_ir->value.u[0] : 0;
+         unsigned const_block = const_uniform_block ?
+            const_uniform_block->value.u[0] + 1 : 1;
+         st_src_reg index_reg = get_temp(glsl_type::uint_type);
+         st_src_reg cbuf;
+
+         cbuf.type = ir->type->base_type;
+         cbuf.file = PROGRAM_CONSTANT;
+         cbuf.index = 0;
+         cbuf.reladdr = NULL;
+         cbuf.negate = 0;
+         cbuf.abs = 0;
+         cbuf.index2D = const_block;
+
+         assert(ir->type->is_vector() || ir->type->is_scalar());
+
+         if (const_offset_ir) {
+            /* Constant index into constant buffer */
+            cbuf.reladdr = NULL;
+            cbuf.index = const_offset / 16;
+         } else {
+            ir_expression *offset_expr = ir->operands[1]->as_expression();
+            st_src_reg offset = op[1];
+
+            /* The OpenGL spec is written in such a way that accesses with
+             * non-constant offset are almost always vec4-aligned. The only
+             * exception to this are members of structs in arrays of structs:
+             * each struct in an array of structs is at least vec4-aligned,
+             * but single-element and [ui]vec2 members of the struct may be at
+             * an offset that is not a multiple of 16 bytes.
+             *
+             * Here, we extract that offset, relying on previous passes to
+             * always generate offset expressions of the form
+             * (+ expr constant_offset).
+             *
+             * Note that the std430 layout, which allows more cases of
+             * alignment less than vec4 in arrays, is not supported for
+             * uniform blocks, so we do not have to deal with it here.
+             */
+            if (offset_expr && offset_expr->operation == ir_binop_add) {
+               const_offset_ir = offset_expr->operands[1]->as_constant();
+               if (const_offset_ir) {
+                  const_offset = const_offset_ir->value.u[0];
+                  cbuf.index = const_offset / 16;
+                  offset_expr->operands[0]->accept(this);
+                  offset = this->result;
+               }
+            }
+
+            /* Relative/variable index into constant buffer */
+            emit_asm(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), offset,
+                 st_src_reg_for_int(4));
+            cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
+            memcpy(cbuf.reladdr, &index_reg, sizeof(index_reg));
+         }
+
+         if (const_uniform_block) {
+            /* Constant constant buffer */
+            cbuf.reladdr2 = NULL;
+         } else {
+            /* Relative/variable constant buffer */
+            cbuf.reladdr2 = ralloc(mem_ctx, st_src_reg);
+            memcpy(cbuf.reladdr2, &op[0], sizeof(st_src_reg));
+         }
+         cbuf.has_index2 = true;
+
+         cbuf.swizzle = swizzle_for_size(ir->type->vector_elements);
+         if (glsl_base_type_is_64bit(cbuf.type))
+            cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 8,
+                                          const_offset % 16 / 8,
+                                          const_offset % 16 / 8,
+                                          const_offset % 16 / 8);
+         else
+            cbuf.swizzle += MAKE_SWIZZLE4(const_offset % 16 / 4,
+                                          const_offset % 16 / 4,
+                                          const_offset % 16 / 4,
+                                          const_offset % 16 / 4);
+
+         if (ir->type->is_boolean()) {
+            emit_asm(ir, TGSI_OPCODE_USNE, result_dst, cbuf,
+                     st_src_reg_for_int(0));
+         } else {
+            emit_asm(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
+         }
        }
        break;
     }
@@ -2032,6 +2136,8 @@ glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
     case ir_binop_ldexp:
        if (ir->operands[0]->type->is_double()) {
           emit_asm(ir, TGSI_OPCODE_DLDEXP, result_dst, op[0], op[1]);
+      } else if (ir->operands[0]->type->is_float()) {
+         emit_asm(ir, TGSI_OPCODE_LDEXP, result_dst, op[0], op[1]);
        } else {
           assert(!"Invalid ldexp for non-double opcode in glsl_to_tgsi_visitor::visit()");
        }
@@ -2046,10 +2152,10 @@ glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
  
     case ir_unop_get_buffer_size: {
        ir_constant *const_offset = ir->operands[0]->as_constant();
+      int buf_base = ctx->st->has_hw_atomics ? 0 : ctx->Const.Program[shader->Stage].MaxAtomicBuffers;
        st_src_reg buffer(
              PROGRAM_BUFFER,
-            ctx->Const.Program[shader->Stage].MaxAtomicBuffers +
-            (const_offset ? const_offset->value.u[0] : 0),
+            buf_base + (const_offset ? const_offset->value.u[0] : 0),
              GLSL_TYPE_UINT);
        if (!const_offset) {
           buffer.reladdr = ralloc(mem_ctx, st_src_reg);
@@ -2066,7 +2172,7 @@ glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
        st_src_reg temp = get_temp(glsl_type::uvec4_type);
        st_dst_reg temp_dst = st_dst_reg(temp);
        unsigned orig_swz = op[0].swizzle;
-      /* 
+      /*
         * To convert unsigned to 64-bit:
         * zero Y channel, copy X channel.
         */
@@ -2141,7 +2247,7 @@ glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
        break;
     }
     case ir_unop_i642b:
-      emit_asm(ir, TGSI_OPCODE_U64SNE, result_dst, op[0], st_src_reg_for_int(0));
+      emit_asm(ir, TGSI_OPCODE_U64SNE, result_dst, op[0], st_src_reg_for_int64(0));
        break;
     case ir_unop_i642f:
        emit_asm(ir, TGSI_OPCODE_I642F, result_dst, op[0]);
@@ -2532,7 +2638,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
     ir->array->accept(this);
     src = this->result;
  
-   if (ir->array->ir_type != ir_type_dereference_array) {
+   if (!src.has_index2) {
        switch (this->prog->Target) {
        case GL_TESS_CONTROL_PROGRAM_NV:
           is_2D = (src.file == PROGRAM_INPUT || src.file == PROGRAM_OUTPUT) &&
@@ -2554,8 +2660,8 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
     if (index) {
  
        if (this->prog->Target == GL_VERTEX_PROGRAM_ARB &&
-         src.file == PROGRAM_INPUT)
-        element_size = attrib_type_size(ir->type, true);
+          src.file == PROGRAM_INPUT)
+         element_size = attrib_type_size(ir->type, true);
        if (is_2D) {
           src.index2D = index->value.i[0];
           src.has_index2 = true;
@@ -2729,10 +2835,6 @@ glsl_to_tgsi_visitor::process_move_condition(ir_rvalue *ir)
        /*      a is -  0  +            -  0  +
         * (a <  0)  T  F  F  ( a < 0)  T  F  F
         * (0 <  a)  F  F  T  (-a < 0)  F  F  T
-       * (a <= 0)  T  T  F  (-a < 0)  F  F  T  (swap order of other operands)
-       * (0 <= a)  F  T  T  ( a < 0)  T  F  F  (swap order of other operands)
-       * (a >  0)  F  F  T  (-a < 0)  F  F  T
-       * (0 >  a)  T  F  F  ( a < 0)  T  F  F
         * (a >= 0)  F  T  T  ( a < 0)  T  F  F  (swap order of other operands)
         * (0 >= a)  T  T  F  (-a < 0)  F  F  T  (swap order of other operands)
         *
@@ -2746,16 +2848,6 @@ glsl_to_tgsi_visitor::process_move_condition(ir_rvalue *ir)
              negate = zero_on_left;
              break;
  
-         case ir_binop_greater:
-            switch_order = false;
-            negate = !zero_on_left;
-            break;
-
-         case ir_binop_lequal:
-            switch_order = true;
-            negate = !zero_on_left;
-            break;
-
           case ir_binop_gequal:
              switch_order = true;
              negate = zero_on_left;
@@ -2822,7 +2914,15 @@ glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *
     r->type = type->base_type;
     if (cond) {
        st_src_reg l_src = st_src_reg(*l);
-      l_src.swizzle = swizzle_for_size(type->vector_elements);
+
+      if (l_src.file == PROGRAM_OUTPUT &&
+          this->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
+          (l_src.index == FRAG_RESULT_DEPTH || l_src.index == FRAG_RESULT_STENCIL)) {
+         /* This is a special case because the source swizzles will be shifted
+          * later to account for the difference between GLSL (where they're
+          * plain floats) and TGSI (where they're Z and Y components). */
+         l_src.swizzle = SWIZZLE_XXXX;
+      }
  
        if (native_integers) {
           emit_asm(ir, TGSI_OPCODE_UCMP, *l, *cond,
@@ -2841,7 +2941,7 @@ glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *
     if (type->is_dual_slot()) {
        l->index++;
        if (r->is_double_vertex_input == false)
-        r->index++;
+         r->index++;
     }
  }
  
@@ -2941,6 +3041,7 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
        inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
        new_inst = emit_asm(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2], inst->src[3]);
        new_inst->saturate = inst->saturate;
+      new_inst->resource = inst->resource;
        inst->dead_mask = inst->dst[0].writemask;
     } else {
        emit_block_mov(ir, ir->rhs->type, &l, &r, NULL, false);
@@ -2969,7 +3070,8 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
        st_src_reg temp_base = get_temp(ir->type);
        st_dst_reg temp = st_dst_reg(temp_base);
  
-      foreach_in_list(ir_constant, field_value, &ir->components) {
+      for (i = 0; i < ir->type->length; i++) {
+         ir_constant *const field_value = ir->get_record_field(i);
           int size = type_size(field_value->type);
  
           assert(size > 0);
@@ -2977,7 +3079,7 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
           field_value->accept(this);
           src = this->result;
  
-         for (i = 0; i < (unsigned int)size; i++) {
+         for (unsigned j = 0; j < (unsigned int)size; j++) {
              emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
  
              src.index++;
@@ -2997,7 +3099,7 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
        in_array++;
  
        for (i = 0; i < ir->type->length; i++) {
-         ir->array_elements[i]->accept(this);
+         ir->const_elements[i]->accept(this);
           src = this->result;
           for (int j = 0; j < size; j++) {
              emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
@@ -3138,24 +3240,66 @@ glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
     exec_node *param = ir->actual_parameters.get_head();
     ir_dereference *deref = static_cast<ir_dereference *>(param);
     ir_variable *location = deref->variable_referenced();
-
-   st_src_reg buffer(
-         PROGRAM_BUFFER, location->data.binding, GLSL_TYPE_ATOMIC_UINT);
-
+   bool has_hw_atomics = st_context(ctx)->has_hw_atomics;
     /* Calculate the surface offset */
     st_src_reg offset;
     unsigned array_size = 0, base = 0;
     uint16_t index = 0;
+   st_src_reg resource;
  
     get_deref_offsets(deref, &array_size, &base, &index, &offset, false);
  
-   if (offset.file != PROGRAM_UNDEFINED) {
-      emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(offset),
-               offset, st_src_reg_for_int(ATOMIC_COUNTER_SIZE));
-      emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(offset),
-               offset, st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE));
+   if (has_hw_atomics) {
+      variable_storage *entry = find_variable_storage(location);
+      st_src_reg buffer(PROGRAM_HW_ATOMIC, 0, GLSL_TYPE_ATOMIC_UINT, location->data.binding);
+
+      if (!entry) {
+         entry = new(mem_ctx) variable_storage(location, PROGRAM_HW_ATOMIC,
+                                               num_atomics);
+         _mesa_hash_table_insert(this->variables, location, entry);
+
+         atomic_info[num_atomics].location = location->data.location;
+         atomic_info[num_atomics].binding = location->data.binding;
+         atomic_info[num_atomics].size = location->type->arrays_of_arrays_size();
+         if (atomic_info[num_atomics].size == 0)
+            atomic_info[num_atomics].size = 1;
+         atomic_info[num_atomics].array_id = 0;
+         num_atomics++;
+      }
+
+      if (offset.file != PROGRAM_UNDEFINED) {
+         if (atomic_info[entry->index].array_id == 0) {
+            num_atomic_arrays++;
+            atomic_info[entry->index].array_id = num_atomic_arrays;
+         }
+         buffer.array_id = atomic_info[entry->index].array_id;
+      }
+
+      buffer.index = index;
+      buffer.index += location->data.offset / ATOMIC_COUNTER_SIZE;
+      buffer.has_index2 = true;
+
+      if (offset.file != PROGRAM_UNDEFINED) {
+         buffer.reladdr = ralloc(mem_ctx, st_src_reg);
+         *buffer.reladdr = offset;
+         emit_arl(ir, sampler_reladdr, offset);
+      }
+      offset = st_src_reg_for_int(0);
+
+      resource = buffer;
     } else {
-      offset = st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE);
+      st_src_reg buffer(PROGRAM_BUFFER, location->data.binding,
+                        GLSL_TYPE_ATOMIC_UINT);
+
+      if (offset.file != PROGRAM_UNDEFINED) {
+         emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(offset),
+                  offset, st_src_reg_for_int(ATOMIC_COUNTER_SIZE));
+         emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(offset),
+                  offset, st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE));
+      } else {
+         offset = st_src_reg_for_int(location->data.offset + index * ATOMIC_COUNTER_SIZE);
+      }
+      resource = buffer;
     }
  
     ir->return_deref->accept(this);
@@ -3218,7 +3362,7 @@ glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
        inst = emit_asm(ir, opcode, dst, offset, data, data2);
     }
  
-   inst->resource = buffer;
+   inst->resource = resource;
  }
  
  void
@@ -3232,11 +3376,10 @@ glsl_to_tgsi_visitor::visit_ssbo_intrinsic(ir_call *ir)
     ir_rvalue *offset = ((ir_instruction *)param)->as_rvalue();
  
     ir_constant *const_block = block->as_constant();
-
+   int buf_base = st_context(ctx)->has_hw_atomics ? 0 : ctx->Const.Program[shader->Stage].MaxAtomicBuffers;
     st_src_reg buffer(
           PROGRAM_BUFFER,
-         ctx->Const.Program[shader->Stage].MaxAtomicBuffers +
-         (const_block ? const_block->value.u[0] : 0),
+         buf_base + (const_block ? const_block->value.u[0] : 0),
           GLSL_TYPE_UINT);
  
     if (!const_block) {
@@ -3326,25 +3469,7 @@ glsl_to_tgsi_visitor::visit_ssbo_intrinsic(ir_call *ir)
        assert(access);
     }
  
-   /* The emit_asm() might have actually split the op into pieces, e.g. for
-    * double stores. We have to go back and fix up all the generated ops.
-    */
-   unsigned op = inst->op;
-   do {
-      inst->resource = buffer;
-      if (access)
-         inst->buffer_access = access->value.u[0];
-
-      if (inst == this->instructions.get_head_raw())
-         break;
-      inst = (glsl_to_tgsi_instruction *)inst->get_prev();
-
-      if (inst->op == TGSI_OPCODE_UADD) {
-         if (inst == this->instructions.get_head_raw())
-            break;
-         inst = (glsl_to_tgsi_instruction *)inst->get_prev();
-      }
-   } while (inst->op == op && inst->resource.file == PROGRAM_UNDEFINED);
+   add_buffer_to_load_and_stores(inst, &buffer, &this->instructions, access);
  }
  
  void
@@ -3556,6 +3681,12 @@ glsl_to_tgsi_visitor::visit_image_intrinsic(ir_call *ir)
  
     glsl_to_tgsi_instruction *inst;
  
+   st_src_reg bindless;
+   if (imgvar->contains_bindless()) {
+      img->accept(this);
+      bindless = this->result;
+   }
+
     if (ir->callee->intrinsic_id == ir_intrinsic_image_size) {
        dst.writemask = WRITEMASK_XYZ;
        inst = emit_asm(ir, TGSI_OPCODE_RESQ, dst);
@@ -3653,8 +3784,7 @@ glsl_to_tgsi_visitor::visit_image_intrinsic(ir_call *ir)
     }
  
     if (imgvar->contains_bindless()) {
-      img->accept(this);
-      inst->resource = this->result;
+      inst->resource = bindless;
        inst->resource.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
                                               SWIZZLE_X, SWIZZLE_Y);
     } else {
@@ -4176,6 +4306,12 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
        emit_arl(ir, sampler_reladdr, reladdr);
     }
  
+   st_src_reg bindless;
+   if (var->contains_bindless()) {
+      ir->sampler->accept(this);
+      bindless = this->result;
+   }
+
     if (opcode == TGSI_OPCODE_TXD)
        inst = emit_asm(ir, opcode, result_dst, coord, dx, dy);
     else if (opcode == TGSI_OPCODE_TXQ) {
@@ -4206,8 +4342,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
        inst->tex_shadow = GL_TRUE;
  
     if (var->contains_bindless()) {
-      ir->sampler->accept(this);
-      inst->resource = this->result;
+      inst->resource = bindless;
        inst->resource.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y,
                                               SWIZZLE_X, SWIZZLE_Y);
     } else {
@@ -4328,13 +4463,14 @@ glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
     num_outputs = 0;
     num_input_arrays = 0;
     num_output_arrays = 0;
+   num_atomics = 0;
+   num_atomic_arrays = 0;
     num_immediates = 0;
     num_address_regs = 0;
     samplers_used = 0;
     images_used = 0;
     indirect_addr_consts = false;
     wpos_transform_const = -1;
-   glsl_version = 0;
     native_integers = false;
     mem_ctx = ralloc_context(NULL);
     ctx = NULL;
@@ -4379,6 +4515,7 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
  {
     v->samplers_used = 0;
     v->images_used = 0;
+   prog->info.textures_used_by_txf = 0;
  
     foreach_in_list(glsl_to_tgsi_instruction, inst, &v->instructions) {
        if (inst->info->is_tex) {
@@ -4391,8 +4528,8 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
              v->sampler_targets[idx] =
                 st_translate_texture_target(inst->tex_target, inst->tex_shadow);
  
-            if (inst->tex_shadow) {
-               prog->ShadowSamplers |= 1 << (inst->resource.index + i);
+            if (inst->op == TGSI_OPCODE_TXF || inst->op == TGSI_OPCODE_TXF_LZ) {
+               prog->info.textures_used_by_txf |= 1u << idx;
              }
           }
        }
@@ -4527,6 +4664,16 @@ glsl_to_tgsi_visitor::simplify_cmp(void)
     free(tempWrites);
  }
  
+static void
+rename_temp_handle_src(struct rename_reg_pair *renames, st_src_reg *src)
+{
+   if (src && src->file == PROGRAM_TEMPORARY) {
+      int old_idx = src->index;
+      if (renames[old_idx].valid)
+         src->index = renames[old_idx].new_reg;
+   }
+}
+
  /* Replaces all references to a temporary register index with another index. */
  void
  glsl_to_tgsi_visitor::rename_temp_registers(struct rename_reg_pair *renames)
@@ -4534,32 +4681,29 @@ glsl_to_tgsi_visitor::rename_temp_registers(struct rename_reg_pair *renames)
     foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
        unsigned j;
        for (j = 0; j < num_inst_src_regs(inst); j++) {
-         if (inst->src[j].file == PROGRAM_TEMPORARY) {
-            int old_idx = inst->src[j].index;
-            if (renames[old_idx].valid)
-               inst->src[j].index = renames[old_idx].new_reg;
-         }
+         rename_temp_handle_src(renames, &inst->src[j]);
+         rename_temp_handle_src(renames, inst->src[j].reladdr);
+         rename_temp_handle_src(renames, inst->src[j].reladdr2);
        }
  
        for (j = 0; j < inst->tex_offset_num_offset; j++) {
-         if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY) {
-            int old_idx = inst->tex_offsets[j].index;
-            if (renames[old_idx].valid)
-               inst->tex_offsets[j].index = renames[old_idx].new_reg;
-         }
+         rename_temp_handle_src(renames, &inst->tex_offsets[j]);
+         rename_temp_handle_src(renames, inst->tex_offsets[j].reladdr);
+         rename_temp_handle_src(renames, inst->tex_offsets[j].reladdr2);
        }
  
-      if (inst->resource.file == PROGRAM_TEMPORARY) {
-         int old_idx = inst->resource.index;
-         if (renames[old_idx].valid)
-            inst->resource.index = renames[old_idx].new_reg;
-      }
+      rename_temp_handle_src(renames, &inst->resource);
+      rename_temp_handle_src(renames, inst->resource.reladdr);
+      rename_temp_handle_src(renames, inst->resource.reladdr2);
  
        for (j = 0; j < num_inst_dst_regs(inst); j++) {
           if (inst->dst[j].file == PROGRAM_TEMPORARY) {
              int old_idx = inst->dst[j].index;
              if (renames[old_idx].valid)
-               inst->dst[j].index = renames[old_idx].new_reg;}
+               inst->dst[j].index = renames[old_idx].new_reg;
+         }
+         rename_temp_handle_src(renames, inst->dst[j].reladdr);
+         rename_temp_handle_src(renames, inst->dst[j].reladdr2);
        }
     }
  }
@@ -4878,6 +5022,8 @@ glsl_to_tgsi_visitor::copy_propagate(void)
            !inst->dst[0].reladdr2 &&
            !inst->saturate &&
            inst->src[0].file != PROGRAM_ARRAY &&
+          (inst->src[0].file != PROGRAM_OUTPUT ||
+           this->shader->Stage != MESA_SHADER_TESS_CTRL) &&
            !inst->src[0].reladdr &&
            !inst->src[0].reladdr2 &&
            !inst->src[0].negate &&
@@ -4895,6 +5041,16 @@ glsl_to_tgsi_visitor::copy_propagate(void)
     ralloc_free(acp);
  }
  
+static void
+dead_code_handle_reladdr(glsl_to_tgsi_instruction **writes, st_src_reg *reladdr)
+{
+   if (reladdr && reladdr->file == PROGRAM_TEMPORARY) {
+      /* Clear where it's used as src. */
+      int swz = GET_SWZ(reladdr->swizzle, 0);
+      writes[4 * reladdr->index + swz] = NULL;
+   }
+}
+
  /*
   * On a basic block basis, tracks available PROGRAM_TEMPORARY registers for dead
   * code elimination.
@@ -4985,6 +5141,8 @@ glsl_to_tgsi_visitor::eliminate_dead_code(void)
                       writes[4 * inst->src[i].index + c] = NULL;
                 }
              }
+            dead_code_handle_reladdr(writes, inst->src[i].reladdr);
+            dead_code_handle_reladdr(writes, inst->src[i].reladdr2);
           }
           for (unsigned i = 0; i < inst->tex_offset_num_offset; i++) {
              if (inst->tex_offsets[i].file == PROGRAM_TEMPORARY && inst->tex_offsets[i].reladdr){
@@ -5004,6 +5162,8 @@ glsl_to_tgsi_visitor::eliminate_dead_code(void)
                       writes[4 * inst->tex_offsets[i].index + c] = NULL;
                 }
              }
+            dead_code_handle_reladdr(writes, inst->tex_offsets[i].reladdr);
+            dead_code_handle_reladdr(writes, inst->tex_offsets[i].reladdr2);
           }
  
           if (inst->resource.file == PROGRAM_TEMPORARY) {
@@ -5019,7 +5179,13 @@ glsl_to_tgsi_visitor::eliminate_dead_code(void)
                    writes[4 * inst->resource.index + c] = NULL;
              }
           }
+         dead_code_handle_reladdr(writes, inst->resource.reladdr);
+         dead_code_handle_reladdr(writes, inst->resource.reladdr2);
  
+         for (unsigned i = 0; i < ARRAY_SIZE(inst->dst); i++) {
+            dead_code_handle_reladdr(writes, inst->dst[i].reladdr);
+            dead_code_handle_reladdr(writes, inst->dst[i].reladdr2);
+         }
           break;
        }
  
@@ -5089,9 +5255,11 @@ glsl_to_tgsi_visitor::eliminate_dead_code(void)
  void
  glsl_to_tgsi_visitor::merge_two_dsts(void)
  {
-   foreach_in_list_safe(glsl_to_tgsi_instruction, inst, &this->instructions) {
+   /* We never delete inst, but we may delete its successor. */
+   foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
        glsl_to_tgsi_instruction *inst2;
-      bool merged;
+      unsigned defined;
+
        if (num_inst_dst_regs(inst) != 2)
           continue;
  
@@ -5099,32 +5267,39 @@ glsl_to_tgsi_visitor::merge_two_dsts(void)
            inst->dst[1].file != PROGRAM_UNDEFINED)
           continue;
  
-      inst2 = (glsl_to_tgsi_instruction *) inst->next;
-      do {
+      assert(inst->dst[0].file != PROGRAM_UNDEFINED ||
+             inst->dst[1].file != PROGRAM_UNDEFINED);
  
-         if (inst->src[0].file == inst2->src[0].file &&
+      if (inst->dst[0].file == PROGRAM_UNDEFINED)
+         defined = 1;
+      else
+         defined = 0;
+
+      inst2 = (glsl_to_tgsi_instruction *) inst->next;
+      while (!inst2->is_tail_sentinel()) {
+         if (inst->op == inst2->op &&
+             inst2->dst[defined].file == PROGRAM_UNDEFINED &&
+             inst->src[0].file == inst2->src[0].file &&
               inst->src[0].index == inst2->src[0].index &&
               inst->src[0].type == inst2->src[0].type &&
               inst->src[0].swizzle == inst2->src[0].swizzle)
              break;
           inst2 = (glsl_to_tgsi_instruction *) inst2->next;
-      } while (inst2);
+      }
  
-      if (!inst2)
+      if (inst2->is_tail_sentinel()) {
+         /* Undefined destinations are not allowed, substitute with an unused
+          * temporary register.
+          */
+         st_src_reg tmp = get_temp(glsl_type::vec4_type);
+         inst->dst[defined ^ 1] = st_dst_reg(tmp);
+         inst->dst[defined ^ 1].writemask = 0;
           continue;
-      merged = false;
-      if (inst->dst[0].file == PROGRAM_UNDEFINED) {
-         merged = true;
-         inst->dst[0] = inst2->dst[0];
-      } else if (inst->dst[1].file == PROGRAM_UNDEFINED) {
-         inst->dst[1] = inst2->dst[1];
-         merged = true;
        }
  
-      if (merged) {
-         inst2->remove();
-         delete inst2;
-      }
+      inst->dst[defined ^ 1] = inst2->dst[defined ^ 1];
+      inst2->remove();
+      delete inst2;
     }
  }
  
@@ -5136,54 +5311,19 @@ glsl_to_tgsi_visitor::merge_two_dsts(void)
  void
  glsl_to_tgsi_visitor::merge_registers(void)
  {
-   int *last_reads = ralloc_array(mem_ctx, int, this->next_temp);
-   int *first_writes = ralloc_array(mem_ctx, int, this->next_temp);
-   struct rename_reg_pair *renames = rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
-   int i, j;
-
-   /* Read the indices of the last read and first write to each temp register
-    * into an array so that we don't have to traverse the instruction list as
-    * much. */
-   for (i = 0; i < this->next_temp; i++) {
-      last_reads[i] = -1;
-      first_writes[i] = -1;
-   }
-   get_last_temp_read_first_temp_write(last_reads, first_writes);
+   struct lifetime *lifetimes =
+         rzalloc_array(mem_ctx, struct lifetime, this->next_temp);
  
-   /* Start looking for registers with non-overlapping usages that can be
-    * merged together. */
-   for (i = 0; i < this->next_temp; i++) {
-      /* Don't touch unused registers. */
-      if (last_reads[i] < 0 || first_writes[i] < 0) continue;
-
-      for (j = 0; j < this->next_temp; j++) {
-         /* Don't touch unused registers. */
-         if (last_reads[j] < 0 || first_writes[j] < 0) continue;
-
-         /* We can merge the two registers if the first write to j is after or
-          * in the same instruction as the last read from i.  Note that the
-          * register at index i will always be used earlier or at the same time
-          * as the register at index j. */
-         if (first_writes[i] <= first_writes[j] &&
-             last_reads[i] <= first_writes[j]) {
-            renames[j].new_reg = i;
-            renames[j].valid = true;
-
-            /* Update the first_writes and last_reads arrays with the new
-             * values for the merged register index, and mark the newly unused
-             * register index as such. */
-            assert(last_reads[j] >= last_reads[i]);
-            last_reads[i] = last_reads[j];
-            first_writes[j] = -1;
-            last_reads[j] = -1;
-         }
-      }
+   if (get_temp_registers_required_lifetimes(mem_ctx, &this->instructions,
+                                             this->next_temp, lifetimes)) {
+      struct rename_reg_pair *renames =
+            rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
+      get_temp_registers_remapping(mem_ctx, this->next_temp, lifetimes, renames);
+      rename_temp_registers(renames);
+      ralloc_free(renames);
     }
  
-   rename_temp_registers(renames);
-   ralloc_free(renames);
-   ralloc_free(last_reads);
-   ralloc_free(first_writes);
+   ralloc_free(lifetimes);
  }
  
  /* Reassign indices to temporary registers by reusing unused indices created
@@ -5240,6 +5380,7 @@ struct st_translate {
     struct ureg_src buffers[PIPE_MAX_SHADER_BUFFERS];
     struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
     struct ureg_src systemValues[SYSTEM_VALUE_MAX];
+   struct ureg_src hw_atomics[PIPE_MAX_HW_ATOMIC_BUFFERS];
     struct ureg_src shared_memory;
     unsigned *array_sizes;
     struct inout_decl *input_decls;
@@ -5251,6 +5392,7 @@ struct st_translate {
     const ubyte *outputMapping;
  
     unsigned procType;  /**< PIPE_SHADER_VERTEX/FRAGMENT */
+   bool need_uarl;
  };
  
  /** Map Mesa's SYSTEM_VALUE_x to TGSI_SEMANTIC_x */
@@ -5344,7 +5486,7 @@ _mesa_sysval_to_semantic(unsigned sysval)
  static struct ureg_src
  emit_immediate(struct st_translate *t,
                 gl_constant_value values[4],
-               int type, int size)
+               GLenum type, int size)
  {
     struct ureg_program *ureg = t->ureg;
  
@@ -5447,6 +5589,19 @@ dst_register(struct st_translate *t, gl_register_file file, unsigned index,
     }
  }
  
+static struct ureg_src
+translate_src(struct st_translate *t, const st_src_reg *src_reg);
+
+static struct ureg_src
+translate_addr(struct st_translate *t, const st_src_reg *reladdr,
+               unsigned addr_index)
+{
+   if (t->need_uarl || !reladdr->is_legal_tgsi_address_operand())
+      return ureg_src(t->address[addr_index]);
+
+   return translate_src(t, reladdr);
+}
+
  /**
   * Create a TGSI ureg_dst register from an st_dst_reg.
   */
@@ -5468,12 +5623,13 @@ translate_dst(struct st_translate *t,
  
     if (dst_reg->reladdr != NULL) {
        assert(dst_reg->file != PROGRAM_TEMPORARY);
-      dst = ureg_dst_indirect(dst, ureg_src(t->address[0]));
+      dst = ureg_dst_indirect(dst, translate_addr(t, dst_reg->reladdr, 0));
     }
  
     if (dst_reg->has_index2) {
        if (dst_reg->reladdr2)
-         dst = ureg_dst_dimension_indirect(dst, ureg_src(t->address[1]),
+         dst = ureg_dst_dimension_indirect(dst,
+                                           translate_addr(t, dst_reg->reladdr2, 1),
                                             dst_reg->index2D);
        else
           dst = ureg_dst_dimension(dst, dst_reg->index2D);
@@ -5566,6 +5722,11 @@ translate_src(struct st_translate *t, const st_src_reg *src_reg)
        src = t->systemValues[src_reg->index];
        break;
  
+   case PROGRAM_HW_ATOMIC:
+      src = ureg_src_array_register(TGSI_FILE_HW_ATOMIC, src_reg->index,
+                                    src_reg->array_id);
+      break;
+
     default:
        assert(!"unknown src register file");
        return ureg_src_undef();
@@ -5576,7 +5737,8 @@ translate_src(struct st_translate *t, const st_src_reg *src_reg)
         * and UBO constant buffers (buffer, position).
         */
        if (src_reg->reladdr2)
-         src = ureg_src_dimension_indirect(src, ureg_src(t->address[1]),
+         src = ureg_src_dimension_indirect(src,
+                                           translate_addr(t, src_reg->reladdr2, 1),
                                             src_reg->index2D);
        else
           src = ureg_src_dimension(src, src_reg->index2D);
@@ -5596,7 +5758,7 @@ translate_src(struct st_translate *t, const st_src_reg *src_reg)
  
     if (src_reg->reladdr != NULL) {
        assert(src_reg->file != PROGRAM_TEMPORARY);
-      src = ureg_src_indirect(src, ureg_src(t->address[0]));
+      src = ureg_src_indirect(src, translate_addr(t, src_reg->reladdr, 0));
     }
  
     return src;
@@ -5684,7 +5846,8 @@ compile_tgsi_instruction(struct st_translate *t,
        assert(src[num_src].File != TGSI_FILE_NULL);
        if (inst->resource.reladdr)
           src[num_src] =
-            ureg_src_indirect(src[num_src], ureg_src(t->address[2]));
+            ureg_src_indirect(src[num_src],
+                              translate_addr(t, inst->resource.reladdr, 2));
        num_src++;
        for (i = 0; i < (int)inst->tex_offset_num_offset; i++) {
           texoffsets[i] = translate_tex_offset(t, &inst->tex_offsets[i]);
@@ -5719,7 +5882,13 @@ compile_tgsi_instruction(struct st_translate *t,
           src[0] = t->shared_memory;
        } else if (inst->resource.file == PROGRAM_BUFFER) {
           src[0] = t->buffers[inst->resource.index];
+      } else if (inst->resource.file == PROGRAM_HW_ATOMIC) {
+         src[0] = translate_src(t, &inst->resource);
+      } else if (inst->resource.file == PROGRAM_CONSTANT) {
+         assert(inst->resource.has_index2);
+         src[0] = ureg_src_register(TGSI_FILE_CONSTBUF, inst->resource.index);
        } else {
+         assert(inst->resource.file != PROGRAM_UNDEFINED);
           if (inst->resource.file == PROGRAM_IMAGE) {
              src[0] = t->images[inst->resource.index];
           } else {
@@ -5729,7 +5898,8 @@ compile_tgsi_instruction(struct st_translate *t,
           tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
        }
        if (inst->resource.reladdr)
-         src[0] = ureg_src_indirect(src[0], ureg_src(t->address[2]));
+         src[0] = ureg_src_indirect(src[0],
+                                    translate_addr(t, inst->resource.reladdr, 2));
        assert(src[0].File != TGSI_FILE_NULL);
        ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
                         inst->buffer_access,
@@ -5752,7 +5922,8 @@ compile_tgsi_instruction(struct st_translate *t,
        }
        dst[0] = ureg_writemask(dst[0], inst->dst[0].writemask);
        if (inst->resource.reladdr)
-         dst[0] = ureg_dst_indirect(dst[0], ureg_src(t->address[2]));
+         dst[0] = ureg_dst_indirect(dst[0],
+                                    translate_addr(t, inst->resource.reladdr, 2));
        assert(dst[0].File != TGSI_FILE_NULL);
        ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src,
                         inst->buffer_access,
@@ -6071,6 +6242,7 @@ st_translate_program(
     const ubyte outputSemanticName[],
     const ubyte outputSemanticIndex[])
  {
+   struct pipe_screen *screen = st_context(ctx)->pipe->screen;
     struct st_translate *t;
     unsigned i;
     struct gl_program_constants *frag_const =
@@ -6080,6 +6252,16 @@ st_translate_program(
     assert(numInputs <= ARRAY_SIZE(t->inputs));
     assert(numOutputs <= ARRAY_SIZE(t->outputs));
  
+   ASSERT_BITFIELD_SIZE(st_src_reg, type, GLSL_TYPE_ERROR);
+   ASSERT_BITFIELD_SIZE(st_dst_reg, type, GLSL_TYPE_ERROR);
+   ASSERT_BITFIELD_SIZE(glsl_to_tgsi_instruction, tex_type, GLSL_TYPE_ERROR);
+   ASSERT_BITFIELD_SIZE(glsl_to_tgsi_instruction, image_format, PIPE_FORMAT_COUNT);
+   ASSERT_BITFIELD_SIZE(glsl_to_tgsi_instruction, tex_target,
+                        (gl_texture_index) (NUM_TEXTURE_TARGETS - 1));
+   ASSERT_BITFIELD_SIZE(glsl_to_tgsi_instruction, image_format,
+                        (enum pipe_format) (PIPE_FORMAT_COUNT - 1));
+   ASSERT_BITFIELD_SIZE(glsl_to_tgsi_instruction, op, TGSI_OPCODE_LAST - 1);
+
     t = CALLOC_STRUCT(st_translate);
     if (!t) {
        ret = PIPE_ERROR_OUT_OF_MEMORY;
@@ -6087,6 +6269,7 @@ st_translate_program(
     }
  
     t->procType = procType;
+   t->need_uarl = !screen->get_param(screen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
     t->inputMapping = inputMapping;
     t->outputMapping = outputMapping;
     t->ureg = ureg;
@@ -6399,7 +6582,8 @@ st_translate_program(
     /* texture samplers */
     for (i = 0; i < frag_const->MaxTextureImageUnits; i++) {
        if (program->samplers_used & (1u << i)) {
-         unsigned type = st_translate_texture_type(program->sampler_types[i]);
+         enum tgsi_return_type type =
+            st_translate_texture_type(program->sampler_types[i]);
  
           t->samplers[i] = ureg_DECL_sampler(ureg, i);
  
@@ -6412,15 +6596,28 @@ st_translate_program(
     {
        struct gl_program *prog = program->prog;
  
-      for (i = 0; i < prog->info.num_abos; i++) {
-         unsigned index = prog->sh.AtomicBuffers[i]->Binding;
-         assert(index < frag_const->MaxAtomicBuffers);
-         t->buffers[index] = ureg_DECL_buffer(ureg, index, true);
+      if (!st_context(ctx)->has_hw_atomics) {
+        for (i = 0; i < prog->info.num_abos; i++) {
+            unsigned index = prog->sh.AtomicBuffers[i]->Binding;
+            assert(index < frag_const->MaxAtomicBuffers);
+            t->buffers[index] = ureg_DECL_buffer(ureg, index, true);
+         }
+      } else {
+         for (i = 0; i < program->num_atomics; i++) {
+            struct hwatomic_decl *ainfo = &program->atomic_info[i];
+            gl_uniform_storage *uni_storage = &prog->sh.data->UniformStorage[ainfo->location];
+            int base = uni_storage->offset / ATOMIC_COUNTER_SIZE;
+            ureg_DECL_hw_atomic(ureg, base, base + ainfo->size - 1, ainfo->binding,
+                                ainfo->array_id);
+         }
        }
  
        assert(prog->info.num_ssbos <= frag_const->MaxShaderStorageBlocks);
        for (i = 0; i < prog->info.num_ssbos; i++) {
-         unsigned index = frag_const->MaxAtomicBuffers + i;
+         unsigned index = i;
+         if (!st_context(ctx)->has_hw_atomics)
+            index += frag_const->MaxAtomicBuffers;
+
           t->buffers[index] = ureg_DECL_buffer(ureg, index, false);
        }
     }
@@ -6503,7 +6700,6 @@ get_mesa_program_tgsi(struct gl_context *ctx,
     v->shader_program = shader_program;
     v->shader = shader;
     v->options = options;
-   v->glsl_version = ctx->Const.GLSLVersion;
     v->native_integers = ctx->Const.NativeIntegers;
  
     v->have_sqrt = pscreen->get_shader_param(pscreen, ptarget,
@@ -6512,6 +6708,7 @@ get_mesa_program_tgsi(struct gl_context *ctx,
                                             PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED);
     v->has_tex_txf_lz = pscreen->get_param(pscreen,
                                            PIPE_CAP_TGSI_TEX_TXF_LZ);
+   v->need_uarl = !pscreen->get_param(pscreen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
  
     v->variables = _mesa_hash_table_create(v->mem_ctx, _mesa_hash_pointer,
                                            _mesa_key_pointer_equal);
@@ -6562,10 +6759,7 @@ get_mesa_program_tgsi(struct gl_context *ctx,
  
     /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. */
     v->simplify_cmp();
-
-   if (shader->Stage != MESA_SHADER_TESS_CTRL &&
-       shader->Stage != MESA_SHADER_TESS_EVAL)
-      v->copy_propagate();
+   v->copy_propagate();
  
     while (v->eliminate_dead_code());
  
@@ -6590,7 +6784,7 @@ get_mesa_program_tgsi(struct gl_context *ctx,
     _mesa_copy_linked_program_data(shader_program, shader);
     shrink_array_declarations(v->inputs, v->num_inputs,
                               &prog->info.inputs_read,
-                             prog->info.double_inputs_read,
+                             prog->info.vs.double_inputs_read,
                               &prog->info.patch_inputs_read);
     shrink_array_declarations(v->outputs, v->num_outputs,
                               &prog->info.outputs_written, 0ULL,
@@ -6731,6 +6925,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
     struct pipe_screen *pscreen = ctx->st->pipe->screen;
     assert(prog->data->LinkStatus);
  
+   bool use_nir = false;
     for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
        if (prog->_LinkedShaders[i] == NULL)
           continue;
@@ -6745,9 +6940,17 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
                                                     PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED);
        bool have_dfrexp = pscreen->get_shader_param(pscreen, ptarget,
                                                     PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED);
+      bool have_ldexp = pscreen->get_shader_param(pscreen, ptarget,
+                                                  PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED);
        unsigned if_threshold = pscreen->get_shader_param(pscreen, ptarget,
                                                          PIPE_SHADER_CAP_LOWER_IF_THRESHOLD);
  
+      enum pipe_shader_ir preferred_ir = (enum pipe_shader_ir)
+         pscreen->get_shader_param(pscreen, ptarget,
+                                   PIPE_SHADER_CAP_PREFERRED_IR);
+      if (preferred_ir == PIPE_SHADER_IR_NIR)
+         use_nir = true;
+
        /* If there are forms of indirect addressing that the driver
         * cannot handle, perform the lowering pass.
         */
@@ -6795,7 +6998,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
                           FDIV_TO_MUL_RCP |
                           EXP_TO_EXP2 |
                           LOG_TO_LOG2 |
-                         LDEXP_TO_ARITH |
+                         (have_ldexp ? 0 : LDEXP_TO_ARITH) |
                           (have_dfrexp ? 0 : DFREXP_DLDEXP_TO_ARITH) |
                           CARRY_TO_ARITH |
                           BORROW_TO_ARITH |
@@ -6851,34 +7054,19 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
  
     build_program_resource_list(ctx, prog);
  
+   if (use_nir)
+      return st_link_nir(ctx, prog);
+
     for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
        struct gl_linked_shader *shader = prog->_LinkedShaders[i];
        if (shader == NULL)
           continue;
  
-      enum pipe_shader_type ptarget =
-         pipe_shader_type_from_mesa(shader->Stage);
-      enum pipe_shader_ir preferred_ir = (enum pipe_shader_ir)
-         pscreen->get_shader_param(pscreen, ptarget,
-                                   PIPE_SHADER_CAP_PREFERRED_IR);
-
-      struct gl_program *linked_prog = NULL;
-      if (preferred_ir == PIPE_SHADER_IR_NIR) {
-         /* TODO only for GLSL VS/FS/CS for now: */
-         switch (shader->Stage) {
-         case MESA_SHADER_VERTEX:
-         case MESA_SHADER_FRAGMENT:
-         case MESA_SHADER_COMPUTE:
-            linked_prog = st_nir_get_mesa_program(ctx, prog, shader);
-         default:
-            break;
-         }
-      } else {
-         linked_prog = get_mesa_program_tgsi(ctx, prog, shader);
-      }
+      struct gl_program *linked_prog =
+         get_mesa_program_tgsi(ctx, prog, shader);
+      st_set_prog_affected_state_flags(linked_prog);
  
        if (linked_prog) {
-         st_set_prog_affected_state_flags(linked_prog);
           if (!ctx->Driver.ProgramStringNotify(ctx,
                                                _mesa_shader_stage_to_program(i),
                                                linked_prog)) {