i965: Make the userclip flag for the VUE map come from VS prog data.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
index 7e0535b5c023310285d8b128470b1aae263e7ee7..5dfe1c1354c49adb52ce7dfcae45dbdfdc91804b 100644 (file)
@@ -25,6 +25,7 @@
 extern "C" {
 #include "main/macros.h"
 #include "program/prog_parameter.h"
+#include "program/sampler.h"
 }
 
 namespace brw {
@@ -37,6 +38,8 @@ src_reg::src_reg(dst_reg reg)
    this->reg = reg.reg;
    this->reg_offset = reg.reg_offset;
    this->type = reg.type;
+   this->reladdr = reg.reladdr;
+   this->fixed_hw_reg = reg.fixed_hw_reg;
 
    int swizzles[4];
    int next_chan = 0;
@@ -66,45 +69,183 @@ dst_reg::dst_reg(src_reg reg)
    this->reg_offset = reg.reg_offset;
    this->type = reg.type;
    this->writemask = WRITEMASK_XYZW;
+   this->reladdr = reg.reladdr;
+   this->fixed_hw_reg = reg.fixed_hw_reg;
+}
+
+vec4_instruction::vec4_instruction(vec4_visitor *v,
+                                  enum opcode opcode, dst_reg dst,
+                                  src_reg src0, src_reg src1, src_reg src2)
+{
+   this->opcode = opcode;
+   this->dst = dst;
+   this->src[0] = src0;
+   this->src[1] = src1;
+   this->src[2] = src2;
+   this->ir = v->base_ir;
+   this->annotation = v->current_annotation;
 }
 
 vec4_instruction *
-vec4_visitor::emit(enum opcode opcode, dst_reg dst,
-                  src_reg src0, src_reg src1, src_reg src2)
+vec4_visitor::emit(vec4_instruction *inst)
 {
-   vec4_instruction *inst = new(mem_ctx) vec4_instruction();
+   this->instructions.push_tail(inst);
+
+   return inst;
+}
 
-   inst->opcode = opcode;
-   inst->dst = dst;
-   inst->src[0] = src0;
-   inst->src[1] = src1;
-   inst->src[2] = src2;
-   inst->ir = this->base_ir;
-   inst->annotation = this->current_annotation;
+vec4_instruction *
+vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
+{
+   new_inst->ir = inst->ir;
+   new_inst->annotation = inst->annotation;
 
-   this->instructions.push_tail(inst);
+   inst->insert_before(new_inst);
 
    return inst;
 }
 
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, dst_reg dst,
+                  src_reg src0, src_reg src1, src_reg src2)
+{
+   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
+                                            src0, src1, src2));
+}
+
 
 vec4_instruction *
 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
 {
-   return emit(opcode, dst, src0, src1, src_reg());
+   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
 }
 
 vec4_instruction *
 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
 {
-   assert(dst.writemask != 0);
-   return emit(opcode, dst, src0, src_reg(), src_reg());
+   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
 }
 
 vec4_instruction *
 vec4_visitor::emit(enum opcode opcode)
 {
-   return emit(opcode, dst_reg(), src_reg(), src_reg(), src_reg());
+   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
+}
+
+#define ALU1(op)                                                       \
+   vec4_instruction *                                                  \
+   vec4_visitor::op(dst_reg dst, src_reg src0)                         \
+   {                                                                   \
+      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
+                                          src0);                       \
+   }
+
+#define ALU2(op)                                                       \
+   vec4_instruction *                                                  \
+   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)           \
+   {                                                                   \
+      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
+                                          src0, src1);                 \
+   }
+
+ALU1(NOT)
+ALU1(MOV)
+ALU1(FRC)
+ALU1(RNDD)
+ALU1(RNDE)
+ALU1(RNDZ)
+ALU2(ADD)
+ALU2(MUL)
+ALU2(MACH)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(DP3)
+ALU2(DP4)
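+
+/* As a reference for readers, ALU2(ADD) expands to roughly:
+ *
+ *    vec4_instruction *
+ *    vec4_visitor::ADD(dst_reg dst, src_reg src0, src_reg src1)
+ *    {
+ *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
+ *                                            src0, src1);
+ *    }
+ *
+ * Note these only construct the instruction; it still has to be passed
+ * to emit(), as in emit(ADD(dst, src0, src1)).
+ */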
+
+/** Gen4 predicated IF. */
+vec4_instruction *
+vec4_visitor::IF(uint32_t predicate)
+{
+   vec4_instruction *inst;
+
+   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
+   inst->predicate = predicate;
+
+   return inst;
+}
+
+/** Gen6+ IF with embedded comparison. */
+vec4_instruction *
+vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
+{
+   assert(intel->gen >= 6);
+
+   vec4_instruction *inst;
+
+   resolve_ud_negate(&src0);
+   resolve_ud_negate(&src1);
+
+   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
+                                       src0, src1);
+   inst->conditional_mod = condition;
+
+   return inst;
+}
+
+/**
+ * CMP: Sets the low bit of the destination channels with the result
+ * of the comparison, while the upper bits are undefined, and updates
+ * the flag register with the packed 16 bits of the result.
+ */
+vec4_instruction *
+vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
+{
+   vec4_instruction *inst;
+
+   /* original gen4 does type conversion to the destination type
+    * before comparison, producing garbage results for floating
+    * point comparisons.
+    */
+   if (intel->gen == 4) {
+      dst.type = src0.type;
+      if (dst.file == HW_REG)
+        dst.fixed_hw_reg.type = dst.type;
+   }
+
+   resolve_ud_negate(&src0);
+   resolve_ud_negate(&src1);
+
+   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
+   inst->conditional_mod = condition;
+
+   return inst;
+}
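+
+/* Typical use: emit(CMP(dst_null_d(), a, b, BRW_CONDITIONAL_GE)) writes the
+ * flag register, and the next instruction then reads it by setting
+ * inst->predicate = BRW_PREDICATE_NORMAL.
+ */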
+
+vec4_instruction *
+vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
+{
+   vec4_instruction *inst;
+
+   inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
+                                       dst, index);
+   inst->base_mrf = 14;
+   inst->mlen = 1;
+
+   return inst;
+}
+
+vec4_instruction *
+vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
+{
+   vec4_instruction *inst;
+
+   inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
+                                       dst, src, index);
+   inst->base_mrf = 13;
+   inst->mlen = 2;
+
+   return inst;
 }
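+
+/* A scratch read's single message register presumably carries just the
+ * header with the scratch offset, while a write needs a second one for
+ * the data, hence the mlen values of 1 and 2 above.
+ */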
 
 void
@@ -123,11 +264,28 @@ vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
    /* The gen6 math instruction ignores the source modifiers --
     * swizzle, abs, negate, and at least some parts of the register
     * region description.
+    *
+    * While it would seem that this MOV could be avoided at this point
+    * in the case that the swizzle is matched up with the destination
+    * writemask, note that uniform packing and register allocation
+    * could rearrange our swizzle, so let's leave this matter up to
+    * copy propagation later.
     */
    src_reg temp_src = src_reg(this, glsl_type::vec4_type);
-   emit(BRW_OPCODE_MOV, dst_reg(temp_src), src);
+   emit(MOV(dst_reg(temp_src), src));
+
+   if (dst.writemask != WRITEMASK_XYZW) {
+      /* The gen6 math instruction must be align1, so we can't do
+       * writemasks.
+       */
+      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
+
+      emit(opcode, temp_dst, temp_src);
 
-   emit(opcode, dst, temp_src);
+      emit(MOV(dst, src_reg(temp_dst)));
+   } else {
+      emit(opcode, dst, temp_src);
+   }
 }
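+
+/* E.g. math into dst.xy on gen6 becomes, in effect:
+ *
+ *    MOV  tmp_src, src
+ *    MATH tmp_dst, tmp_src
+ *    MOV  dst.xy, tmp_dst
+ *
+ * and copy propagation is left to eliminate whichever MOVs it can.
+ */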
 
 void
@@ -155,7 +313,9 @@ vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
       return;
    }
 
-   if (intel->gen >= 6) {
+   if (intel->gen >= 7) {
+      emit(opcode, dst, src);
+   } else if (intel->gen == 6) {
       return emit_math1_gen6(opcode, dst, src);
    } else {
       return emit_math1_gen4(opcode, dst, src);
@@ -175,14 +335,28 @@ vec4_visitor::emit_math2_gen6(enum opcode opcode,
     */
 
    expanded = src_reg(this, glsl_type::vec4_type);
-   emit(BRW_OPCODE_MOV, dst, src0);
+   expanded.type = src0.type;
+   emit(MOV(dst_reg(expanded), src0));
    src0 = expanded;
 
    expanded = src_reg(this, glsl_type::vec4_type);
-   emit(BRW_OPCODE_MOV, dst, src1);
+   expanded.type = src1.type;
+   emit(MOV(dst_reg(expanded), src1));
    src1 = expanded;
 
-   emit(opcode, dst, src0, src1);
+   if (dst.writemask != WRITEMASK_XYZW) {
+      /* The gen6 math instruction must be align1, so we can't do
+       * writemasks.
+       */
+      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
+      temp_dst.type = dst.type;
+
+      emit(opcode, temp_dst, src0, src1);
+
+      emit(MOV(dst, src_reg(temp_dst)));
+   } else {
+      emit(opcode, dst, src0, src1);
+   }
 }
 
 void
@@ -198,9 +372,19 @@ void
 vec4_visitor::emit_math(enum opcode opcode,
                        dst_reg dst, src_reg src0, src_reg src1)
 {
-   assert(opcode == SHADER_OPCODE_POW);
+   switch (opcode) {
+   case SHADER_OPCODE_POW:
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+      break;
+   default:
+      assert(!"not reached: unsupported binary math opcode");
+      return;
+   }
 
-   if (intel->gen >= 6) {
+   if (intel->gen >= 7) {
+      emit(opcode, dst, src0, src1);
+   } else if (intel->gen == 6) {
       return emit_math2_gen6(opcode, dst, src0, src1);
    } else {
       return emit_math2_gen4(opcode, dst, src0, src1);
@@ -210,8 +394,8 @@ vec4_visitor::emit_math(enum opcode opcode,
 void
 vec4_visitor::visit_instructions(const exec_list *list)
 {
-   foreach_iter(exec_list_iterator, iter, *list) {
-      ir_instruction *ir = (ir_instruction *)iter.get();
+   foreach_list(node, list) {
+      ir_instruction *ir = (ir_instruction *)node;
 
       base_ir = ir;
       ir->accept(this);
@@ -270,7 +454,11 @@ vec4_visitor::virtual_grf_alloc(int size)
         virtual_grf_array_size *= 2;
       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
+      virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
+                                    virtual_grf_array_size);
    }
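+   /* Record where this virtual GRF starts in the flattened register space;
+    * e.g. virtual GRFs of sizes [1, 2, 1] get bases [0, 1, 3].
+    */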
+   virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
+   virtual_grf_reg_count += size;
    virtual_grf_sizes[virtual_grf_count] = size;
    return virtual_grf_count++;
 }
@@ -319,9 +507,7 @@ vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
    float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
 
    if (type->is_matrix()) {
-      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
-                                                       type->vector_elements,
-                                                       1);
+      const glsl_type *column = type->column_type();
 
       for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
@@ -336,35 +522,20 @@ vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
    case GLSL_TYPE_INT:
    case GLSL_TYPE_BOOL:
       for (unsigned int i = 0; i < type->vector_elements; i++) {
-        int slot = this->uniforms * 4 + i;
-        switch (type->base_type) {
-        case GLSL_TYPE_FLOAT:
-           c->prog_data.param_convert[slot] = PARAM_NO_CONVERT;
-           break;
-        case GLSL_TYPE_UINT:
-           c->prog_data.param_convert[slot] = PARAM_CONVERT_F2U;
-           break;
-        case GLSL_TYPE_INT:
-           c->prog_data.param_convert[slot] = PARAM_CONVERT_F2I;
-           break;
-        case GLSL_TYPE_BOOL:
-           c->prog_data.param_convert[slot] = PARAM_CONVERT_F2B;
-           break;
-        default:
-           assert(!"not reached");
-           c->prog_data.param_convert[slot] = PARAM_NO_CONVERT;
-           break;
-        }
-        c->prog_data.param[slot] = &values[i];
+        c->prog_data.param[this->uniforms * 4 + i] = &values[i];
       }
 
+      /* Set up pad elements to get things aligned to a vec4 boundary. */
       for (unsigned int i = type->vector_elements; i < 4; i++) {
-        c->prog_data.param_convert[this->uniforms * 4 + i] =
-           PARAM_CONVERT_ZERO;
-        c->prog_data.param[this->uniforms * 4 + i] = NULL;
+        static float zero = 0;
+
+        c->prog_data.param[this->uniforms * 4 + i] = &zero;
       }
 
-      this->uniform_size[this->uniforms] = type->vector_elements;
+      /* Track the size of this uniform vector, for future packing of
+       * uniforms.
+       */
+      this->uniform_vector_size[this->uniforms] = type->vector_elements;
       this->uniforms++;
 
       return 1;
@@ -392,6 +563,37 @@ vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
    }
 }
 
+void
+vec4_visitor::setup_uniform_clipplane_values()
+{
+   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
+
+   /* Pre-Gen6, we compact clip planes.  For example, if the user
+    * enables just clip planes 0, 1, and 3, we will enable clip planes
+    * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
+    * plane 2.  This simplifies the implementation of the clip
+    * thread.
+    *
+    * In Gen6 and later, we don't compact clip planes, because this
+    * simplifies the implementation of gl_ClipDistance.
+    */
+   int compacted_clipplane_index = 0;
+   for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
+      if (intel->gen < 6 &&
+          !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
+         continue;
+      }
+      this->uniform_vector_size[this->uniforms] = 4;
+      this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
+      this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
+      for (int j = 0; j < 4; ++j) {
+         c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
+      }
+      ++compacted_clipplane_index;
+      ++this->uniforms;
+   }
+}
+
 /* Our support for builtin uniforms is even scarier than non-builtin.
  * It sits on top of the PROG_STATE_VAR parameters that are
  * automatically updated from GL context state.
@@ -412,20 +614,18 @@ vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
                                            (gl_state_index *)slots[i].tokens);
       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
 
-      this->uniform_size[this->uniforms] = 0;
+      this->uniform_vector_size[this->uniforms] = 0;
       /* Add each of the unique swizzled channels of the element.
        * This will end up matching the size of the glsl_type of this field.
        */
       int last_swiz = -1;
       for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
-        if (swiz == last_swiz)
-           break;
         last_swiz = swiz;
 
         c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
-        c->prog_data.param_convert[this->uniforms * 4 + j] = PARAM_NO_CONVERT;
-        this->uniform_size[this->uniforms]++;
+        if (swiz <= last_swiz)
+           this->uniform_vector_size[this->uniforms]++;
       }
       this->uniforms++;
    }
@@ -438,59 +638,76 @@ vec4_visitor::variable_storage(ir_variable *var)
 }
 
 void
-vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
+vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 {
    ir_expression *expr = ir->as_expression();
 
+   *predicate = BRW_PREDICATE_NORMAL;
+
    if (expr) {
       src_reg op[2];
       vec4_instruction *inst;
 
       assert(expr->get_num_operands() <= 2);
       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
-        assert(expr->operands[i]->type->is_scalar());
-
         expr->operands[i]->accept(this);
         op[i] = this->result;
+
+        resolve_ud_negate(&op[i]);
       }
 
       switch (expr->operation) {
       case ir_unop_logic_not:
-        inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], src_reg(1));
+        inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;
 
       case ir_binop_logic_xor:
-        inst = emit(BRW_OPCODE_XOR, dst_null_d(), op[0], op[1]);
+        inst = emit(XOR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;
 
       case ir_binop_logic_or:
-        inst = emit(BRW_OPCODE_OR, dst_null_d(), op[0], op[1]);
+        inst = emit(OR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;
 
       case ir_binop_logic_and:
-        inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], op[1]);
+        inst = emit(AND(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;
 
       case ir_unop_f2b:
         if (intel->gen >= 6) {
-           inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0.0f));
+           emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
         } else {
-           inst = emit(BRW_OPCODE_MOV, dst_null_f(), op[0]);
+           inst = emit(MOV(dst_null_f(), op[0]));
+           inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
-        inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;
 
       case ir_unop_i2b:
         if (intel->gen >= 6) {
-           inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
+           emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         } else {
-           inst = emit(BRW_OPCODE_MOV, dst_null_d(), op[0]);
+           inst = emit(MOV(dst_null_d(), op[0]));
+           inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
-        inst->conditional_mod = BRW_CONDITIONAL_NZ;
+        break;
+
+      case ir_binop_all_equal:
+        inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
+        *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+        break;
+
+      case ir_binop_any_nequal:
+        inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
+        *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+        break;
+
+      case ir_unop_any:
+        inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+        *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;
 
       case ir_binop_greater:
@@ -498,12 +715,9 @@ vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
       case ir_binop_less:
       case ir_binop_lequal:
       case ir_binop_equal:
-      case ir_binop_all_equal:
       case ir_binop_nequal:
-      case ir_binop_any_nequal:
-        inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
-        inst->conditional_mod =
-           brw_conditional_for_comparison(expr->operation);
+        emit(CMP(dst_null_d(), op[0], op[1],
+                 brw_conditional_for_comparison(expr->operation)));
         break;
 
       default:
@@ -515,12 +729,14 @@ vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
 
    ir->accept(this);
 
+   resolve_ud_negate(&this->result);
+
    if (intel->gen >= 6) {
-      vec4_instruction *inst = emit(BRW_OPCODE_AND, dst_null_d(),
-                              this->result, src_reg(1));
+      vec4_instruction *inst = emit(AND(dst_null_d(),
+                                       this->result, src_reg(1)));
       inst->conditional_mod = BRW_CONDITIONAL_NZ;
    } else {
-      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst_null_d(), this->result);
+      vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
       inst->conditional_mod = BRW_CONDITIONAL_NZ;
    }
 }
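+
+/* BRW_PREDICATE_ALIGN16_ALL4H passes only when all four flag channels are
+ * set, ANY4H when at least one is; this is how the vector all_equal /
+ * any_nequal / any results above collapse into a single predicated test.
+ */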
@@ -536,52 +752,41 @@ vec4_visitor::emit_if_gen6(ir_if *ir)
 
    if (expr) {
       src_reg op[2];
-      vec4_instruction *inst;
       dst_reg temp;
 
       assert(expr->get_num_operands() <= 2);
       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
-        assert(expr->operands[i]->type->is_scalar() ||
-               expr->operation == ir_binop_any_nequal ||
-               expr->operation == ir_binop_all_equal);
-
         expr->operands[i]->accept(this);
         op[i] = this->result;
       }
 
       switch (expr->operation) {
       case ir_unop_logic_not:
-        inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
-        inst->conditional_mod = BRW_CONDITIONAL_Z;
+        emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
         return;
 
       case ir_binop_logic_xor:
-        inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
-        inst->conditional_mod = BRW_CONDITIONAL_NZ;
+        emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
         return;
 
       case ir_binop_logic_or:
         temp = dst_reg(this, glsl_type::bool_type);
-        emit(BRW_OPCODE_OR, temp, op[0], op[1]);
-        inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
-        inst->conditional_mod = BRW_CONDITIONAL_NZ;
+        emit(OR(temp, op[0], op[1]));
+        emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         return;
 
       case ir_binop_logic_and:
         temp = dst_reg(this, glsl_type::bool_type);
-        emit(BRW_OPCODE_AND, temp, op[0], op[1]);
-        inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
-        inst->conditional_mod = BRW_CONDITIONAL_NZ;
+        emit(AND(temp, op[0], op[1]));
+        emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         return;
 
       case ir_unop_f2b:
-        inst = emit(BRW_OPCODE_IF, dst_null_f(), op[0], src_reg(0));
-        inst->conditional_mod = BRW_CONDITIONAL_NZ;
+        emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;
 
       case ir_unop_i2b:
-        inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
-        inst->conditional_mod = BRW_CONDITIONAL_NZ;
+        emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;
 
       case ir_binop_greater:
@@ -590,31 +795,28 @@ vec4_visitor::emit_if_gen6(ir_if *ir)
       case ir_binop_lequal:
       case ir_binop_equal:
       case ir_binop_nequal:
-        inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
-        inst->conditional_mod =
-           brw_conditional_for_comparison(expr->operation);
+        emit(IF(op[0], op[1],
+                brw_conditional_for_comparison(expr->operation)));
         return;
 
       case ir_binop_all_equal:
-        inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
-        inst->conditional_mod = BRW_CONDITIONAL_Z;
-
-        inst = emit(BRW_OPCODE_IF);
-        inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
+        emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
         return;
 
       case ir_binop_any_nequal:
-        inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
-        inst->conditional_mod = BRW_CONDITIONAL_NZ;
+        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
+        emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
+        return;
 
-        inst = emit(BRW_OPCODE_IF);
-        inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      case ir_unop_any:
+        emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+        emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
         return;
 
       default:
         assert(!"not reached");
-        inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
-        inst->conditional_mod = BRW_CONDITIONAL_NZ;
+        emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;
       }
       return;
@@ -622,9 +824,7 @@ vec4_visitor::emit_if_gen6(ir_if *ir)
 
    ir->condition->accept(this);
 
-   vec4_instruction *inst = emit(BRW_OPCODE_IF, dst_null_d(),
-                           this->result, src_reg(0));
-   inst->conditional_mod = BRW_CONDITIONAL_NZ;
+   emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 }
 
 void
@@ -638,6 +838,19 @@ vec4_visitor::visit(ir_variable *ir)
    switch (ir->mode) {
    case ir_var_in:
       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
+
+      /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
+       * come in as floating point conversions of the integer values.
+       */
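+      /* For example, a GL_FIXED (16.16) value of 1.0 arrives here as
+       * 65536.0f, and the MUL below scales it back by 1.0/65536.
+       */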
+      for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
+        if (!c->key.gl_fixed_input_size[i])
+           continue;
+
+        dst_reg dst = *reg;
+         dst.type = brw_type_for_base_type(ir->type);
+        dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
+        emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
+      }
       break;
 
    case ir_var_out:
@@ -646,6 +859,9 @@ vec4_visitor::visit(ir_variable *ir)
       for (int i = 0; i < type_size(ir->type); i++) {
         output_reg[ir->location + i] = *reg;
         output_reg[ir->location + i].reg_offset = i;
+        output_reg[ir->location + i].type =
+            brw_type_for_base_type(ir->type->get_scalar_type());
+        output_reg_annotation[ir->location + i] = ir->name;
       }
       break;
 
@@ -657,6 +873,11 @@ vec4_visitor::visit(ir_variable *ir)
    case ir_var_uniform:
       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 
+      /* Track how big the whole uniform variable is, in case we need to put a
+       * copy of its data into pull constants for array access.
+       */
+      this->uniform_size[this->uniforms] = type_size(ir->type);
+
       if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
       } else {
@@ -664,6 +885,27 @@ vec4_visitor::visit(ir_variable *ir)
       }
       break;
 
+   case ir_var_system_value:
+      /* VertexID is stored by the VF as the last vertex element, but
+       * we don't represent it with a flag in inputs_read, so we call
+       * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
+       */
+      reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
+      prog_data->uses_vertexid = true;
+
+      switch (ir->location) {
+      case SYSTEM_VALUE_VERTEX_ID:
+        reg->writemask = WRITEMASK_X;
+        break;
+      case SYSTEM_VALUE_INSTANCE_ID:
+        reg->writemask = WRITEMASK_Y;
+        break;
+      default:
+        assert(!"not reached");
+        break;
+      }
+      break;
+
    default:
       assert(!"not reached");
    }
@@ -675,58 +917,46 @@ vec4_visitor::visit(ir_variable *ir)
 void
 vec4_visitor::visit(ir_loop *ir)
 {
-   ir_dereference_variable *counter = NULL;
-
-   fail("not yet\n");
+   dst_reg counter;
 
    /* We don't want debugging output to print the whole body of the
     * loop as the annotation.
     */
    this->base_ir = NULL;
 
-   if (ir->counter != NULL)
-      counter = new(ir) ir_dereference_variable(ir->counter);
-
-   if (ir->from != NULL) {
-      assert(ir->counter != NULL);
+   if (ir->counter != NULL) {
+      this->base_ir = ir->counter;
+      ir->counter->accept(this);
+      counter = *(variable_storage(ir->counter));
 
-      ir_assignment *a = new(ir) ir_assignment(counter, ir->from, NULL);
+      if (ir->from != NULL) {
+        this->base_ir = ir->from;
+        ir->from->accept(this);
 
-      a->accept(this);
-      delete a;
+        emit(MOV(counter, this->result));
+      }
    }
 
    emit(BRW_OPCODE_DO);
 
    if (ir->to) {
-      ir_expression *e =
-        new(ir) ir_expression(ir->cmp, glsl_type::bool_type,
-                              counter, ir->to);
-      ir_if *if_stmt =  new(ir) ir_if(e);
-
-      ir_loop_jump *brk = new(ir) ir_loop_jump(ir_loop_jump::jump_break);
+      this->base_ir = ir->to;
+      ir->to->accept(this);
 
-      if_stmt->then_instructions.push_tail(brk);
+      emit(CMP(dst_null_d(), src_reg(counter), this->result,
+              brw_conditional_for_comparison(ir->cmp)));
 
-      if_stmt->accept(this);
-
-      delete if_stmt;
-      delete e;
-      delete brk;
+      vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
+      inst->predicate = BRW_PREDICATE_NORMAL;
    }
 
    visit_instructions(&ir->body_instructions);
 
-   if (ir->increment) {
-      ir_expression *e =
-        new(ir) ir_expression(ir_binop_add, counter->type,
-                              counter, ir->increment);
-
-      ir_assignment *a = new(ir) ir_assignment(counter, e, NULL);
 
-      a->accept(this);
-      delete a;
-      delete e;
+   if (ir->increment) {
+      this->base_ir = ir->increment;
+      ir->increment->accept(this);
+      emit(ADD(counter, src_reg(counter), this->result));
    }
 
    emit(BRW_OPCODE_WHILE);
@@ -771,7 +1001,7 @@ vec4_visitor::visit(ir_function *ir)
    }
 }
 
-GLboolean
+bool
 vec4_visitor::try_emit_sat(ir_expression *ir)
 {
    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
@@ -783,7 +1013,7 @@ vec4_visitor::try_emit_sat(ir_expression *ir)
 
    this->result = src_reg(this, ir->type);
    vec4_instruction *inst;
-   inst = emit(BRW_OPCODE_MOV, dst_reg(this->result), src);
+   inst = emit(MOV(dst_reg(this->result), src));
    inst->saturate = true;
 
    return true;
@@ -797,11 +1027,10 @@ vec4_visitor::emit_bool_comparison(unsigned int op,
    if (intel->gen < 5)
       dst.type = src0.type;
 
-   vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst, src0, src1);
-   inst->conditional_mod = brw_conditional_for_comparison(op);
+   emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
 
    dst.type = BRW_REGISTER_TYPE_D;
-   emit(BRW_OPCODE_AND, dst, src_reg(dst), src_reg(0x1));
+   emit(AND(dst, src_reg(dst), src_reg(0x1)));
 }
 
 void
@@ -859,7 +1088,7 @@ vec4_visitor::visit(ir_expression *ir)
       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * the one's complement of the whole register, not just bit 0.
        */
-      emit(BRW_OPCODE_XOR, result_dst, op[0], src_reg(1));
+      emit(XOR(result_dst, op[0], src_reg(1)));
       break;
    case ir_unop_neg:
       op[0].negate = !op[0].negate;
@@ -872,16 +1101,14 @@ vec4_visitor::visit(ir_expression *ir)
       break;
 
    case ir_unop_sign:
-      emit(BRW_OPCODE_MOV, result_dst, src_reg(0.0f));
+      emit(MOV(result_dst, src_reg(0.0f)));
 
-      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
-      inst->conditional_mod = BRW_CONDITIONAL_G;
-      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1.0f));
+      emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
+      inst = emit(MOV(result_dst, src_reg(1.0f)));
       inst->predicate = BRW_PREDICATE_NORMAL;
 
-      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
-      inst->conditional_mod = BRW_CONDITIONAL_L;
-      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(-1.0f));
+      emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
+      inst = emit(MOV(result_dst, src_reg(-1.0f)));
       inst->predicate = BRW_PREDICATE_NORMAL;
 
       break;
@@ -919,19 +1146,40 @@ vec4_visitor::visit(ir_expression *ir)
       break;
 
    case ir_binop_add:
-      emit(BRW_OPCODE_ADD, result_dst, op[0], op[1]);
+      emit(ADD(result_dst, op[0], op[1]));
       break;
    case ir_binop_sub:
       assert(!"not reached: should be handled by ir_sub_to_add_neg");
       break;
 
    case ir_binop_mul:
-      emit(BRW_OPCODE_MUL, result_dst, op[0], op[1]);
+      if (ir->type->is_integer()) {
+        /* For integer multiplication, the MUL uses the low 16 bits
+         * of one of the operands (src0 on gen6, src1 on gen7).  The
+        * MACH then adds in the contribution of the upper 16 bits
+        * of that operand.
+         *
+         * FINISHME: Emit just the MUL if we know an operand is small
+         * enough.
+         */
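+        /* In effect:
+         *    mul  acc,  op0, op1   (uses the low 16 bits of one operand)
+         *    mach null, op0, op1   (folds in the high 16 bits; acc now
+         *                           holds the full 32-bit product)
+         *    mov  dst,  acc
+         */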
+        struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
+
+        emit(MUL(acc, op[0], op[1]));
+        emit(MACH(dst_null_d(), op[0], op[1]));
+        emit(MOV(result_dst, src_reg(acc)));
+      } else {
+        emit(MUL(result_dst, op[0], op[1]));
+      }
       break;
    case ir_binop_div:
-      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
+      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
+      assert(ir->type->is_integer());
+      emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
+      break;
    case ir_binop_mod:
-      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
+      /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
+      assert(ir->type->is_integer());
+      emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
       break;
 
    case ir_binop_less:
@@ -940,14 +1188,9 @@ vec4_visitor::visit(ir_expression *ir)
    case ir_binop_gequal:
    case ir_binop_equal:
    case ir_binop_nequal: {
-      dst_reg temp = result_dst;
-      /* original gen4 does implicit conversion before comparison. */
-      if (intel->gen < 5)
-        temp.type = op[0].type;
-
-      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
-      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
-      emit(BRW_OPCODE_AND, result_dst, this->result, src_reg(0x1));
+      emit(CMP(result_dst, op[0], op[1],
+              brw_conditional_for_comparison(ir->operation)));
+      emit(AND(result_dst, result_src, src_reg(0x1)));
       break;
    }
 
@@ -955,63 +1198,48 @@ vec4_visitor::visit(ir_expression *ir)
       /* "==" operator producing a scalar boolean. */
       if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
-        inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
-        inst->conditional_mod = BRW_CONDITIONAL_Z;
-
-        emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
-        inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
+        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
+        emit(MOV(result_dst, src_reg(0)));
+        inst = emit(MOV(result_dst, src_reg(1)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
       } else {
-        dst_reg temp = result_dst;
-        /* original gen4 does implicit conversion before comparison. */
-        if (intel->gen < 5)
-           temp.type = op[0].type;
-
-        inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
-        inst->conditional_mod = BRW_CONDITIONAL_Z;
-        emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
+        emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
+        emit(AND(result_dst, result_src, src_reg(0x1)));
       }
       break;
    case ir_binop_any_nequal:
       /* "!=" operator producing a scalar boolean. */
       if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
-        inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
-        inst->conditional_mod = BRW_CONDITIONAL_NZ;
+        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 
-        emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
-        inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
+        emit(MOV(result_dst, src_reg(0)));
+        inst = emit(MOV(result_dst, src_reg(1)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
       } else {
-        dst_reg temp = result_dst;
-        /* original gen4 does implicit conversion before comparison. */
-        if (intel->gen < 5)
-           temp.type = op[0].type;
-
-        inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
-        inst->conditional_mod = BRW_CONDITIONAL_NZ;
-        emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
+        emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
+        emit(AND(result_dst, result_src, src_reg(0x1)));
       }
       break;
 
    case ir_unop_any:
-      emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
-      emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
+      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+      emit(MOV(result_dst, src_reg(0)));
 
-      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
+      inst = emit(MOV(result_dst, src_reg(1)));
       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
       break;
 
    case ir_binop_logic_xor:
-      emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
+      emit(XOR(result_dst, op[0], op[1]));
       break;
 
    case ir_binop_logic_or:
-      emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
+      emit(OR(result_dst, op[0], op[1]));
       break;
 
    case ir_binop_logic_and:
-      emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
+      emit(AND(result_dst, op[0], op[1]));
       break;
 
    case ir_binop_dot:
@@ -1033,52 +1261,54 @@ vec4_visitor::visit(ir_expression *ir)
    case ir_unop_b2f:
    case ir_unop_b2i:
    case ir_unop_f2i:
-      emit(BRW_OPCODE_MOV, result_dst, op[0]);
+      emit(MOV(result_dst, op[0]));
       break;
    case ir_unop_f2b:
    case ir_unop_i2b: {
-      dst_reg temp = result_dst;
-      /* original gen4 does implicit conversion before comparison. */
-      if (intel->gen < 5)
-        temp.type = op[0].type;
-
-      inst = emit(BRW_OPCODE_CMP, temp, op[0], src_reg(0.0f));
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      inst = emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(1));
+      emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
+      emit(AND(result_dst, result_src, src_reg(1)));
       break;
    }
 
    case ir_unop_trunc:
-      emit(BRW_OPCODE_RNDZ, result_dst, op[0]);
+      emit(RNDZ(result_dst, op[0]));
       break;
    case ir_unop_ceil:
       op[0].negate = !op[0].negate;
-      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
+      inst = emit(RNDD(result_dst, op[0]));
       this->result.negate = true;
       break;
    case ir_unop_floor:
-      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
+      inst = emit(RNDD(result_dst, op[0]));
       break;
    case ir_unop_fract:
-      inst = emit(BRW_OPCODE_FRC, result_dst, op[0]);
+      inst = emit(FRC(result_dst, op[0]));
       break;
    case ir_unop_round_even:
-      emit(BRW_OPCODE_RNDE, result_dst, op[0]);
+      emit(RNDE(result_dst, op[0]));
       break;
 
    case ir_binop_min:
-      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
-      inst->conditional_mod = BRW_CONDITIONAL_L;
+      if (intel->gen >= 6) {
+        inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
+        inst->conditional_mod = BRW_CONDITIONAL_L;
+      } else {
+        emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
 
-      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
-      inst->predicate = BRW_PREDICATE_NORMAL;
+        inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
+        inst->predicate = BRW_PREDICATE_NORMAL;
+      }
       break;
    case ir_binop_max:
-      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
-      inst->conditional_mod = BRW_CONDITIONAL_G;
+      if (intel->gen >= 6) {
+        inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
+        inst->conditional_mod = BRW_CONDITIONAL_G;
+      } else {
+        emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
 
-      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
-      inst->predicate = BRW_PREDICATE_NORMAL;
+        inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
+        inst->predicate = BRW_PREDICATE_NORMAL;
+      }
       break;
 
    case ir_binop_pow:
@@ -1086,21 +1316,27 @@ vec4_visitor::visit(ir_expression *ir)
       break;
 
    case ir_unop_bit_not:
-      inst = emit(BRW_OPCODE_NOT, result_dst, op[0]);
+      inst = emit(NOT(result_dst, op[0]));
       break;
    case ir_binop_bit_and:
-      inst = emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
+      inst = emit(AND(result_dst, op[0], op[1]));
       break;
    case ir_binop_bit_xor:
-      inst = emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
+      inst = emit(XOR(result_dst, op[0], op[1]));
       break;
    case ir_binop_bit_or:
-      inst = emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
+      inst = emit(OR(result_dst, op[0], op[1]));
       break;
 
    case ir_binop_lshift:
+      inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
+      break;
+
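+   /* A right shift must preserve the sign for signed types, hence ASR
+    * rather than SHR below.
+    */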
    case ir_binop_rshift:
-      assert(!"GLSL 1.30 features unsupported");
+      if (ir->type->base_type == GLSL_TYPE_INT)
+        inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
+      else
+        inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
       break;
 
    case ir_quadop_vector:
@@ -1185,7 +1421,6 @@ vec4_visitor::visit(ir_dereference_array *ir)
    if (constant_index) {
       src.reg_offset += constant_index->value.i[0] * element_size;
    } else {
-#if 0 /* Variable array index */
       /* Variable index array dereference.  It eats the "vec4" of the
        * base of the array and an index that offsets the Mesa register
        * index.
@@ -1197,15 +1432,21 @@ vec4_visitor::visit(ir_dereference_array *ir)
       if (element_size == 1) {
         index_reg = this->result;
       } else {
-        index_reg = src_reg(this, glsl_type::float_type);
+        index_reg = src_reg(this, glsl_type::int_type);
+
+        emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
+      }
 
-        emit(BRW_OPCODE_MUL, dst_reg(index_reg),
-             this->result, src_reg_for_float(element_size));
+      if (src.reladdr) {
+        src_reg temp = src_reg(this, glsl_type::int_type);
+
+        emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
+
+        index_reg = temp;
       }
 
       src.reladdr = ralloc(mem_ctx, src_reg);
       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
-#endif
    }
 
    /* If the type is smaller than a vec4, replicate the last channel out. */
@@ -1269,44 +1510,148 @@ get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
 }
 
 void
-vec4_visitor::emit_block_move(ir_assignment *ir)
+vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
+                             const struct glsl_type *type, uint32_t predicate)
 {
-   ir->rhs->accept(this);
-   src_reg src = this->result;
+   if (type->base_type == GLSL_TYPE_STRUCT) {
+      for (unsigned int i = 0; i < type->length; i++) {
+        emit_block_move(dst, src, type->fields.structure[i].type, predicate);
+      }
+      return;
+   }
 
-   dst_reg dst = get_assignment_lhs(ir->lhs, this);
+   if (type->is_array()) {
+      for (unsigned int i = 0; i < type->length; i++) {
+        emit_block_move(dst, src, type->fields.array, predicate);
+      }
+      return;
+   }
+
+   if (type->is_matrix()) {
+      const struct glsl_type *vec_type;
+
+      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
+                                        type->vector_elements, 1);
+
+      for (int i = 0; i < type->matrix_columns; i++) {
+        emit_block_move(dst, src, vec_type, predicate);
+      }
+      return;
+   }
+
+   assert(type->is_scalar() || type->is_vector());
+
+   dst->type = brw_type_for_base_type(type);
+   src->type = dst->type;
+
+   dst->writemask = (1 << type->vector_elements) - 1;
+
+   src->swizzle = swizzle_for_size(type->vector_elements);
+
+   vec4_instruction *inst = emit(MOV(*dst, *src));
+   inst->predicate = predicate;
+
+   dst->reg_offset++;
+   src->reg_offset++;
+}
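+
+/* E.g. moving a mat3 recurses into three vec3 moves: each column is an
+ * .xyz-masked MOV, and dst/src advance by one register per column.
+ */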
 
-   /* FINISHME: This should really set to the correct maximal writemask for each
-    * FINISHME: component written (in the loops below).
+
+/* If the RHS processing resulted in an instruction generating a
+ * temporary value, and it would be easy to rewrite the instruction to
+ * generate its result right into the LHS instead, do so.  This ends
+ * up reliably removing instructions where it can be tricky to do so
+ * later without real UD chain information.
+ */
+bool
+vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
+                                    dst_reg dst,
+                                    src_reg src,
+                                    vec4_instruction *pre_rhs_inst,
+                                    vec4_instruction *last_rhs_inst)
+{
+   /* This could be supported, but it would take more smarts. */
+   if (ir->condition)
+      return false;
+
+   if (pre_rhs_inst == last_rhs_inst)
+      return false; /* No instructions generated to work with. */
+
+   /* Make sure the last instruction generated our source reg. */
+   if (src.file != GRF ||
+       src.file != last_rhs_inst->dst.file ||
+       src.reg != last_rhs_inst->dst.reg ||
+       src.reg_offset != last_rhs_inst->dst.reg_offset ||
+       src.reladdr ||
+       src.abs ||
+       src.negate ||
+       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
+      return false;
+
+   /* Check that the last instruction fully initialized the channels
+    * we want to use, in the order we want to use them.  We could
+    * potentially reswizzle the operands of many instructions so that
+    * we could handle out of order channels, but don't yet.
     */
-   dst.writemask = WRITEMASK_XYZW;
 
-   for (int i = 0; i < type_size(ir->lhs->type); i++) {
-      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst, src);
-      if (ir->condition)
-        inst->predicate = BRW_PREDICATE_NORMAL;
+   for (unsigned i = 0; i < 4; i++) {
+      if (dst.writemask & (1 << i)) {
+        if (!(last_rhs_inst->dst.writemask & (1 << i)))
+           return false;
 
-      dst.reg_offset++;
-      src.reg_offset++;
+        if (BRW_GET_SWZ(src.swizzle, i) != i)
+           return false;
+      }
    }
+
+   /* Success!  Rewrite the instruction. */
+   last_rhs_inst->dst.file = dst.file;
+   last_rhs_inst->dst.reg = dst.reg;
+   last_rhs_inst->dst.reg_offset = dst.reg_offset;
+   last_rhs_inst->dst.reladdr = dst.reladdr;
+   last_rhs_inst->dst.writemask &= dst.writemask;
+
+   return true;
 }
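+
+/* E.g. for "v = a + b;" the RHS leaves "ADD tmp, a, b" as the last
+ * instruction emitted; on success the ADD is rewritten to target v
+ * directly, and the caller skips emitting the copy MOV entirely.
+ */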
 
 void
 vec4_visitor::visit(ir_assignment *ir)
 {
+   dst_reg dst = get_assignment_lhs(ir->lhs, this);
+   uint32_t predicate = BRW_PREDICATE_NONE;
+
    if (!ir->lhs->type->is_scalar() &&
        !ir->lhs->type->is_vector()) {
-      emit_block_move(ir);
+      ir->rhs->accept(this);
+      src_reg src = this->result;
+
+      if (ir->condition) {
+        emit_bool_to_cond_code(ir->condition, &predicate);
+      }
+
+      /* emit_block_move doesn't account for swizzles in the source register.
+       * This should be ok, since the source register is a structure or an
+       * array, and those can't be swizzled.  But double-check to be sure.
+       */
+      assert(src.swizzle ==
+             (ir->rhs->type->is_matrix()
+              ? swizzle_for_size(ir->rhs->type->vector_elements)
+              : BRW_SWIZZLE_NOOP));
+
+      emit_block_move(&dst, &src, ir->rhs->type, predicate);
       return;
    }
 
    /* Now we're down to just a scalar/vector with writemasks. */
    int i;
 
+   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
+   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
+
    ir->rhs->accept(this);
-   src_reg src = this->result;
 
-   dst_reg dst = get_assignment_lhs(ir->lhs, this);
+   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
+
+   src_reg src = this->result;
 
    int swizzles[4];
    int first_enabled_chan = 0;
@@ -1338,111 +1683,113 @@ vec4_visitor::visit(ir_assignment *ir)
    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                              swizzles[2], swizzles[3]);
 
+   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
+      return;
+   }
+
    if (ir->condition) {
-      emit_bool_to_cond_code(ir->condition);
+      emit_bool_to_cond_code(ir->condition, &predicate);
    }
 
    for (i = 0; i < type_size(ir->lhs->type); i++) {
-      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst, src);
-
-      if (ir->condition)
-        inst->predicate = BRW_PREDICATE_NORMAL;
+      vec4_instruction *inst = emit(MOV(dst, src));
+      inst->predicate = predicate;
 
       dst.reg_offset++;
       src.reg_offset++;
    }
 }
 
-
 void
-vec4_visitor::visit(ir_constant *ir)
+vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
 {
    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
-      src_reg temp_base = src_reg(this, ir->type);
-      dst_reg temp = dst_reg(temp_base);
-
-      foreach_iter(exec_list_iterator, iter, ir->components) {
-        ir_constant *field_value = (ir_constant *)iter.get();
-        int size = type_size(field_value->type);
+      foreach_list(node, &ir->components) {
+        ir_constant *field_value = (ir_constant *)node;
 
-        assert(size > 0);
-
-        field_value->accept(this);
-        src_reg src = this->result;
-
-        for (int i = 0; i < (unsigned int)size; i++) {
-           emit(BRW_OPCODE_MOV, temp, src);
-
-           src.reg_offset++;
-           temp.reg_offset++;
-        }
+        emit_constant_values(dst, field_value);
       }
-      this->result = temp_base;
       return;
    }
 
    if (ir->type->is_array()) {
-      src_reg temp_base = src_reg(this, ir->type);
-      dst_reg temp = dst_reg(temp_base);
-      int size = type_size(ir->type->fields.array);
-
-      assert(size > 0);
-
       for (unsigned int i = 0; i < ir->type->length; i++) {
-        ir->array_elements[i]->accept(this);
-        src_reg src = this->result;
-        for (int j = 0; j < size; j++) {
-           emit(BRW_OPCODE_MOV, temp, src);
-
-           src.reg_offset++;
-           temp.reg_offset++;
-        }
+        emit_constant_values(dst, ir->array_elements[i]);
       }
-      this->result = temp_base;
       return;
    }
 
    if (ir->type->is_matrix()) {
-      this->result = src_reg(this, ir->type);
-      dst_reg dst = dst_reg(this->result);
-
-      assert(ir->type->base_type == GLSL_TYPE_FLOAT);
-
       for (int i = 0; i < ir->type->matrix_columns; i++) {
+        float *vec = &ir->value.f[i * ir->type->vector_elements];
+
         for (int j = 0; j < ir->type->vector_elements; j++) {
-           dst.writemask = 1 << j;
-           emit(BRW_OPCODE_MOV, dst,
-                src_reg(ir->value.f[i * ir->type->vector_elements + j]));
+           dst->writemask = 1 << j;
+           dst->type = BRW_REGISTER_TYPE_F;
+
+           emit(MOV(*dst, src_reg(vec[j])));
         }
-        dst.reg_offset++;
+        dst->reg_offset++;
       }
       return;
    }
 
-   this->result = src_reg(this, ir->type);
-   dst_reg dst = dst_reg(this->result);
+   int remaining_writemask = (1 << ir->type->vector_elements) - 1;
 
    for (int i = 0; i < ir->type->vector_elements; i++) {
-      dst.writemask = 1 << i;
+      if (!(remaining_writemask & (1 << i)))
+        continue;
+
+      dst->writemask = 1 << i;
+      dst->type = brw_type_for_base_type(ir->type);
+
+      /* Find other components that match the one we're about to
+       * write.  Emits fewer instructions for things like vec4(0.5,
+       * 1.5, 1.5, 1.5).
+       */
+      for (int j = i + 1; j < ir->type->vector_elements; j++) {
+        if (ir->type->base_type == GLSL_TYPE_BOOL) {
+           if (ir->value.b[i] == ir->value.b[j])
+              dst->writemask |= (1 << j);
+        } else {
+           /* u, i, and f storage all line up, so no need for a
+            * switch case for comparing each type.
+            */
+           if (ir->value.u[i] == ir->value.u[j])
+              dst->writemask |= (1 << j);
+        }
+      }
 
       switch (ir->type->base_type) {
       case GLSL_TYPE_FLOAT:
-        emit(BRW_OPCODE_MOV, dst, src_reg(ir->value.f[i]));
+        emit(MOV(*dst, src_reg(ir->value.f[i])));
         break;
       case GLSL_TYPE_INT:
-        emit(BRW_OPCODE_MOV, dst, src_reg(ir->value.i[i]));
+        emit(MOV(*dst, src_reg(ir->value.i[i])));
         break;
       case GLSL_TYPE_UINT:
-        emit(BRW_OPCODE_MOV, dst, src_reg(ir->value.u[i]));
+        emit(MOV(*dst, src_reg(ir->value.u[i])));
         break;
       case GLSL_TYPE_BOOL:
-        emit(BRW_OPCODE_MOV, dst, src_reg(ir->value.b[i]));
+        emit(MOV(*dst, src_reg(ir->value.b[i])));
         break;
       default:
         assert(!"Non-float/uint/int/bool constant");
         break;
       }
+
+      remaining_writemask &= ~dst->writemask;
    }
+   dst->reg_offset++;
+}
+
+void
+vec4_visitor::visit(ir_constant *ir)
+{
+   dst_reg dst = dst_reg(this, ir->type);
+   this->result = src_reg(dst);
+
+   emit_constant_values(&dst, ir);
 }
 
 void
@@ -1454,7 +1801,178 @@ vec4_visitor::visit(ir_call *ir)
 void
 vec4_visitor::visit(ir_texture *ir)
 {
-   assert(!"not reached");
+   int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
+   sampler = vp->Base.SamplerUnits[sampler];
+
+   /* Should be lowered by do_lower_texture_projection */
+   assert(!ir->projector);
+
+   vec4_instruction *inst = NULL;
+   switch (ir->op) {
+   case ir_tex:
+   case ir_txl:
+      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
+      break;
+   case ir_txd:
+      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
+      break;
+   case ir_txf:
+      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
+      break;
+   case ir_txs:
+      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
+      break;
+   case ir_txb:
+      assert(!"TXB is not valid for vertex shaders.");
+   }
+
+   /* Texel offsets go in the message header; Gen4 also requires headers. */
+   inst->header_present = ir->offset || intel->gen < 5;
+   inst->base_mrf = 2;
+   inst->mlen = inst->header_present + 1; /* always at least one */
+   inst->sampler = sampler;
+   inst->dst = dst_reg(this, ir->type);
+   inst->shadow_compare = ir->shadow_comparitor != NULL;
+
+   if (ir->offset != NULL)
+      inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
+
+   /* MRF for the first parameter */
+   int param_base = inst->base_mrf + inst->header_present;
+
+   if (ir->op == ir_txs) {
+      ir->lod_info.lod->accept(this);
+      int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
+      emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
+          this->result));
+   } else {
+      int i, coord_mask = 0, zero_mask = 0;
+      /* Load the coordinate */
+      /* FINISHME: gl_clamp_mask and saturate */
+      for (i = 0; i < ir->coordinate->type->vector_elements; i++)
+        coord_mask |= (1 << i);
+      for (; i < 4; i++)
+        zero_mask |= (1 << i);
+
+      ir->coordinate->accept(this);
+      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
+              this->result));
+      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
+              src_reg(0)));
+      /* Load the shadow comparitor */
+      if (ir->shadow_comparitor) {
+        ir->shadow_comparitor->accept(this);
+        emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
+                         WRITEMASK_X),
+                 this->result));
+        inst->mlen++;
+      }
+
+      /* Load the LOD info */
+      if (ir->op == ir_txl) {
+        int mrf, writemask;
+        if (intel->gen >= 5) {
+           mrf = param_base + 1;
+           if (ir->shadow_comparitor) {
+              writemask = WRITEMASK_Y;
+              /* mlen already incremented */
+           } else {
+              writemask = WRITEMASK_X;
+              inst->mlen++;
+           }
+        } else /* intel->gen == 4 */ {
+           mrf = param_base;
+           writemask = WRITEMASK_Z;
+        }
+        ir->lod_info.lod->accept(this);
+        emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask),
+                 this->result));
+      } else if (ir->op == ir_txf) {
+        ir->lod_info.lod->accept(this);
+        emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
+                 this->result));
+      } else if (ir->op == ir_txd) {
+        const glsl_type *type = ir->lod_info.grad.dPdx->type;
+
+        ir->lod_info.grad.dPdx->accept(this);
+        src_reg dPdx = this->result;
+        ir->lod_info.grad.dPdy->accept(this);
+        src_reg dPdy = this->result;
+
+        if (intel->gen >= 5) {
+           dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
+           dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
+           emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
+           emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
+           inst->mlen++;
+
+           if (ir->type->vector_elements == 3) {
+              dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
+              dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
+              emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
+              emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
+              inst->mlen++;
+           }
+        } else /* intel->gen == 4 */ {
+           emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
+           emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
+           inst->mlen += 2;
+        }
+      }
+   }
+
+   emit(inst);
+
+   swizzle_result(ir, src_reg(inst->dst), sampler);
+}
+
+void
+vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
+{
+   this->result = orig_val;
+
+   int s = c->key.tex.swizzles[sampler];
+
+   if (ir->op == ir_txs || ir->type == glsl_type::float_type
+                       || s == SWIZZLE_NOOP)
+      return;
+
+   int zero_mask = 0, one_mask = 0, copy_mask = 0;
+   int swizzle[4];
+
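+   /* Example: a GL_LUMINANCE texture swizzles to (R, R, R, 1), so
+    * copy_mask ends up as .xyz (each channel copying from X) and
+    * one_mask as .w.
+    */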
+   for (int i = 0; i < 4; i++) {
+      switch (GET_SWZ(s, i)) {
+      case SWIZZLE_ZERO:
+        zero_mask |= (1 << i);
+        break;
+      case SWIZZLE_ONE:
+        one_mask |= (1 << i);
+        break;
+      default:
+        copy_mask |= (1 << i);
+        swizzle[i] = GET_SWZ(s, i);
+        break;
+      }
+   }
+
+   this->result = src_reg(this, ir->type);
+   dst_reg swizzled_result(this->result);
+
+   if (copy_mask) {
+      orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1],
+                                      swizzle[2], swizzle[3]);
+      swizzled_result.writemask = copy_mask;
+      emit(MOV(swizzled_result, orig_val));
+   }
+
+   if (zero_mask) {
+      swizzled_result.writemask = zero_mask;
+      emit(MOV(swizzled_result, src_reg(0.0f)));
+   }
+
+   if (one_mask) {
+      swizzled_result.writemask = one_mask;
+      emit(MOV(swizzled_result, src_reg(1.0f)));
+   }
 }
 
 void
@@ -1480,9 +1998,9 @@ vec4_visitor::visit(ir_if *ir)
    if (intel->gen == 6) {
       emit_if_gen6(ir);
    } else {
-      emit_bool_to_cond_code(ir->condition);
-      vec4_instruction *inst = emit(BRW_OPCODE_IF);
-      inst->predicate = BRW_PREDICATE_NORMAL;
+      uint32_t predicate;
+      emit_bool_to_cond_code(ir->condition, &predicate);
+      emit(IF(predicate));
    }
 
    visit_instructions(&ir->then_instructions);
@@ -1498,14 +2016,15 @@ vec4_visitor::visit(ir_if *ir)
    emit(BRW_OPCODE_ENDIF);
 }
 
-int
-vec4_visitor::emit_vue_header_gen4(int header_mrf)
+void
+vec4_visitor::emit_ndc_computation()
 {
    /* Get the position */
    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
 
    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
+   output_reg[BRW_VERT_RESULT_NDC] = ndc;
 
    current_annotation = "NDC";
    dst_reg ndc_w = ndc;
@@ -1517,32 +2036,39 @@ vec4_visitor::emit_vue_header_gen4(int header_mrf)
    dst_reg ndc_xyz = ndc;
    ndc_xyz.writemask = WRITEMASK_XYZ;
 
-   emit(BRW_OPCODE_MUL, ndc_xyz, pos, src_reg(ndc_w));
+   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
+}
 
-   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
-       c->key.nr_userclip || brw->has_negative_rhw_bug) {
+void
+vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
+{
+   if (intel->gen < 6 &&
+       ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
+        c->key.userclip_active || brw->has_negative_rhw_bug)) {
       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
+      dst_reg header1_w = header1;
+      header1_w.writemask = WRITEMASK_W;
       GLuint i;
 
-      emit(BRW_OPCODE_MOV, header1, 0u);
+      emit(MOV(header1, 0u));
 
       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
-        assert(!"finishme: psiz");
-        src_reg psiz;
+        src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
 
-        header1.writemask = WRITEMASK_W;
-        emit(BRW_OPCODE_MUL, header1, psiz, 1u << 11);
-        emit(BRW_OPCODE_AND, header1, src_reg(header1), 0x7ff << 8);
+        current_annotation = "Point size";
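+        /* This presumably converts the float point size into the
+         * fixed-point point-width field of the header: scale by 2^11,
+         * then keep the 11 bits at [18:8] (a U8.3 value, if that field
+         * layout assumption is right).
+         */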
+        emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
+        emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
       }
 
-      for (i = 0; i < c->key.nr_userclip; i++) {
+      current_annotation = "Clipping flags";
+      for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
         vec4_instruction *inst;
 
-        inst = emit(BRW_OPCODE_DP4, dst_reg(brw_null_reg()),
-                    pos, src_reg(c->userplane[i]));
+        inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
+                         src_reg(this->userplane[i])));
         inst->conditional_mod = BRW_CONDITIONAL_L;
 
-        emit(BRW_OPCODE_OR, header1, src_reg(header1), 1u << i);
+        inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
         inst->predicate = BRW_PREDICATE_NORMAL;
       }
 
@@ -1561,104 +2087,120 @@ vec4_visitor::emit_vue_header_gen4(int header_mrf)
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
-                brw_swizzle1(ndc, 3),
+                brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
                 brw_imm_f(0));
 
         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
-        brw_MOV(p, ndc, brw_imm_f(0));
+        brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 #endif
       }
 
-      header1.writemask = WRITEMASK_XYZW;
-      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(header1));
-   } else {
-      emit(BRW_OPCODE_MOV, retype(brw_message_reg(header_mrf++),
-                                 BRW_REGISTER_TYPE_UD), 0u);
-   }
-
-   if (intel->gen == 5) {
-      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
-       * dword 0-3 (m1) of the header is indices, point width, clip flags.
-       * dword 4-7 (m2) is the ndc position (set above)
-       * dword 8-11 (m3) of the vertex header is the 4D space position
-       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
-       * m6 is a pad so that the vertex element data is aligned
-       * m7 is the first vertex data we fill.
-       */
-      current_annotation = "NDC";
-      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
-
-      current_annotation = "gl_Position";
-      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
-
-      /* user clip distance. */
-      header_mrf += 2;
-
-      /* Pad so that vertex element data is aligned. */
-      header_mrf++;
+      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
+   } else if (intel->gen < 6) {
+      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
    } else {
-      /* There are 8 dwords in VUE header pre-Ironlake:
-       * dword 0-3 (m1) is indices, point width, clip flags.
-       * dword 4-7 (m2) is ndc position (set above)
-       *
-       * dword 8-11 (m3) is the first vertex data.
-       */
-      current_annotation = "NDC";
-      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
-
-      current_annotation = "gl_Position";
-      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
+      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
+      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
+         emit(MOV(brw_writemask(reg, WRITEMASK_W),
+                  src_reg(output_reg[VERT_RESULT_PSIZ])));
+      }
    }
-
-   return header_mrf;
 }
 
-int
-vec4_visitor::emit_vue_header_gen6(int header_mrf)
+void
+vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
 {
-   struct brw_reg reg;
+   if (intel->gen < 6) {
+      /* Clip distance slots are set aside in gen5, but they are not used.  It
+       * is not clear whether we actually need to set aside space for them,
+       * but the performance cost is negligible.
+       */
+      return;
+   }
 
-   /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
-    * dword 0-3 (m2) of the header is indices, point width, clip flags.
-    * dword 4-7 (m3) is the 4D space position
-    * dword 8-15 (m4,m5) of the vertex header is the user clip distance if
-    * enabled.
+   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
+    *
+    *     "If a linked set of shaders forming the vertex stage contains no
+    *     static write to gl_ClipVertex or gl_ClipDistance, but the
+    *     application has requested clipping against user clip planes through
+    *     the API, then the coordinate written to gl_Position is used for
+    *     comparison against the user clip planes."
     *
-    * m4 or 6 is the first vertex element data we fill.
+    * This function is only called if the shader didn't write to
+    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
+    * if the user wrote to it; otherwise we use gl_Position.
     */
+   gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
+   if (!(c->prog_data.outputs_written
+         & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
+      clip_vertex = VERT_RESULT_HPOS;
+   }
 
-   current_annotation = "indices, point width, clip flags";
-   reg = brw_message_reg(header_mrf++);
-   emit(BRW_OPCODE_MOV, retype(reg, BRW_REGISTER_TYPE_D), src_reg(0));
-   if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
-      emit(BRW_OPCODE_MOV, brw_writemask(reg, WRITEMASK_W),
-          src_reg(output_reg[VERT_RESULT_PSIZ]));
-   }
-
-   current_annotation = "gl_Position";
-   emit(BRW_OPCODE_MOV,
-       brw_message_reg(header_mrf++), src_reg(output_reg[VERT_RESULT_HPOS]));
-
-   current_annotation = "user clip distances";
-   if (c->key.nr_userclip) {
-      for (int i = 0; i < c->key.nr_userclip; i++) {
-        struct brw_reg m;
-        if (i < 4)
-           m = brw_message_reg(header_mrf);
-        else
-           m = brw_message_reg(header_mrf + 1);
-
-        emit(BRW_OPCODE_DP4,
-             dst_reg(brw_writemask(m, 1 << (i & 3))),
-             src_reg(c->userplane[i]));
-      }
-      header_mrf += 2;
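+   /* Each clip distance slot packs four DP4 results, one per channel;
+    * offset selects which group of four user clip planes this slot
+    * covers.
+    */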
+   for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
+        ++i) {
+      emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
+               src_reg(output_reg[clip_vertex]),
+               src_reg(this->userplane[i + offset])));
    }
+}
 
-   current_annotation = NULL;
+void
+vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
+{
+   assert(vert_result < VERT_RESULT_MAX);
+   reg.type = output_reg[vert_result].type;
+   current_annotation = output_reg_annotation[vert_result];
+   /* Copy the register, saturating if necessary */
+   vec4_instruction *inst = emit(MOV(reg,
+                                     src_reg(output_reg[vert_result])));
+   if ((vert_result == VERT_RESULT_COL0 ||
+        vert_result == VERT_RESULT_COL1 ||
+        vert_result == VERT_RESULT_BFC0 ||
+        vert_result == VERT_RESULT_BFC1) &&
+       c->key.clamp_vertex_color) {
+      inst->saturate = true;
+   }
+}
 
-   return header_mrf;
+void
+vec4_visitor::emit_urb_slot(int mrf, int vert_result)
+{
+   struct brw_reg hw_reg = brw_message_reg(mrf);
+   dst_reg reg = dst_reg(MRF, mrf);
+   reg.type = BRW_REGISTER_TYPE_F;
+
+   switch (vert_result) {
+   case VERT_RESULT_PSIZ:
+      /* PSIZ is always in slot 0, and is coupled with other flags. */
+      current_annotation = "indices, point width, clip flags";
+      emit_psiz_and_flags(hw_reg);
+      break;
+   case BRW_VERT_RESULT_NDC:
+      current_annotation = "NDC";
+      emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
+      break;
+   case BRW_VERT_RESULT_HPOS_DUPLICATE:
+   case VERT_RESULT_HPOS:
+      current_annotation = "gl_Position";
+      emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
+      break;
+   case VERT_RESULT_CLIP_DIST0:
+   case VERT_RESULT_CLIP_DIST1:
+      if (this->c->key.uses_clip_distance) {
+         emit_generic_urb_slot(reg, vert_result);
+      } else {
+         current_annotation = "user clip distances";
+         emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
+      }
+      break;
+   case BRW_VERT_RESULT_PAD:
+      /* No need to write to this slot */
+      break;
+   default:
+      emit_generic_urb_slot(reg, vert_result);
+      break;
+   }
 }
 
 static int
@@ -1691,67 +2233,349 @@ align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
 void
 vec4_visitor::emit_urb_writes()
 {
+   /* MRF 0 is reserved for the debugger, so start with message header
+    * in MRF 1.
+    */
    int base_mrf = 1;
    int mrf = base_mrf;
-   int urb_entry_size;
+   /* In the process of generating our URB write message contents, we
+    * may need to unspill a register or load from an array.  Those
+    * reads would use MRFs 14-15.
+    */
+   int max_usable_mrf = 13;
+
+   /* The following assertion verifies that max_usable_mrf causes an
+    * even-numbered amount of URB write data, which will meet gen6's
+    * requirements for length alignment.
+    */
+   assert((max_usable_mrf - base_mrf) % 2 == 0);
 
    /* FINISHME: edgeflag */
 
+   brw_compute_vue_map(&c->vue_map, intel, &c->prog_data);
+
    /* First mrf is the g0-based message header containing URB handles and such,
     * which is implied in VS_OPCODE_URB_WRITE.
     */
    mrf++;
 
-   if (intel->gen >= 6) {
-      mrf = emit_vue_header_gen6(mrf);
-   } else {
-      mrf = emit_vue_header_gen4(mrf);
+   if (intel->gen < 6) {
+      emit_ndc_computation();
    }
 
-   int attr;
-   for (attr = 0; attr < VERT_RESULT_MAX; attr++) {
-      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
-        continue;
-
-      /* This is set up in the VUE header. */
-      if (attr == VERT_RESULT_HPOS)
-        continue;
-
-      /* This is loaded into the VUE header, and thus doesn't occupy
-       * an attribute slot.
-       */
-      if (attr == VERT_RESULT_PSIZ)
-        continue;
-
-      emit(BRW_OPCODE_MOV, brw_message_reg(mrf++), src_reg(output_reg[attr]));
+   /* Set up the VUE data for the first URB write */
+   int slot;
+   for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
+      emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
 
-      /* If this is MRF 15, we can't fit anything more into this URB
-       * WRITE.  Note that base_mrf of 1 means that MRF 15 is an
-       * even-numbered amount of URB write data, which will meet
-       * gen6's requirements for length alignment.
+      /* If this was max_usable_mrf, we can't fit anything more into this URB
+       * WRITE.
        */
-      if (mrf == 15)
+      if (mrf > max_usable_mrf) {
+        slot++;
         break;
+      }
    }
 
+   current_annotation = "URB write";
    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
    inst->base_mrf = base_mrf;
    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
-   inst->eot = true;
+   inst->eot = (slot >= c->vue_map.num_slots);
 
-   urb_entry_size = mrf - base_mrf;
+   /* Optional second URB write */
+   if (!inst->eot) {
+      mrf = base_mrf + 1;
 
-   for (; attr < VERT_RESULT_MAX; attr++) {
-      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
-        continue;
-      fail("Second URB write not supported.\n");
-      break;
+      for (; slot < c->vue_map.num_slots; ++slot) {
+        assert(mrf < max_usable_mrf);
+
+         emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
+      }
+
+      current_annotation = "URB write";
+      inst = emit(VS_OPCODE_URB_WRITE);
+      inst->base_mrf = base_mrf;
+      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
+      inst->eot = true;
+      /* URB destination offset.  The previous write used MRFs 2-13,
+       * which is 12 data regs after the one header MRF.  The URB
+       * offset is counted in URB rows, and each of our MRFs is half a
+       * row, since we're doing interleaved writes.
+       */
+      inst->offset = (max_usable_mrf - base_mrf) / 2;
    }
+}
 
-   if (intel->gen == 6)
-      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 8) / 8;
-   else
-      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 4) / 4;
+src_reg
+vec4_visitor::get_scratch_offset(vec4_instruction *inst,
+                                src_reg *reladdr, int reg_offset)
+{
+   /* Because we store the values to scratch interleaved like our
+    * vertex data, we need to scale the vec4 index by 2.
+    */
+   int message_header_scale = 2;
+
+   /* Pre-gen6, the message header uses byte offsets instead of vec4
+    * (16-byte) offset units.
+    */
+   if (intel->gen < 6)
+      message_header_scale *= 16;
+
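+   /* For example, vec4 slot 3 becomes index 6 on gen6+ (two rows per
+    * vec4, interleaved) and byte offset 96 (6 * 16) before gen6.
+    */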
+   if (reladdr) {
+      src_reg index = src_reg(this, glsl_type::int_type);
+
+      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
+      emit_before(inst, MUL(dst_reg(index),
+                           index, src_reg(message_header_scale)));
+
+      return index;
+   } else {
+      return src_reg(reg_offset * message_header_scale);
+   }
+}
+
+src_reg
+vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
+                                      src_reg *reladdr, int reg_offset)
+{
+   if (reladdr) {
+      src_reg index = src_reg(this, glsl_type::int_type);
+
+      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
+
+      /* Pre-gen6, the message header uses byte offsets instead of vec4
+       * (16-byte) offset units.
+       */
+      if (intel->gen < 6) {
+        emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
+      }
+
+      return index;
+   } else {
+      int message_header_scale = intel->gen < 6 ? 16 : 1;
+      return src_reg(reg_offset * message_header_scale);
+   }
+}
+
+/**
+ * Emits an instruction before @inst to load the value named by @orig_src
+ * from scratch space at @base_offset to @temp.
+ */
+void
+vec4_visitor::emit_scratch_read(vec4_instruction *inst,
+                               dst_reg temp, src_reg orig_src,
+                               int base_offset)
+{
+   int reg_offset = base_offset + orig_src.reg_offset;
+   src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
+
+   emit_before(inst, SCRATCH_READ(temp, index));
+}
+
+/**
+ * Emits an instruction after @inst to store the value to be written
+ * to @orig_dst to scratch space at @base_offset, from @temp.
+ */
+void
+vec4_visitor::emit_scratch_write(vec4_instruction *inst,
+                                src_reg temp, dst_reg orig_dst,
+                                int base_offset)
+{
+   int reg_offset = base_offset + orig_dst.reg_offset;
+   src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
+
+   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
+                                      orig_dst.writemask));
+   vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
+   write->predicate = inst->predicate;
+   write->ir = inst->ir;
+   write->annotation = inst->annotation;
+   inst->insert_after(write);
+}
+
+/**
+ * We can't generally support array access in GRF space, because a
+ * single instruction's destination can only span 2 contiguous
+ * registers.  So, we send all GRF arrays that get variable index
+ * access to scratch space.
+ */
+void
+vec4_visitor::move_grf_array_access_to_scratch()
+{
+   int scratch_loc[this->virtual_grf_count];
+
+   for (int i = 0; i < this->virtual_grf_count; i++) {
+      scratch_loc[i] = -1;
+   }
+
+   /* First, calculate the set of virtual GRFs that need to be punted
+    * to scratch due to having any array access on them, and where in
+    * scratch.
+    */
+   foreach_list(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
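+      /* Each vec4 presumably takes 8 dwords * 4 bytes of scratch,
+       * since values are stored interleaved for two vertices (see
+       * get_scratch_offset).
+       */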
+      if (inst->dst.file == GRF && inst->dst.reladdr &&
+         scratch_loc[inst->dst.reg] == -1) {
+        scratch_loc[inst->dst.reg] = c->last_scratch;
+        c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
+      }
+
+      for (int i = 0; i < 3; i++) {
+        src_reg *src = &inst->src[i];
+
+        if (src->file == GRF && src->reladdr &&
+            scratch_loc[src->reg] == -1) {
+           scratch_loc[src->reg] = c->last_scratch;
+           c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
+        }
+      }
+   }
+
+   /* Now, for anything that will be accessed through scratch, rewrite
+    * it to load/store.  Note that this is a _safe list walk, because
+    * we may generate a new scratch_write instruction after the one
+    * we're processing.
+    */
+   foreach_list_safe(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
+      /* Set up the annotation tracking for new generated instructions. */
+      base_ir = inst->ir;
+      current_annotation = inst->annotation;
+
+      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
+        src_reg temp = src_reg(this, glsl_type::vec4_type);
+
+        emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
+
+        inst->dst.file = temp.file;
+        inst->dst.reg = temp.reg;
+        inst->dst.reg_offset = temp.reg_offset;
+        inst->dst.reladdr = NULL;
+      }
+
+      for (int i = 0; i < 3; i++) {
+        if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
+           continue;
+
+        dst_reg temp = dst_reg(this, glsl_type::vec4_type);
+
+        emit_scratch_read(inst, temp, inst->src[i],
+                          scratch_loc[inst->src[i].reg]);
+
+        inst->src[i].file = temp.file;
+        inst->src[i].reg = temp.reg;
+        inst->src[i].reg_offset = temp.reg_offset;
+        inst->src[i].reladdr = NULL;
+      }
+   }
+}
+
+/**
+ * Emits an instruction before @inst to load the value named by @orig_src
+ * from the pull constant buffer (surface) at @base_offset to @temp.
+ */
+void
+vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
+                                     dst_reg temp, src_reg orig_src,
+                                     int base_offset)
+{
+   int reg_offset = base_offset + orig_src.reg_offset;
+   src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
+   vec4_instruction *load;
+
+   load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
+                                       temp, index);
+   load->base_mrf = 14;
+   load->mlen = 1;
+   emit_before(inst, load);
+}
+
+/**
+ * Implements array access of uniforms by inserting a
+ * PULL_CONSTANT_LOAD instruction.
+ *
+ * Unlike temporary GRF array access (where we don't support it due to
+ * the difficulty of doing relative addressing on instruction
+ * destinations), we could potentially do array access of uniforms
+ * that were loaded in GRF space as push constants.  In real-world
+ * usage we've seen, though, the arrays being used are always larger
+ * than we could load as push constants, so just always move all
+ * uniform array access out to a pull constant buffer.
+ */
+void
+vec4_visitor::move_uniform_array_access_to_pull_constants()
+{
+   int pull_constant_loc[this->uniforms];
+
+   for (int i = 0; i < this->uniforms; i++) {
+      pull_constant_loc[i] = -1;
+   }
+
+   /* Walk through and find array access of uniforms.  Put a copy of that
+    * uniform in the pull constant buffer.
+    *
+    * Note that we don't move constant-indexed accesses to arrays.  No
+    * testing has been done of the performance impact of this choice.
+    */
+   foreach_list_safe(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
+      for (int i = 0; i < 3; i++) {
+        if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
+           continue;
+
+        int uniform = inst->src[i].reg;
+
+        /* If this array isn't already present in the pull constant buffer,
+         * add it.
+         */
+        if (pull_constant_loc[uniform] == -1) {
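+           /* pull_param is tracked per float, but the constant buffer
+            * is addressed in vec4 units, hence the divide by 4 below.
+            */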
+           const float **values = &prog_data->param[uniform * 4];
+
+           pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
+
+           for (int j = 0; j < uniform_size[uniform] * 4; j++) {
+              prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
+           }
+        }
+
+        /* Set up the annotation tracking for new generated instructions. */
+        base_ir = inst->ir;
+        current_annotation = inst->annotation;
+
+        dst_reg temp = dst_reg(this, glsl_type::vec4_type);
+
+        emit_pull_constant_load(inst, temp, inst->src[i],
+                                pull_constant_loc[uniform]);
+
+        inst->src[i].file = temp.file;
+        inst->src[i].reg = temp.reg;
+        inst->src[i].reg_offset = temp.reg_offset;
+        inst->src[i].reladdr = NULL;
+      }
+   }
+
+   /* Now there are no accesses of the UNIFORM file with a reladdr, so
+    * no need to track them as larger-than-vec4 objects.  This will be
+    * relied on in cutting out unused uniform vectors from push
+    * constants.
+    */
+   split_uniform_registers();
+}
+
+void
+vec4_visitor::resolve_ud_negate(src_reg *reg)
+{
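+   /* The negate source modifier presumably can't be used directly on
+    * UD operands here (an assumption); the MOV below is what actually
+    * applies the negation, and the caller's reference is rewritten to
+    * the unmodified temporary.
+    */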
+   if (reg->type != BRW_REGISTER_TYPE_UD ||
+       !reg->negate)
+      return;
+
+   src_reg temp = src_reg(this, glsl_type::uvec4_type);
+   emit(MOV(dst_reg(temp), *reg));
+   *reg = temp;
 }
 
 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
@@ -1773,26 +2597,31 @@ vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
    this->current_annotation = NULL;
 
    this->c = c;
-   this->vp = brw->vertex_program; /* FINISHME: change for precompile */
+   this->vp = (struct gl_vertex_program *)
+     prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
    this->prog_data = &c->prog_data;
 
    this->variable_ht = hash_table_ctor(0,
                                       hash_table_pointer_hash,
                                       hash_table_pointer_compare);
 
+   this->virtual_grf_def = NULL;
+   this->virtual_grf_use = NULL;
    this->virtual_grf_sizes = NULL;
    this->virtual_grf_count = 0;
+   this->virtual_grf_reg_map = NULL;
+   this->virtual_grf_reg_count = 0;
    this->virtual_grf_array_size = 0;
+   this->live_intervals_valid = false;
 
-   this->uniforms = 0;
+   this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
 
-   this->variable_ht = hash_table_ctor(0,
-                                      hash_table_pointer_hash,
-                                      hash_table_pointer_compare);
+   this->uniforms = 0;
 }
 
 vec4_visitor::~vec4_visitor()
 {
+   ralloc_free(this->mem_ctx);
    hash_table_dtor(this->variable_ht);
 }