diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index f9447d7c391..f9a08a011f2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -25,6 +25,7 @@
 extern "C" {
 #include "main/macros.h"
 #include "program/prog_parameter.h"
+#include "program/sampler.h"
 }
 
 namespace brw {
@@ -38,6 +39,7 @@ src_reg::src_reg(dst_reg reg)
    this->reg_offset = reg.reg_offset;
    this->type = reg.type;
    this->reladdr = reg.reladdr;
+   this->fixed_hw_reg = reg.fixed_hw_reg;
 
    int swizzles[4];
    int next_chan = 0;
@@ -68,45 +70,182 @@ dst_reg::dst_reg(src_reg reg)
    this->type = reg.type;
    this->writemask = WRITEMASK_XYZW;
    this->reladdr = reg.reladdr;
+   this->fixed_hw_reg = reg.fixed_hw_reg;
+}
+
+vec4_instruction::vec4_instruction(vec4_visitor *v,
+                                   enum opcode opcode, dst_reg dst,
+                                   src_reg src0, src_reg src1, src_reg src2)
+{
+   this->opcode = opcode;
+   this->dst = dst;
+   this->src[0] = src0;
+   this->src[1] = src1;
+   this->src[2] = src2;
+   this->ir = v->base_ir;
+   this->annotation = v->current_annotation;
 }
 
 vec4_instruction *
-vec4_visitor::emit(enum opcode opcode, dst_reg dst,
-                   src_reg src0, src_reg src1, src_reg src2)
+vec4_visitor::emit(vec4_instruction *inst)
 {
-   vec4_instruction *inst = new(mem_ctx) vec4_instruction();
+   this->instructions.push_tail(inst);
 
-   inst->opcode = opcode;
-   inst->dst = dst;
-   inst->src[0] = src0;
-   inst->src[1] = src1;
-   inst->src[2] = src2;
-   inst->ir = this->base_ir;
-   inst->annotation = this->current_annotation;
+   return inst;
+}
 
-   this->instructions.push_tail(inst);
+vec4_instruction *
+vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
+{
+   new_inst->ir = inst->ir;
+   new_inst->annotation = inst->annotation;
+
+   inst->insert_before(new_inst);
 
    return inst;
 }
 
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, dst_reg dst,
+                   src_reg src0, src_reg src1, src_reg src2)
+{
+   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
+                                             src0, src1, src2));
+}
+
 vec4_instruction *
 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
 {
-   return emit(opcode, dst, src0, src1, src_reg());
+   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
 }
 
 vec4_instruction *
 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
 {
-   assert(dst.writemask != 0);
-   return emit(opcode, dst, src0, src_reg(), src_reg());
+   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
 }
 
 vec4_instruction *
 vec4_visitor::emit(enum opcode opcode)
 {
-   return emit(opcode, dst_reg(), src_reg(), src_reg(), src_reg());
+   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
+}
+
+#define ALU1(op)							\
+   vec4_instruction *							\
+   vec4_visitor::op(dst_reg dst, src_reg src0)				\
+   {									\
+      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
+					   src0);			\
+   }
+
+#define ALU2(op)							\
+   vec4_instruction *							\
+   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)		\
+   {									\
+      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
+					   src0, src1);			\
+   }
+
+ALU1(NOT)
+ALU1(MOV)
+ALU1(FRC)
+ALU1(RNDD)
+ALU1(RNDE)
+ALU1(RNDZ)
+ALU2(ADD)
+ALU2(MUL)
+ALU2(MACH)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(DP3)
+ALU2(DP4)
+
+/** Gen4 predicated IF. */
+vec4_instruction *
+vec4_visitor::IF(uint32_t predicate)
+{
+   vec4_instruction *inst;
+
+   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
+   inst->predicate = predicate;
+
+   return inst;
+}
+
+/** Gen6+ IF with embedded comparison. */
+vec4_instruction *
+vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
+{
+   assert(intel->gen >= 6);
+
+   vec4_instruction *inst;
+
+   resolve_ud_negate(&src0);
+   resolve_ud_negate(&src1);
+
+   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
+                                        src0, src1);
+   inst->conditional_mod = condition;
+
+   return inst;
+}
+
+/**
+ * CMP: Sets the low bit of the destination channels with the result
+ * of the comparison, while the upper bits are undefined, and updates
+ * the flag register with the packed 16 bits of the result.
+ */
+vec4_instruction *
+vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
+{
+   vec4_instruction *inst;
+
+   /* original gen4 does type conversion to the destination type
+    * before comparison, producing garbage results for floating
+    * point comparisons.
+    */
+   if (intel->gen == 4) {
+      dst.type = src0.type;
+      if (dst.file == HW_REG)
+         dst.fixed_hw_reg.type = dst.type;
+   }
+
+   resolve_ud_negate(&src0);
+   resolve_ud_negate(&src1);
+
+   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
+   inst->conditional_mod = condition;
+
+   return inst;
+}
+
+vec4_instruction *
+vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
+{
+   vec4_instruction *inst;
+
+   inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
+                                        dst, index);
+   inst->base_mrf = 14;
+   inst->mlen = 1;
+
+   return inst;
+}
+
+vec4_instruction *
+vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
+{
+   vec4_instruction *inst;
+
+   inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
+                                        dst, src, index);
+   inst->base_mrf = 13;
+   inst->mlen = 2;
+
+   return inst;
 }
 
 void
@@ -125,9 +264,15 @@ vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
    /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
+    *
+    * While it would seem that this MOV could be avoided at this point
+    * in the case that the swizzle is matched up with the destination
+    * writemask, note that uniform packing and register allocation
+    * could rearrange our swizzle, so let's leave this matter up to
+    * copy propagation later.
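+    *
+    * (Illustration, not from the original patch: the pair emitted here,
+    *
+    *    MOV tmp.xyzw, src.<swizzle>
+    *    math dst, tmp
+    *
+    * is exactly the pattern a later copy-propagation pass can collapse
+    * back into a single instruction once the swizzle proves safe.)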
*/ src_reg temp_src = src_reg(this, glsl_type::vec4_type); - emit(BRW_OPCODE_MOV, dst_reg(temp_src), src); + emit(MOV(dst_reg(temp_src), src)); if (dst.writemask != WRITEMASK_XYZW) { /* The gen6 math instruction must be align1, so we can't do @@ -137,7 +282,7 @@ vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src) emit(opcode, temp_dst, temp_src); - emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst)); + emit(MOV(dst, src_reg(temp_dst))); } else { emit(opcode, dst, temp_src); } @@ -168,7 +313,9 @@ vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src) return; } - if (intel->gen >= 6) { + if (intel->gen >= 7) { + emit(opcode, dst, src); + } else if (intel->gen == 6) { return emit_math1_gen6(opcode, dst, src); } else { return emit_math1_gen4(opcode, dst, src); @@ -188,11 +335,13 @@ vec4_visitor::emit_math2_gen6(enum opcode opcode, */ expanded = src_reg(this, glsl_type::vec4_type); - emit(BRW_OPCODE_MOV, dst_reg(expanded), src0); + expanded.type = src0.type; + emit(MOV(dst_reg(expanded), src0)); src0 = expanded; expanded = src_reg(this, glsl_type::vec4_type); - emit(BRW_OPCODE_MOV, dst_reg(expanded), src1); + expanded.type = src1.type; + emit(MOV(dst_reg(expanded), src1)); src1 = expanded; if (dst.writemask != WRITEMASK_XYZW) { @@ -200,10 +349,11 @@ vec4_visitor::emit_math2_gen6(enum opcode opcode, * writemasks. */ dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type); + temp_dst.type = dst.type; emit(opcode, temp_dst, src0, src1); - emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst)); + emit(MOV(dst, src_reg(temp_dst))); } else { emit(opcode, dst, src0, src1); } @@ -222,9 +372,19 @@ void vec4_visitor::emit_math(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1) { - assert(opcode == SHADER_OPCODE_POW); + switch (opcode) { + case SHADER_OPCODE_POW: + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + break; + default: + assert(!"not reached: unsupported binary math opcode"); + return; + } - if (intel->gen >= 6) { + if (intel->gen >= 7) { + emit(opcode, dst, src0, src1); + } else if (intel->gen == 6) { return emit_math2_gen6(opcode, dst, src0, src1); } else { return emit_math2_gen4(opcode, dst, src0, src1); @@ -234,8 +394,8 @@ vec4_visitor::emit_math(enum opcode opcode, void vec4_visitor::visit_instructions(const exec_list *list) { - foreach_iter(exec_list_iterator, iter, *list) { - ir_instruction *ir = (ir_instruction *)iter.get(); + foreach_list(node, list) { + ir_instruction *ir = (ir_instruction *)node; base_ir = ir; ir->accept(this); @@ -294,7 +454,11 @@ vec4_visitor::virtual_grf_alloc(int size) virtual_grf_array_size *= 2; virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, virtual_grf_array_size); + virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int, + virtual_grf_array_size); } + virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count; + virtual_grf_reg_count += size; virtual_grf_sizes[virtual_grf_count] = size; return virtual_grf_count++; } @@ -343,9 +507,7 @@ vec4_visitor::setup_uniform_values(int loc, const glsl_type *type) float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f; if (type->is_matrix()) { - const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, - type->vector_elements, - 1); + const glsl_type *column = type->column_type(); for (unsigned int i = 0; i < type->matrix_columns; i++) { offset += setup_uniform_values(loc + offset, column); @@ -360,35 +522,20 @@ vec4_visitor::setup_uniform_values(int loc, const glsl_type *type) case GLSL_TYPE_INT: case GLSL_TYPE_BOOL: for 
(unsigned int i = 0; i < type->vector_elements; i++) { - int slot = this->uniforms * 4 + i; - switch (type->base_type) { - case GLSL_TYPE_FLOAT: - c->prog_data.param_convert[slot] = PARAM_NO_CONVERT; - break; - case GLSL_TYPE_UINT: - c->prog_data.param_convert[slot] = PARAM_CONVERT_F2U; - break; - case GLSL_TYPE_INT: - c->prog_data.param_convert[slot] = PARAM_CONVERT_F2I; - break; - case GLSL_TYPE_BOOL: - c->prog_data.param_convert[slot] = PARAM_CONVERT_F2B; - break; - default: - assert(!"not reached"); - c->prog_data.param_convert[slot] = PARAM_NO_CONVERT; - break; - } - c->prog_data.param[slot] = &values[i]; + c->prog_data.param[this->uniforms * 4 + i] = &values[i]; } + /* Set up pad elements to get things aligned to a vec4 boundary. */ for (unsigned int i = type->vector_elements; i < 4; i++) { - c->prog_data.param_convert[this->uniforms * 4 + i] = - PARAM_CONVERT_ZERO; - c->prog_data.param[this->uniforms * 4 + i] = NULL; + static float zero = 0; + + c->prog_data.param[this->uniforms * 4 + i] = &zero; } - this->uniform_size[this->uniforms] = type->vector_elements; + /* Track the size of this uniform vector, for future packing of + * uniforms. + */ + this->uniform_vector_size[this->uniforms] = type->vector_elements; this->uniforms++; return 1; @@ -416,6 +563,37 @@ vec4_visitor::setup_uniform_values(int loc, const glsl_type *type) } } +void +vec4_visitor::setup_uniform_clipplane_values() +{ + gl_clip_plane *clip_planes = brw_select_clip_planes(ctx); + + /* Pre-Gen6, we compact clip planes. For example, if the user + * enables just clip planes 0, 1, and 3, we will enable clip planes + * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip + * plane 2. This simplifies the implementation of the Gen6 clip + * thread. + * + * In Gen6 and later, we don't compact clip planes, because this + * simplifies the implementation of gl_ClipDistance. + */ + int compacted_clipplane_index = 0; + for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) { + if (intel->gen < 6 && + !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) { + continue; + } + this->uniform_vector_size[this->uniforms] = 4; + this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms); + this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F; + for (int j = 0; j < 4; ++j) { + c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j]; + } + ++compacted_clipplane_index; + ++this->uniforms; + } +} + /* Our support for builtin uniforms is even scarier than non-builtin. * It sits on top of the PROG_STATE_VAR parameters that are * automatically updated from GL context state. @@ -436,20 +614,18 @@ vec4_visitor::setup_builtin_uniform_values(ir_variable *ir) (gl_state_index *)slots[i].tokens); float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f; - this->uniform_size[this->uniforms] = 0; + this->uniform_vector_size[this->uniforms] = 0; /* Add each of the unique swizzled channels of the element. * This will end up matching the size of the glsl_type of this field. 
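+       * (Reference note, not part of the original patch: GET_SWZ(swz, j)
+       * extracts the 3-bit channel selector for component j, so a scalar
+       * state value broadcast with an XXXX swizzle yields swiz == 0 for
+       * every j.)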
*/ int last_swiz = -1; for (unsigned int j = 0; j < 4; j++) { int swiz = GET_SWZ(slots[i].swizzle, j); - if (swiz == last_swiz) - break; last_swiz = swiz; c->prog_data.param[this->uniforms * 4 + j] = &values[swiz]; - c->prog_data.param_convert[this->uniforms * 4 + j] = PARAM_NO_CONVERT; - this->uniform_size[this->uniforms]++; + if (swiz <= last_swiz) + this->uniform_vector_size[this->uniforms]++; } this->uniforms++; } @@ -462,59 +638,76 @@ vec4_visitor::variable_storage(ir_variable *var) } void -vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir) +vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate) { ir_expression *expr = ir->as_expression(); + *predicate = BRW_PREDICATE_NORMAL; + if (expr) { src_reg op[2]; vec4_instruction *inst; assert(expr->get_num_operands() <= 2); for (unsigned int i = 0; i < expr->get_num_operands(); i++) { - assert(expr->operands[i]->type->is_scalar()); - expr->operands[i]->accept(this); op[i] = this->result; + + resolve_ud_negate(&op[i]); } switch (expr->operation) { case ir_unop_logic_not: - inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], src_reg(1)); + inst = emit(AND(dst_null_d(), op[0], src_reg(1))); inst->conditional_mod = BRW_CONDITIONAL_Z; break; case ir_binop_logic_xor: - inst = emit(BRW_OPCODE_XOR, dst_null_d(), op[0], op[1]); + inst = emit(XOR(dst_null_d(), op[0], op[1])); inst->conditional_mod = BRW_CONDITIONAL_NZ; break; case ir_binop_logic_or: - inst = emit(BRW_OPCODE_OR, dst_null_d(), op[0], op[1]); + inst = emit(OR(dst_null_d(), op[0], op[1])); inst->conditional_mod = BRW_CONDITIONAL_NZ; break; case ir_binop_logic_and: - inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], op[1]); + inst = emit(AND(dst_null_d(), op[0], op[1])); inst->conditional_mod = BRW_CONDITIONAL_NZ; break; case ir_unop_f2b: if (intel->gen >= 6) { - inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0.0f)); + emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ)); } else { - inst = emit(BRW_OPCODE_MOV, dst_null_f(), op[0]); + inst = emit(MOV(dst_null_f(), op[0])); + inst->conditional_mod = BRW_CONDITIONAL_NZ; } - inst->conditional_mod = BRW_CONDITIONAL_NZ; break; case ir_unop_i2b: if (intel->gen >= 6) { - inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0)); + emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); } else { - inst = emit(BRW_OPCODE_MOV, dst_null_d(), op[0]); + inst = emit(MOV(dst_null_d(), op[0])); + inst->conditional_mod = BRW_CONDITIONAL_NZ; } - inst->conditional_mod = BRW_CONDITIONAL_NZ; + break; + + case ir_binop_all_equal: + inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z)); + *predicate = BRW_PREDICATE_ALIGN16_ALL4H; + break; + + case ir_binop_any_nequal: + inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ)); + *predicate = BRW_PREDICATE_ALIGN16_ANY4H; + break; + + case ir_unop_any: + inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); + *predicate = BRW_PREDICATE_ALIGN16_ANY4H; break; case ir_binop_greater: @@ -522,12 +715,9 @@ vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir) case ir_binop_less: case ir_binop_lequal: case ir_binop_equal: - case ir_binop_all_equal: case ir_binop_nequal: - case ir_binop_any_nequal: - inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]); - inst->conditional_mod = - brw_conditional_for_comparison(expr->operation); + emit(CMP(dst_null_d(), op[0], op[1], + brw_conditional_for_comparison(expr->operation))); break; default: @@ -539,12 +729,14 @@ vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir) ir->accept(this); + 
resolve_ud_negate(&this->result); + if (intel->gen >= 6) { - vec4_instruction *inst = emit(BRW_OPCODE_AND, dst_null_d(), - this->result, src_reg(1)); + vec4_instruction *inst = emit(AND(dst_null_d(), + this->result, src_reg(1))); inst->conditional_mod = BRW_CONDITIONAL_NZ; } else { - vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst_null_d(), this->result); + vec4_instruction *inst = emit(MOV(dst_null_d(), this->result)); inst->conditional_mod = BRW_CONDITIONAL_NZ; } } @@ -560,52 +752,41 @@ vec4_visitor::emit_if_gen6(ir_if *ir) if (expr) { src_reg op[2]; - vec4_instruction *inst; dst_reg temp; assert(expr->get_num_operands() <= 2); for (unsigned int i = 0; i < expr->get_num_operands(); i++) { - assert(expr->operands[i]->type->is_scalar() || - expr->operation == ir_binop_any_nequal || - expr->operation == ir_binop_all_equal); - expr->operands[i]->accept(this); op[i] = this->result; } switch (expr->operation) { case ir_unop_logic_not: - inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0)); - inst->conditional_mod = BRW_CONDITIONAL_Z; + emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z)); return; case ir_binop_logic_xor: - inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ)); return; case ir_binop_logic_or: temp = dst_reg(this, glsl_type::bool_type); - emit(BRW_OPCODE_OR, temp, op[0], op[1]); - inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + emit(OR(temp, op[0], op[1])); + emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ)); return; case ir_binop_logic_and: temp = dst_reg(this, glsl_type::bool_type); - emit(BRW_OPCODE_AND, temp, op[0], op[1]); - inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + emit(AND(temp, op[0], op[1])); + emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ)); return; case ir_unop_f2b: - inst = emit(BRW_OPCODE_IF, dst_null_f(), op[0], src_reg(0)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ)); return; case ir_unop_i2b: - inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ)); return; case ir_binop_greater: @@ -614,31 +795,28 @@ vec4_visitor::emit_if_gen6(ir_if *ir) case ir_binop_lequal: case ir_binop_equal: case ir_binop_nequal: - inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]); - inst->conditional_mod = - brw_conditional_for_comparison(expr->operation); + emit(IF(op[0], op[1], + brw_conditional_for_comparison(expr->operation))); return; case ir_binop_all_equal: - inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]); - inst->conditional_mod = BRW_CONDITIONAL_Z; - - inst = emit(BRW_OPCODE_IF); - inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H; + emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z)); + emit(IF(BRW_PREDICATE_ALIGN16_ALL4H)); return; case ir_binop_any_nequal: - inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ)); + emit(IF(BRW_PREDICATE_ALIGN16_ANY4H)); + return; - inst = emit(BRW_OPCODE_IF); - inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; + case ir_unop_any: + emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); + emit(IF(BRW_PREDICATE_ALIGN16_ANY4H)); return; default: assert(!"not reached"); - inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], 
src_reg(0)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ)); return; } return; @@ -646,9 +824,7 @@ vec4_visitor::emit_if_gen6(ir_if *ir) ir->condition->accept(this); - vec4_instruction *inst = emit(BRW_OPCODE_IF, dst_null_d(), - this->result, src_reg(0)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ)); } void @@ -662,6 +838,19 @@ vec4_visitor::visit(ir_variable *ir) switch (ir->mode) { case ir_var_in: reg = new(mem_ctx) dst_reg(ATTR, ir->location); + + /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes + * come in as floating point conversions of the integer values. + */ + for (int i = ir->location; i < ir->location + type_size(ir->type); i++) { + if (!c->key.gl_fixed_input_size[i]) + continue; + + dst_reg dst = *reg; + dst.type = brw_type_for_base_type(ir->type); + dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1; + emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f))); + } break; case ir_var_out: @@ -670,7 +859,9 @@ vec4_visitor::visit(ir_variable *ir) for (int i = 0; i < type_size(ir->type); i++) { output_reg[ir->location + i] = *reg; output_reg[ir->location + i].reg_offset = i; - output_reg[ir->location + i].type = BRW_REGISTER_TYPE_F; + output_reg[ir->location + i].type = + brw_type_for_base_type(ir->type->get_scalar_type()); + output_reg_annotation[ir->location + i] = ir->name; } break; @@ -682,6 +873,11 @@ vec4_visitor::visit(ir_variable *ir) case ir_var_uniform: reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms); + /* Track how big the whole uniform variable is, in case we need to put a + * copy of its data into pull constants for array access. + */ + this->uniform_size[this->uniforms] = type_size(ir->type); + if (!strncmp(ir->name, "gl_", 3)) { setup_builtin_uniform_values(ir); } else { @@ -689,6 +885,27 @@ vec4_visitor::visit(ir_variable *ir) } break; + case ir_var_system_value: + /* VertexID is stored by the VF as the last vertex element, but + * we don't represent it with a flag in inputs_read, so we call + * it VERT_ATTRIB_MAX, which setup_attributes() picks up on. + */ + reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX); + prog_data->uses_vertexid = true; + + switch (ir->location) { + case SYSTEM_VALUE_VERTEX_ID: + reg->writemask = WRITEMASK_X; + break; + case SYSTEM_VALUE_INSTANCE_ID: + reg->writemask = WRITEMASK_Y; + break; + default: + assert(!"not reached"); + break; + } + break; + default: assert(!"not reached"); } @@ -700,58 +917,46 @@ vec4_visitor::visit(ir_variable *ir) void vec4_visitor::visit(ir_loop *ir) { - ir_dereference_variable *counter = NULL; - - fail("not yet\n"); + dst_reg counter; /* We don't want debugging output to print the whole body of the * loop as the annotation. 
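+    * (Note, not part of the original patch: base_ir feeds the
+    * per-instruction annotations, which is why the code below re-points
+    * it at ir->counter, ir->from, ir->to, and ir->increment as each
+    * sub-expression is visited.)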
*/ this->base_ir = NULL; - if (ir->counter != NULL) - counter = new(ir) ir_dereference_variable(ir->counter); - - if (ir->from != NULL) { - assert(ir->counter != NULL); + if (ir->counter != NULL) { + this->base_ir = ir->counter; + ir->counter->accept(this); + counter = *(variable_storage(ir->counter)); - ir_assignment *a = new(ir) ir_assignment(counter, ir->from, NULL); + if (ir->from != NULL) { + this->base_ir = ir->from; + ir->from->accept(this); - a->accept(this); - delete a; + emit(MOV(counter, this->result)); + } } emit(BRW_OPCODE_DO); if (ir->to) { - ir_expression *e = - new(ir) ir_expression(ir->cmp, glsl_type::bool_type, - counter, ir->to); - ir_if *if_stmt = new(ir) ir_if(e); + this->base_ir = ir->to; + ir->to->accept(this); - ir_loop_jump *brk = new(ir) ir_loop_jump(ir_loop_jump::jump_break); + emit(CMP(dst_null_d(), src_reg(counter), this->result, + brw_conditional_for_comparison(ir->cmp))); - if_stmt->then_instructions.push_tail(brk); - - if_stmt->accept(this); - - delete if_stmt; - delete e; - delete brk; + vec4_instruction *inst = emit(BRW_OPCODE_BREAK); + inst->predicate = BRW_PREDICATE_NORMAL; } visit_instructions(&ir->body_instructions); - if (ir->increment) { - ir_expression *e = - new(ir) ir_expression(ir_binop_add, counter->type, - counter, ir->increment); - ir_assignment *a = new(ir) ir_assignment(counter, e, NULL); - - a->accept(this); - delete a; - delete e; + if (ir->increment) { + this->base_ir = ir->increment; + ir->increment->accept(this); + emit(ADD(counter, src_reg(counter), this->result)); } emit(BRW_OPCODE_WHILE); @@ -796,7 +1001,7 @@ vec4_visitor::visit(ir_function *ir) } } -GLboolean +bool vec4_visitor::try_emit_sat(ir_expression *ir) { ir_rvalue *sat_src = ir->as_rvalue_to_saturate(); @@ -808,7 +1013,7 @@ vec4_visitor::try_emit_sat(ir_expression *ir) this->result = src_reg(this, ir->type); vec4_instruction *inst; - inst = emit(BRW_OPCODE_MOV, dst_reg(this->result), src); + inst = emit(MOV(dst_reg(this->result), src)); inst->saturate = true; return true; @@ -822,11 +1027,10 @@ vec4_visitor::emit_bool_comparison(unsigned int op, if (intel->gen < 5) dst.type = src0.type; - vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst, src0, src1); - inst->conditional_mod = brw_conditional_for_comparison(op); + emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op))); dst.type = BRW_REGISTER_TYPE_D; - emit(BRW_OPCODE_AND, dst, src_reg(dst), src_reg(0x1)); + emit(AND(dst, src_reg(dst), src_reg(0x1))); } void @@ -884,7 +1088,7 @@ vec4_visitor::visit(ir_expression *ir) /* Note that BRW_OPCODE_NOT is not appropriate here, since it is * ones complement of the whole register, not just bit 0. 
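+       * (Worked example, not part of the original patch: for a canonical
+       * boolean 1, XOR with 1 yields 0, while NOT would yield 0xfffffffe,
+       * which is nonzero and therefore still "true" under an NZ flag
+       * test.)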
*/ - emit(BRW_OPCODE_XOR, result_dst, op[0], src_reg(1)); + emit(XOR(result_dst, op[0], src_reg(1))); break; case ir_unop_neg: op[0].negate = !op[0].negate; @@ -897,16 +1101,14 @@ vec4_visitor::visit(ir_expression *ir) break; case ir_unop_sign: - emit(BRW_OPCODE_MOV, result_dst, src_reg(0.0f)); + emit(MOV(result_dst, src_reg(0.0f))); - inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f)); - inst->conditional_mod = BRW_CONDITIONAL_G; - inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1.0f)); + emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G)); + inst = emit(MOV(result_dst, src_reg(1.0f))); inst->predicate = BRW_PREDICATE_NORMAL; - inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f)); - inst->conditional_mod = BRW_CONDITIONAL_L; - inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(-1.0f)); + emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L)); + inst = emit(MOV(result_dst, src_reg(-1.0f))); inst->predicate = BRW_PREDICATE_NORMAL; break; @@ -944,19 +1146,40 @@ vec4_visitor::visit(ir_expression *ir) break; case ir_binop_add: - emit(BRW_OPCODE_ADD, result_dst, op[0], op[1]); + emit(ADD(result_dst, op[0], op[1])); break; case ir_binop_sub: assert(!"not reached: should be handled by ir_sub_to_add_neg"); break; case ir_binop_mul: - emit(BRW_OPCODE_MUL, result_dst, op[0], op[1]); + if (ir->type->is_integer()) { + /* For integer multiplication, the MUL uses the low 16 bits + * of one of the operands (src0 on gen6, src1 on gen7). The + * MACH accumulates in the contribution of the upper 16 bits + * of that operand. + * + * FINISHME: Emit just the MUL if we know an operand is small + * enough. + */ + struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D); + + emit(MUL(acc, op[0], op[1])); + emit(MACH(dst_null_d(), op[0], op[1])); + emit(MOV(result_dst, src_reg(acc))); + } else { + emit(MUL(result_dst, op[0], op[1])); + } break; case ir_binop_div: - assert(!"not reached: should be handled by ir_div_to_mul_rcp"); + /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */ + assert(ir->type->is_integer()); + emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]); + break; case ir_binop_mod: - assert(!"ir_binop_mod should have been converted to b * fract(a/b)"); + /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */ + assert(ir->type->is_integer()); + emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]); break; case ir_binop_less: @@ -965,14 +1188,9 @@ vec4_visitor::visit(ir_expression *ir) case ir_binop_gequal: case ir_binop_equal: case ir_binop_nequal: { - dst_reg temp = result_dst; - /* original gen4 does implicit conversion before comparison. */ - if (intel->gen < 5) - temp.type = op[0].type; - - inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]); - inst->conditional_mod = brw_conditional_for_comparison(ir->operation); - emit(BRW_OPCODE_AND, result_dst, this->result, src_reg(0x1)); + emit(CMP(result_dst, op[0], op[1], + brw_conditional_for_comparison(ir->operation))); + emit(AND(result_dst, result_src, src_reg(0x1))); break; } @@ -980,65 +1198,48 @@ vec4_visitor::visit(ir_expression *ir) /* "==" operator producing a scalar boolean. 
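+       * (Note, not part of the original patch: the ALIGN16_ALL4H
+       * predicate used below fires only when the preceding CMP set the
+       * flag in all four channels, collapsing the vector comparison to
+       * one boolean.)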
*/ if (ir->operands[0]->type->is_vector() || ir->operands[1]->type->is_vector()) { - inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]); - inst->conditional_mod = BRW_CONDITIONAL_Z; - - emit(BRW_OPCODE_MOV, result_dst, src_reg(0)); - inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1)); + emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z)); + emit(MOV(result_dst, src_reg(0))); + inst = emit(MOV(result_dst, src_reg(1))); inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H; } else { - dst_reg temp = result_dst; - /* original gen4 does implicit conversion before comparison. */ - if (intel->gen < 5) - temp.type = op[0].type; - - inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1)); + emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z)); + emit(AND(result_dst, result_src, src_reg(0x1))); } break; case ir_binop_any_nequal: /* "!=" operator producing a scalar boolean. */ if (ir->operands[0]->type->is_vector() || ir->operands[1]->type->is_vector()) { - inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ)); - emit(BRW_OPCODE_MOV, result_dst, src_reg(0)); - inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1)); + emit(MOV(result_dst, src_reg(0))); + inst = emit(MOV(result_dst, src_reg(1))); inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; } else { - dst_reg temp = result_dst; - /* original gen4 does implicit conversion before comparison. */ - if (intel->gen < 5) - temp.type = op[0].type; - - inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1)); + emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ)); + emit(AND(result_dst, result_src, src_reg(0x1))); } break; case ir_unop_any: - inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); + emit(MOV(result_dst, src_reg(0))); - emit(BRW_OPCODE_MOV, result_dst, src_reg(0)); - - inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1)); + inst = emit(MOV(result_dst, src_reg(1))); inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; break; case ir_binop_logic_xor: - emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]); + emit(XOR(result_dst, op[0], op[1])); break; case ir_binop_logic_or: - emit(BRW_OPCODE_OR, result_dst, op[0], op[1]); + emit(OR(result_dst, op[0], op[1])); break; case ir_binop_logic_and: - emit(BRW_OPCODE_AND, result_dst, op[0], op[1]); + emit(AND(result_dst, op[0], op[1])); break; case ir_binop_dot: @@ -1060,52 +1261,54 @@ vec4_visitor::visit(ir_expression *ir) case ir_unop_b2f: case ir_unop_b2i: case ir_unop_f2i: - emit(BRW_OPCODE_MOV, result_dst, op[0]); + emit(MOV(result_dst, op[0])); break; case ir_unop_f2b: case ir_unop_i2b: { - dst_reg temp = result_dst; - /* original gen4 does implicit conversion before comparison. 
*/ - if (intel->gen < 5) - temp.type = op[0].type; - - inst = emit(BRW_OPCODE_CMP, temp, op[0], src_reg(0.0f)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - inst = emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(1)); + emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ)); + emit(AND(result_dst, result_src, src_reg(1))); break; } case ir_unop_trunc: - emit(BRW_OPCODE_RNDZ, result_dst, op[0]); + emit(RNDZ(result_dst, op[0])); break; case ir_unop_ceil: op[0].negate = !op[0].negate; - inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]); + inst = emit(RNDD(result_dst, op[0])); this->result.negate = true; break; case ir_unop_floor: - inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]); + inst = emit(RNDD(result_dst, op[0])); break; case ir_unop_fract: - inst = emit(BRW_OPCODE_FRC, result_dst, op[0]); + inst = emit(FRC(result_dst, op[0])); break; case ir_unop_round_even: - emit(BRW_OPCODE_RNDE, result_dst, op[0]); + emit(RNDE(result_dst, op[0])); break; case ir_binop_min: - inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]); - inst->conditional_mod = BRW_CONDITIONAL_L; + if (intel->gen >= 6) { + inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]); + inst->conditional_mod = BRW_CONDITIONAL_L; + } else { + emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L)); - inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]); - inst->predicate = BRW_PREDICATE_NORMAL; + inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]); + inst->predicate = BRW_PREDICATE_NORMAL; + } break; case ir_binop_max: - inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]); - inst->conditional_mod = BRW_CONDITIONAL_G; + if (intel->gen >= 6) { + inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]); + inst->conditional_mod = BRW_CONDITIONAL_G; + } else { + emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G)); - inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]); - inst->predicate = BRW_PREDICATE_NORMAL; + inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]); + inst->predicate = BRW_PREDICATE_NORMAL; + } break; case ir_binop_pow: @@ -1113,21 +1316,27 @@ vec4_visitor::visit(ir_expression *ir) break; case ir_unop_bit_not: - inst = emit(BRW_OPCODE_NOT, result_dst, op[0]); + inst = emit(NOT(result_dst, op[0])); break; case ir_binop_bit_and: - inst = emit(BRW_OPCODE_AND, result_dst, op[0], op[1]); + inst = emit(AND(result_dst, op[0], op[1])); break; case ir_binop_bit_xor: - inst = emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]); + inst = emit(XOR(result_dst, op[0], op[1])); break; case ir_binop_bit_or: - inst = emit(BRW_OPCODE_OR, result_dst, op[0], op[1]); + inst = emit(OR(result_dst, op[0], op[1])); break; case ir_binop_lshift: + inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]); + break; + case ir_binop_rshift: - assert(!"GLSL 1.30 features unsupported"); + if (ir->type->base_type == GLSL_TYPE_INT) + inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]); + else + inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]); break; case ir_quadop_vector: @@ -1225,14 +1434,13 @@ vec4_visitor::visit(ir_dereference_array *ir) } else { index_reg = src_reg(this, glsl_type::int_type); - emit(BRW_OPCODE_MUL, dst_reg(index_reg), - this->result, src_reg(element_size)); + emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size))); } if (src.reladdr) { src_reg temp = src_reg(this, glsl_type::int_type); - emit(BRW_OPCODE_ADD, dst_reg(temp), *src.reladdr, index_reg); + emit(ADD(dst_reg(temp), *src.reladdr, index_reg)); index_reg = temp; } @@ -1303,18 +1511,18 @@ get_assignment_lhs(ir_dereference *ir, vec4_visitor *v) void 
vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src, - const struct glsl_type *type, bool predicated) + const struct glsl_type *type, uint32_t predicate) { if (type->base_type == GLSL_TYPE_STRUCT) { for (unsigned int i = 0; i < type->length; i++) { - emit_block_move(dst, src, type->fields.structure[i].type, predicated); + emit_block_move(dst, src, type->fields.structure[i].type, predicate); } return; } if (type->is_array()) { for (unsigned int i = 0; i < type->length; i++) { - emit_block_move(dst, src, type->fields.array, predicated); + emit_block_move(dst, src, type->fields.array, predicate); } return; } @@ -1326,7 +1534,7 @@ vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src, type->vector_elements, 1); for (int i = 0; i < type->matrix_columns; i++) { - emit_block_move(dst, src, vec_type, predicated); + emit_block_move(dst, src, vec_type, predicate); } return; } @@ -1338,22 +1546,78 @@ vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src, dst->writemask = (1 << type->vector_elements) - 1; - /* Do we need to worry about swizzling a swizzle? */ - assert(src->swizzle = BRW_SWIZZLE_NOOP); src->swizzle = swizzle_for_size(type->vector_elements); - vec4_instruction *inst = emit(BRW_OPCODE_MOV, *dst, *src); - if (predicated) - inst->predicate = BRW_PREDICATE_NORMAL; + vec4_instruction *inst = emit(MOV(*dst, *src)); + inst->predicate = predicate; dst->reg_offset++; src->reg_offset++; } + +/* If the RHS processing resulted in an instruction generating a + * temporary value, and it would be easy to rewrite the instruction to + * generate its result right into the LHS instead, do so. This ends + * up reliably removing instructions where it can be tricky to do so + * later without real UD chain information. + */ +bool +vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir, + dst_reg dst, + src_reg src, + vec4_instruction *pre_rhs_inst, + vec4_instruction *last_rhs_inst) +{ + /* This could be supported, but it would take more smarts. */ + if (ir->condition) + return false; + + if (pre_rhs_inst == last_rhs_inst) + return false; /* No instructions generated to work with. */ + + /* Make sure the last instruction generated our source reg. */ + if (src.file != GRF || + src.file != last_rhs_inst->dst.file || + src.reg != last_rhs_inst->dst.reg || + src.reg_offset != last_rhs_inst->dst.reg_offset || + src.reladdr || + src.abs || + src.negate || + last_rhs_inst->predicate != BRW_PREDICATE_NONE) + return false; + + /* Check that that last instruction fully initialized the channels + * we want to use, in the order we want to use them. We could + * potentially reswizzle the operands of many instructions so that + * we could handle out of order channels, but don't yet. + */ + + for (unsigned i = 0; i < 4; i++) { + if (dst.writemask & (1 << i)) { + if (!(last_rhs_inst->dst.writemask & (1 << i))) + return false; + + if (BRW_GET_SWZ(src.swizzle, i) != i) + return false; + } + } + + /* Success! Rewrite the instruction. 
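+    * (Illustration, not part of the original patch: for "x = a + b"
+    * this turns "ADD tmp, a, b; MOV x, tmp" into a single "ADD x, a, b".)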
*/ + last_rhs_inst->dst.file = dst.file; + last_rhs_inst->dst.reg = dst.reg; + last_rhs_inst->dst.reg_offset = dst.reg_offset; + last_rhs_inst->dst.reladdr = dst.reladdr; + last_rhs_inst->dst.writemask &= dst.writemask; + + return true; +} + void vec4_visitor::visit(ir_assignment *ir) { dst_reg dst = get_assignment_lhs(ir->lhs, this); + uint32_t predicate = BRW_PREDICATE_NONE; if (!ir->lhs->type->is_scalar() && !ir->lhs->type->is_vector()) { @@ -1361,17 +1625,32 @@ vec4_visitor::visit(ir_assignment *ir) src_reg src = this->result; if (ir->condition) { - emit_bool_to_cond_code(ir->condition); + emit_bool_to_cond_code(ir->condition, &predicate); } - emit_block_move(&dst, &src, ir->rhs->type, ir->condition != NULL); + /* emit_block_move doesn't account for swizzles in the source register. + * This should be ok, since the source register is a structure or an + * array, and those can't be swizzled. But double-check to be sure. + */ + assert(src.swizzle == + (ir->rhs->type->is_matrix() + ? swizzle_for_size(ir->rhs->type->vector_elements) + : BRW_SWIZZLE_NOOP)); + + emit_block_move(&dst, &src, ir->rhs->type, predicate); return; } /* Now we're down to just a scalar/vector with writemasks. */ int i; + vec4_instruction *pre_rhs_inst, *last_rhs_inst; + pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail(); + ir->rhs->accept(this); + + last_rhs_inst = (vec4_instruction *)this->instructions.get_tail(); + src_reg src = this->result; int swizzles[4]; @@ -1404,15 +1683,17 @@ vec4_visitor::visit(ir_assignment *ir) src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1], swizzles[2], swizzles[3]); + if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) { + return; + } + if (ir->condition) { - emit_bool_to_cond_code(ir->condition); + emit_bool_to_cond_code(ir->condition, &predicate); } for (i = 0; i < type_size(ir->lhs->type); i++) { - vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst, src); - - if (ir->condition) - inst->predicate = BRW_PREDICATE_NORMAL; + vec4_instruction *inst = emit(MOV(dst, src)); + inst->predicate = predicate; dst.reg_offset++; src.reg_offset++; @@ -1440,39 +1721,64 @@ vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir) if (ir->type->is_matrix()) { for (int i = 0; i < ir->type->matrix_columns; i++) { + float *vec = &ir->value.f[i * ir->type->vector_elements]; + for (int j = 0; j < ir->type->vector_elements; j++) { dst->writemask = 1 << j; dst->type = BRW_REGISTER_TYPE_F; - emit(BRW_OPCODE_MOV, *dst, - src_reg(ir->value.f[i * ir->type->vector_elements + j])); + emit(MOV(*dst, src_reg(vec[j]))); } dst->reg_offset++; } return; } + int remaining_writemask = (1 << ir->type->vector_elements) - 1; + for (int i = 0; i < ir->type->vector_elements; i++) { + if (!(remaining_writemask & (1 << i))) + continue; + dst->writemask = 1 << i; dst->type = brw_type_for_base_type(ir->type); + /* Find other components that match the one we're about to + * write. Emits fewer instructions for things like vec4(0.5, + * 1.5, 1.5, 1.5). + */ + for (int j = i + 1; j < ir->type->vector_elements; j++) { + if (ir->type->base_type == GLSL_TYPE_BOOL) { + if (ir->value.b[i] == ir->value.b[j]) + dst->writemask |= (1 << j); + } else { + /* u, i, and f storage all line up, so no need for a + * switch case for comparing each type. 
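+	  * (Note, not part of the original patch: for the
+	  * vec4(0.5, 1.5, 1.5, 1.5) example above, this folds the three
+	  * matching channels together, so only "MOV dst.x, 0.5F" and
+	  * "MOV dst.yzw, 1.5F" are emitted; a standalone sketch of this
+	  * loop appears at the end of this page.)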
+ */ + if (ir->value.u[i] == ir->value.u[j]) + dst->writemask |= (1 << j); + } + } + switch (ir->type->base_type) { case GLSL_TYPE_FLOAT: - emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.f[i])); + emit(MOV(*dst, src_reg(ir->value.f[i]))); break; case GLSL_TYPE_INT: - emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.i[i])); + emit(MOV(*dst, src_reg(ir->value.i[i]))); break; case GLSL_TYPE_UINT: - emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.u[i])); + emit(MOV(*dst, src_reg(ir->value.u[i]))); break; case GLSL_TYPE_BOOL: - emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.b[i])); + emit(MOV(*dst, src_reg(ir->value.b[i]))); break; default: assert(!"Non-float/uint/int/bool constant"); break; } + + remaining_writemask &= ~dst->writemask; } dst->reg_offset++; } @@ -1495,7 +1801,178 @@ vec4_visitor::visit(ir_call *ir) void vec4_visitor::visit(ir_texture *ir) { - assert(!"not reached"); + int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base); + sampler = vp->Base.SamplerUnits[sampler]; + + /* Should be lowered by do_lower_texture_projection */ + assert(!ir->projector); + + vec4_instruction *inst = NULL; + switch (ir->op) { + case ir_tex: + case ir_txl: + inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL); + break; + case ir_txd: + inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD); + break; + case ir_txf: + inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF); + break; + case ir_txs: + inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS); + break; + case ir_txb: + assert(!"TXB is not valid for vertex shaders."); + } + + /* Texel offsets go in the message header; Gen4 also requires headers. */ + inst->header_present = ir->offset || intel->gen < 5; + inst->base_mrf = 2; + inst->mlen = inst->header_present + 1; /* always at least one */ + inst->sampler = sampler; + inst->dst = dst_reg(this, ir->type); + inst->shadow_compare = ir->shadow_comparitor != NULL; + + if (ir->offset != NULL) + inst->texture_offset = brw_texture_offset(ir->offset->as_constant()); + + /* MRF for the first parameter */ + int param_base = inst->base_mrf + inst->header_present; + + if (ir->op == ir_txs) { + ir->lod_info.lod->accept(this); + int writemask = intel->gen == 4 ? 
WRITEMASK_W : WRITEMASK_X; + emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask), + this->result)); + } else { + int i, coord_mask = 0, zero_mask = 0; + /* Load the coordinate */ + /* FINISHME: gl_clamp_mask and saturate */ + for (i = 0; i < ir->coordinate->type->vector_elements; i++) + coord_mask |= (1 << i); + for (; i < 4; i++) + zero_mask |= (1 << i); + + ir->coordinate->accept(this); + emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask), + this->result)); + emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask), + src_reg(0))); + /* Load the shadow comparitor */ + if (ir->shadow_comparitor) { + ir->shadow_comparitor->accept(this); + emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type, + WRITEMASK_X), + this->result)); + inst->mlen++; + } + + /* Load the LOD info */ + if (ir->op == ir_txl) { + int mrf, writemask; + if (intel->gen >= 5) { + mrf = param_base + 1; + if (ir->shadow_comparitor) { + writemask = WRITEMASK_Y; + /* mlen already incremented */ + } else { + writemask = WRITEMASK_X; + inst->mlen++; + } + } else /* intel->gen == 4 */ { + mrf = param_base; + writemask = WRITEMASK_Z; + } + ir->lod_info.lod->accept(this); + emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask), + this->result)); + } else if (ir->op == ir_txf) { + ir->lod_info.lod->accept(this); + emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W), + this->result)); + } else if (ir->op == ir_txd) { + const glsl_type *type = ir->lod_info.grad.dPdx->type; + + ir->lod_info.grad.dPdx->accept(this); + src_reg dPdx = this->result; + ir->lod_info.grad.dPdy->accept(this); + src_reg dPdy = this->result; + + if (intel->gen >= 5) { + dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); + dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx)); + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy)); + inst->mlen++; + + if (ir->type->vector_elements == 3) { + dPdx.swizzle = BRW_SWIZZLE_ZZZZ; + dPdy.swizzle = BRW_SWIZZLE_ZZZZ; + emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx)); + emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy)); + inst->mlen++; + } + } else /* intel->gen == 4 */ { + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx)); + emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy)); + inst->mlen += 2; + } + } + } + + emit(inst); + + swizzle_result(ir, src_reg(inst->dst), sampler); +} + +void +vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler) +{ + this->result = orig_val; + + int s = c->key.tex.swizzles[sampler]; + + if (ir->op == ir_txs || ir->type == glsl_type::float_type + || s == SWIZZLE_NOOP) + return; + + int zero_mask = 0, one_mask = 0, copy_mask = 0; + int swizzle[4]; + + for (int i = 0; i < 4; i++) { + switch (GET_SWZ(s, i)) { + case SWIZZLE_ZERO: + zero_mask |= (1 << i); + break; + case SWIZZLE_ONE: + one_mask |= (1 << i); + break; + default: + copy_mask |= (1 << i); + swizzle[i] = GET_SWZ(s, i); + break; + } + } + + this->result = src_reg(this, ir->type); + dst_reg swizzled_result(this->result); + + if (copy_mask) { + orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]); + swizzled_result.writemask = copy_mask; + emit(MOV(swizzled_result, orig_val)); + } + + if (zero_mask) { + swizzled_result.writemask = zero_mask; + emit(MOV(swizzled_result, src_reg(0.0f))); + } + + if (one_mask) { + 
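+      /* (Note, not part of the original patch: channels the swizzle maps
+       * to SWIZZLE_ONE, e.g. the alpha channel of a LUMINANCE texture,
+       * are simply written as 1.0f here.)
+       */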
swizzled_result.writemask = one_mask; + emit(MOV(swizzled_result, src_reg(1.0f))); + } } void @@ -1521,9 +1998,9 @@ vec4_visitor::visit(ir_if *ir) if (intel->gen == 6) { emit_if_gen6(ir); } else { - emit_bool_to_cond_code(ir->condition); - vec4_instruction *inst = emit(BRW_OPCODE_IF); - inst->predicate = BRW_PREDICATE_NORMAL; + uint32_t predicate; + emit_bool_to_cond_code(ir->condition, &predicate); + emit(IF(predicate)); } visit_instructions(&ir->then_instructions); @@ -1539,14 +2016,15 @@ vec4_visitor::visit(ir_if *ir) emit(BRW_OPCODE_ENDIF); } -int -vec4_visitor::emit_vue_header_gen4(int header_mrf) +void +vec4_visitor::emit_ndc_computation() { /* Get the position */ src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]); /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */ dst_reg ndc = dst_reg(this, glsl_type::vec4_type); + output_reg[BRW_VERT_RESULT_NDC] = ndc; current_annotation = "NDC"; dst_reg ndc_w = ndc; @@ -1558,32 +2036,39 @@ vec4_visitor::emit_vue_header_gen4(int header_mrf) dst_reg ndc_xyz = ndc; ndc_xyz.writemask = WRITEMASK_XYZ; - emit(BRW_OPCODE_MUL, ndc_xyz, pos, src_reg(ndc_w)); + emit(MUL(ndc_xyz, pos, src_reg(ndc_w))); +} - if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) || - c->key.nr_userclip || brw->has_negative_rhw_bug) { +void +vec4_visitor::emit_psiz_and_flags(struct brw_reg reg) +{ + if (intel->gen < 6 && + ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) || + c->key.userclip_active || brw->has_negative_rhw_bug)) { dst_reg header1 = dst_reg(this, glsl_type::uvec4_type); + dst_reg header1_w = header1; + header1_w.writemask = WRITEMASK_W; GLuint i; - emit(BRW_OPCODE_MOV, header1, 0u); + emit(MOV(header1, 0u)); if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) { - assert(!"finishme: psiz"); - src_reg psiz; + src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]); - header1.writemask = WRITEMASK_W; - emit(BRW_OPCODE_MUL, header1, psiz, 1u << 11); - emit(BRW_OPCODE_AND, header1, src_reg(header1), 0x7ff << 8); + current_annotation = "Point size"; + emit(MUL(header1_w, psiz, src_reg((float)(1 << 11)))); + emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8)); } - for (i = 0; i < c->key.nr_userclip; i++) { + current_annotation = "Clipping flags"; + for (i = 0; i < c->key.nr_userclip_plane_consts; i++) { vec4_instruction *inst; - inst = emit(BRW_OPCODE_DP4, dst_reg(brw_null_reg()), - pos, src_reg(c->userplane[i])); + inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]), + src_reg(this->userplane[i]))); inst->conditional_mod = BRW_CONDITIONAL_L; - emit(BRW_OPCODE_OR, header1, src_reg(header1), 1u << i); + inst = emit(OR(header1_w, src_reg(header1_w), 1u << i)); inst->predicate = BRW_PREDICATE_NORMAL; } @@ -1602,104 +2087,120 @@ vec4_visitor::emit_vue_header_gen4(int header_mrf) brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_L, - brw_swizzle1(ndc, 3), + brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3), brw_imm_f(0)); brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6)); - brw_MOV(p, ndc, brw_imm_f(0)); + brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0)); brw_set_predicate_control(p, BRW_PREDICATE_NONE); #endif } - header1.writemask = WRITEMASK_XYZW; - emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(header1)); - } else { - emit(BRW_OPCODE_MOV, retype(brw_message_reg(header_mrf++), - BRW_REGISTER_TYPE_UD), 0u); - } - - if (intel->gen == 5) { - /* There are 20 DWs (D0-D19) in VUE header on Ironlake: - * dword 0-3 (m1) of the header is indices, point width, clip 
flags. - * dword 4-7 (m2) is the ndc position (set above) - * dword 8-11 (m3) of the vertex header is the 4D space position - * dword 12-19 (m4,m5) of the vertex header is the user clip distance. - * m6 is a pad so that the vertex element data is aligned - * m7 is the first vertex data we fill. - */ - current_annotation = "NDC"; - emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc)); - - current_annotation = "gl_Position"; - emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos); - - /* user clip distance. */ - header_mrf += 2; - - /* Pad so that vertex element data is aligned. */ - header_mrf++; + emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1))); + } else if (intel->gen < 6) { + emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u)); } else { - /* There are 8 dwords in VUE header pre-Ironlake: - * dword 0-3 (m1) is indices, point width, clip flags. - * dword 4-7 (m2) is ndc position (set above) - * - * dword 8-11 (m3) is the first vertex data. - */ - current_annotation = "NDC"; - emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc)); - - current_annotation = "gl_Position"; - emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos); + emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0))); + if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) { + emit(MOV(brw_writemask(reg, WRITEMASK_W), + src_reg(output_reg[VERT_RESULT_PSIZ]))); + } } - - return header_mrf; } -int -vec4_visitor::emit_vue_header_gen6(int header_mrf) +void +vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset) { - struct brw_reg reg; + if (intel->gen < 6) { + /* Clip distance slots are set aside in gen5, but they are not used. It + * is not clear whether we actually need to set aside space for them, + * but the performance cost is negligible. + */ + return; + } - /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge: - * dword 0-3 (m2) of the header is indices, point width, clip flags. - * dword 4-7 (m3) is the 4D space position - * dword 8-15 (m4,m5) of the vertex header is the user clip distance if - * enabled. + /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables): * - * m4 or 6 is the first vertex element data we fill. + * "If a linked set of shaders forming the vertex stage contains no + * static write to gl_ClipVertex or gl_ClipDistance, but the + * application has requested clipping against user clip planes through + * the API, then the coordinate written to gl_Position is used for + * comparison against the user clip planes." + * + * This function is only called if the shader didn't write to + * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping + * if the user wrote to it; otherwise we use gl_Position. 
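+    *
+    * (Note, not part of the original patch: each enabled plane then
+    * contributes dot(plane, clip_vertex) through the DP4s emitted below,
+    * one clip distance per flag channel.)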
*/ + gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX; + if (!(c->prog_data.outputs_written + & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) { + clip_vertex = VERT_RESULT_HPOS; + } - current_annotation = "indices, point width, clip flags"; - reg = brw_message_reg(header_mrf++); - emit(BRW_OPCODE_MOV, retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)); - if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) { - emit(BRW_OPCODE_MOV, brw_writemask(reg, WRITEMASK_W), - src_reg(output_reg[VERT_RESULT_PSIZ])); - } - - current_annotation = "gl_Position"; - emit(BRW_OPCODE_MOV, - brw_message_reg(header_mrf++), src_reg(output_reg[VERT_RESULT_HPOS])); - - current_annotation = "user clip distances"; - if (c->key.nr_userclip) { - for (int i = 0; i < c->key.nr_userclip; i++) { - struct brw_reg m; - if (i < 4) - m = brw_message_reg(header_mrf); - else - m = brw_message_reg(header_mrf + 1); - - emit(BRW_OPCODE_DP4, - dst_reg(brw_writemask(m, 1 << (i & 3))), - src_reg(c->userplane[i])); - } - header_mrf += 2; + for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4; + ++i) { + emit(DP4(dst_reg(brw_writemask(reg, 1 << i)), + src_reg(output_reg[clip_vertex]), + src_reg(this->userplane[i + offset]))); } +} - current_annotation = NULL; +void +vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result) +{ + assert (vert_result < VERT_RESULT_MAX); + reg.type = output_reg[vert_result].type; + current_annotation = output_reg_annotation[vert_result]; + /* Copy the register, saturating if necessary */ + vec4_instruction *inst = emit(MOV(reg, + src_reg(output_reg[vert_result]))); + if ((vert_result == VERT_RESULT_COL0 || + vert_result == VERT_RESULT_COL1 || + vert_result == VERT_RESULT_BFC0 || + vert_result == VERT_RESULT_BFC1) && + c->key.clamp_vertex_color) { + inst->saturate = true; + } +} - return header_mrf; +void +vec4_visitor::emit_urb_slot(int mrf, int vert_result) +{ + struct brw_reg hw_reg = brw_message_reg(mrf); + dst_reg reg = dst_reg(MRF, mrf); + reg.type = BRW_REGISTER_TYPE_F; + + switch (vert_result) { + case VERT_RESULT_PSIZ: + /* PSIZ is always in slot 0, and is coupled with other flags. */ + current_annotation = "indices, point width, clip flags"; + emit_psiz_and_flags(hw_reg); + break; + case BRW_VERT_RESULT_NDC: + current_annotation = "NDC"; + emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC]))); + break; + case BRW_VERT_RESULT_HPOS_DUPLICATE: + case VERT_RESULT_HPOS: + current_annotation = "gl_Position"; + emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS]))); + break; + case VERT_RESULT_CLIP_DIST0: + case VERT_RESULT_CLIP_DIST1: + if (this->c->key.uses_clip_distance) { + emit_generic_urb_slot(reg, vert_result); + } else { + current_annotation = "user clip distances"; + emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4); + } + break; + case BRW_VERT_RESULT_PAD: + /* No need to write to this slot */ + break; + default: + emit_generic_urb_slot(reg, vert_result); + break; + } } static int @@ -1737,14 +2238,18 @@ vec4_visitor::emit_urb_writes() */ int base_mrf = 1; int mrf = base_mrf; - int urb_entry_size; - uint64_t outputs_remaining = c->prog_data.outputs_written; /* In the process of generating our URB write message contents, we * may need to unspill a register or load from an array. Those * reads would use MRFs 14-15. */ int max_usable_mrf = 13; + /* The following assertion verifies that max_usable_mrf causes an + * even-numbered amount of URB write data, which will meet gen6's + * requirements for length alignment. 
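+    * (Note, not part of the original patch: with base_mrf == 1 and
+    * max_usable_mrf == 13 the difference is 12, so a maximal first URB
+    * write stays even-length.)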
+ */ + assert ((max_usable_mrf - base_mrf) % 2 == 0); + /* FINISHME: edgeflag */ /* First mrf is the g0-based message header containing URB handles and such, @@ -1752,63 +2257,41 @@ vec4_visitor::emit_urb_writes() */ mrf++; - if (intel->gen >= 6) { - mrf = emit_vue_header_gen6(mrf); - } else { - mrf = emit_vue_header_gen4(mrf); + if (intel->gen < 6) { + emit_ndc_computation(); } /* Set up the VUE data for the first URB write */ - int attr; - for (attr = 0; attr < VERT_RESULT_MAX; attr++) { - if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr))) - continue; - - outputs_remaining &= ~BITFIELD64_BIT(attr); - - /* This is set up in the VUE header. */ - if (attr == VERT_RESULT_HPOS) - continue; - - /* This is loaded into the VUE header, and thus doesn't occupy - * an attribute slot. - */ - if (attr == VERT_RESULT_PSIZ) - continue; - - emit(BRW_OPCODE_MOV, brw_message_reg(mrf++), src_reg(output_reg[attr])); + int slot; + for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) { + emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]); - /* If this was MRF 15, we can't fit anything more into this URB - * WRITE. Note that base_mrf of 1 means that MRF 15 is an - * even-numbered amount of URB write data, which will meet - * gen6's requirements for length alignment. + /* If this was max_usable_mrf, we can't fit anything more into this URB + * WRITE. */ if (mrf > max_usable_mrf) { - attr++; + slot++; break; } } + current_annotation = "URB write"; vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE); inst->base_mrf = base_mrf; inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf); - inst->eot = !outputs_remaining; - - urb_entry_size = mrf - base_mrf; + inst->eot = (slot >= c->prog_data.vue_map.num_slots); /* Optional second URB write */ - if (outputs_remaining) { + if (!inst->eot) { mrf = base_mrf + 1; - for (; attr < VERT_RESULT_MAX; attr++) { - if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr))) - continue; - + for (; slot < c->prog_data.vue_map.num_slots; ++slot) { assert(mrf < max_usable_mrf); - emit(BRW_OPCODE_MOV, brw_message_reg(mrf++), src_reg(output_reg[attr])); + emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]); } + current_annotation = "URB write"; inst = emit(VS_OPCODE_URB_WRITE); inst->base_mrf = base_mrf; inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf); @@ -1819,14 +2302,7 @@ vec4_visitor::emit_urb_writes() * those, since we're doing interleaved writes. */ inst->offset = (max_usable_mrf - base_mrf) / 2; - - urb_entry_size += mrf - base_mrf; } - - if (intel->gen == 6) - c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 8) / 8; - else - c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 4) / 4; } src_reg @@ -1847,21 +2323,35 @@ vec4_visitor::get_scratch_offset(vec4_instruction *inst, if (reladdr) { src_reg index = src_reg(this, glsl_type::int_type); - vec4_instruction *add = emit(BRW_OPCODE_ADD, - dst_reg(index), - *reladdr, - src_reg(reg_offset)); - /* Move our new instruction from the tail to its correct place. 
@@ -1847,21 +2323,35 @@ vec4_visitor::get_scratch_offset(vec4_instruction *inst,
 
    if (reladdr) {
       src_reg index = src_reg(this, glsl_type::int_type);
 
-      vec4_instruction *add = emit(BRW_OPCODE_ADD,
-                                   dst_reg(index),
-                                   *reladdr,
-                                   src_reg(reg_offset));
 
-      /* Move our new instruction from the tail to its correct place.
-       */
-      add->remove();
-      inst->insert_before(add);
+      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
+      emit_before(inst, MUL(dst_reg(index),
+                            index, src_reg(message_header_scale)));
 
-      vec4_instruction *mul = emit(BRW_OPCODE_MUL, dst_reg(index),
-                                   index, src_reg(message_header_scale));
-      mul->remove();
-      inst->insert_before(mul);
+      return index;
+   } else {
+      return src_reg(reg_offset * message_header_scale);
+   }
+}
+
+src_reg
+vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
+                                       src_reg *reladdr, int reg_offset)
+{
+   if (reladdr) {
+      src_reg index = src_reg(this, glsl_type::int_type);
+
+      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
+
+      /* Pre-gen6, the message header uses byte offsets instead of vec4
+       * (16-byte) offset units.
+       */
+      if (intel->gen < 6) {
+         emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
+      }
 
       return index;
    } else {
+      int message_header_scale = intel->gen < 6 ? 16 : 1;
       return src_reg(reg_offset * message_header_scale);
    }
 }
@@ -1878,14 +2368,7 @@ vec4_visitor::emit_scratch_read(vec4_instruction *inst,
    int reg_offset = base_offset + orig_src.reg_offset;
    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
 
-   vec4_instruction *scratch_read_inst = emit(VS_OPCODE_SCRATCH_READ,
-                                              temp, index);
-
-   scratch_read_inst->base_mrf = 14;
-   scratch_read_inst->mlen = 1;
-   /* Move our instruction from the tail to its correct place. */
-   scratch_read_inst->remove();
-   inst->insert_before(scratch_read_inst);
+   emit_before(inst, SCRATCH_READ(temp, index));
 }
 
 /**
@@ -1902,14 +2385,11 @@
    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                        orig_dst.writemask));
 
-   vec4_instruction *scratch_write_inst = emit(VS_OPCODE_SCRATCH_WRITE,
-                                               dst, temp, index);
-   scratch_write_inst->base_mrf = 13;
-   scratch_write_inst->mlen = 2;
-   scratch_write_inst->predicate = inst->predicate;
-   /* Move our instruction from the tail to its correct place. */
-   scratch_write_inst->remove();
-   inst->insert_after(scratch_write_inst);
+   vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
+   write->predicate = inst->predicate;
+   write->ir = inst->ir;
+   write->annotation = inst->annotation;
+   inst->insert_after(write);
 }
 
 /**
@@ -1991,6 +2471,110 @@ vec4_visitor::move_grf_array_access_to_scratch()
    }
 }
 
+/**
+ * Emits an instruction before @inst to load the value named by @orig_src
+ * from the pull constant buffer (surface) at @base_offset to @temp.
+ */
+void
+vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
+                                      dst_reg temp, src_reg orig_src,
+                                      int base_offset)
+{
+   int reg_offset = base_offset + orig_src.reg_offset;
+   src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
+   vec4_instruction *load;
+
+   load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
+                                        temp, index);
+   load->base_mrf = 14;
+   load->mlen = 1;
+   emit_before(inst, load);
+}
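One subtlety worth calling out in get_pull_constant_offset() is that the unit of the offset changes across generations. A small sketch of the constant-offset case -- `gen` stands in for intel->gen, and this models only the scaling, not the message setup:

/* Pre-gen6 constant loads address the buffer in bytes, so a vec4 at
 * reg_offset N lives at byte N*16; gen6+ messages take the offset in
 * vec4 (16-byte) units directly. */
static int pull_constant_offset(int gen, int reg_offset)
{
   int message_header_scale = gen < 6 ? 16 : 1;
   return reg_offset * message_header_scale;
}

/* e.g. reg_offset 3 becomes 48 on gen4/5 but stays 3 on gen6+; the
 * reladdr path above applies the same factor with an emitted MUL. */
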
+/**
+ * Implements array access of uniforms by inserting a
+ * PULL_CONSTANT_LOAD instruction.
+ *
+ * Unlike temporary GRF array access (where we don't support it due to
+ * the difficulty of doing relative addressing on instruction
+ * destinations), we could potentially do array access of uniforms
+ * that were loaded in GRF space as push constants.  In real-world
+ * usage we've seen, though, the arrays being used are always larger
+ * than we could load as push constants, so just always move all
+ * uniform array access out to a pull constant buffer.
+ */
+void
+vec4_visitor::move_uniform_array_access_to_pull_constants()
+{
+   int pull_constant_loc[this->uniforms];
+
+   for (int i = 0; i < this->uniforms; i++) {
+      pull_constant_loc[i] = -1;
+   }
+
+   /* Walk through and find array access of uniforms.  Put a copy of that
+    * uniform in the pull constant buffer.
+    *
+    * Note that we don't move constant-indexed accesses to arrays.  No
+    * testing has been done of the performance impact of this choice.
+    */
+   foreach_list_safe(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
+      for (int i = 0 ; i < 3; i++) {
+         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
+            continue;
+
+         int uniform = inst->src[i].reg;
+
+         /* If this array isn't already present in the pull constant buffer,
+          * add it.
+          */
+         if (pull_constant_loc[uniform] == -1) {
+            const float **values = &prog_data->param[uniform * 4];
+
+            pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
+
+            for (int j = 0; j < uniform_size[uniform] * 4; j++) {
+               prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
+            }
+         }
+
+         /* Set up the annotation tracking for new generated instructions. */
+         base_ir = inst->ir;
+         current_annotation = inst->annotation;
+
+         dst_reg temp = dst_reg(this, glsl_type::vec4_type);
+
+         emit_pull_constant_load(inst, temp, inst->src[i],
+                                 pull_constant_loc[uniform]);
+
+         inst->src[i].file = temp.file;
+         inst->src[i].reg = temp.reg;
+         inst->src[i].reg_offset = temp.reg_offset;
+         inst->src[i].reladdr = NULL;
+      }
+   }
+
+   /* Now there are no accesses of the UNIFORM file with a reladdr, so
+    * no need to track them as larger-than-vec4 objects.  This will be
+    * relied on in cutting out unused uniform vectors from push
+    * constants.
+    */
+   split_uniform_registers();
+}
+
+void
+vec4_visitor::resolve_ud_negate(src_reg *reg)
+{
+   if (reg->type != BRW_REGISTER_TYPE_UD ||
+       !reg->negate)
+      return;
+
+   src_reg temp = src_reg(this, glsl_type::uvec4_type);
+   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
+   *reg = temp;
+}
 
 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
                            struct gl_shader_program *prog,
@@ -2011,26 +2595,31 @@ vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
    this->current_annotation = NULL;
 
    this->c = c;
-   this->vp = brw->vertex_program; /* FINISHME: change for precompile */
+   this->vp = (struct gl_vertex_program *)
+      prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
    this->prog_data = &c->prog_data;
 
    this->variable_ht = hash_table_ctor(0,
                                        hash_table_pointer_hash,
                                        hash_table_pointer_compare);
 
+   this->virtual_grf_def = NULL;
+   this->virtual_grf_use = NULL;
    this->virtual_grf_sizes = NULL;
    this->virtual_grf_count = 0;
+   this->virtual_grf_reg_map = NULL;
+   this->virtual_grf_reg_count = 0;
    this->virtual_grf_array_size = 0;
+   this->live_intervals_valid = false;
 
-   this->uniforms = 0;
+   this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
 
-   this->variable_ht = hash_table_ctor(0,
-                                       hash_table_pointer_hash,
-                                       hash_table_pointer_compare);
+   this->uniforms = 0;
 }
 
 vec4_visitor::~vec4_visitor()
 {
+   ralloc_free(this->mem_ctx);
    hash_table_dtor(this->variable_ht);
 }
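To make the relocation bookkeeping in move_uniform_array_access_to_pull_constants() concrete, here is a standalone model of just the pull_constant_loc logic -- the Uniform struct and the vectors are stand-ins for the driver's prog_data->param / pull_param arrays, not its actual types:

#include <vector>

/* Each uniform vec4 array accessed through a reladdr is appended to
 * the pull-constant buffer exactly once; pull_constant_loc[] remembers
 * its location in vec4 units, or -1 if it was never copied. */
struct Uniform { int size_vec4; const float *values; };

static std::vector<int>
relocate_indirect_uniforms(const std::vector<Uniform> &uniforms,
                           const std::vector<int> &indirectly_accessed,
                           std::vector<const float *> &pull_params)
{
   std::vector<int> pull_constant_loc(uniforms.size(), -1);

   for (int u : indirectly_accessed) {
      if (pull_constant_loc[u] != -1)
         continue;  /* already copied to the pull buffer */

      pull_constant_loc[u] = (int) pull_params.size() / 4;
      for (int j = 0; j < uniforms[u].size_vec4 * 4; j++)
         pull_params.push_back(&uniforms[u].values[j]);
   }
   return pull_constant_loc;
}

After this pass every indirect UNIFORM access reads from a freshly loaded temporary instead, which is why split_uniform_registers() can then treat all remaining uniforms as plain vec4s.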