/* emit the arrays used for inputs and outputs - load/store intrinsics will
* be converted to reads/writes of these arrays
*/
- nir_setup_inputs();
nir_setup_outputs();
nir_setup_uniforms();
nir_emit_system_values();
}
}
-void
-fs_visitor::nir_setup_inputs()
-{
- if (stage != MESA_SHADER_FRAGMENT)
- return;
-
- nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
-
- nir_foreach_variable(var, &nir->inputs) {
- fs_reg input = offset(nir_inputs, bld, var->data.driver_location);
-
- fs_reg reg;
- if (var->data.location == VARYING_SLOT_POS) {
- reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
- var->data.origin_upper_left);
- emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
- input, reg), 0xF);
- } else if (var->data.location == VARYING_SLOT_LAYER) {
- struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER, 1), 3);
- reg.type = BRW_REGISTER_TYPE_D;
- bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg);
- } else if (var->data.location == VARYING_SLOT_VIEWPORT) {
- struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_VIEWPORT, 2), 3);
- reg.type = BRW_REGISTER_TYPE_D;
- bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg);
- } else {
- int location = var->data.location;
- emit_general_interpolation(&input, var->name, var->type,
- (glsl_interp_qualifier) var->data.interpolation,
- &location, var->data.centroid,
- var->data.sample);
- }
- }
-}
-
-void
-fs_visitor::nir_setup_single_output_varying(fs_reg *reg,
- const glsl_type *type,
- unsigned *location)
-{
- if (type->is_array() || type->is_matrix()) {
- const struct glsl_type *elem_type = glsl_get_array_element(type);
- const unsigned length = glsl_get_length(type);
-
- for (unsigned i = 0; i < length; i++) {
- nir_setup_single_output_varying(reg, elem_type, location);
- }
- } else if (type->is_record()) {
- for (unsigned i = 0; i < type->length; i++) {
- const struct glsl_type *field_type = type->fields.structure[i].type;
- nir_setup_single_output_varying(reg, field_type, location);
- }
- } else {
- assert(type->is_scalar() || type->is_vector());
- unsigned num_elements = type->vector_elements;
- if (type->is_double())
- num_elements *= 2;
- for (unsigned count = 0; count < num_elements; count += 4) {
- this->outputs[*location] = *reg;
- this->output_components[*location] = MIN2(4, num_elements - count);
- *reg = offset(*reg, bld, 4);
- (*location)++;
- }
- }
-}
-
void
fs_visitor::nir_setup_outputs()
{
- if (stage == MESA_SHADER_TESS_CTRL)
+ if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
return;
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-
- nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs);
-
nir_foreach_variable(var, &nir->outputs) {
- fs_reg reg = offset(nir_outputs, bld, var->data.driver_location);
-
- switch (stage) {
- case MESA_SHADER_VERTEX:
- case MESA_SHADER_TESS_EVAL:
- case MESA_SHADER_GEOMETRY: {
- unsigned location = var->data.location;
-         nir_setup_single_output_varying(&reg, var->type, &location);
- break;
- }
- case MESA_SHADER_FRAGMENT:
- if (key->force_dual_color_blend &&
- var->data.location == FRAG_RESULT_DATA1) {
- this->dual_src_output = reg;
- this->do_dual_src = true;
- } else if (var->data.index > 0) {
- assert(var->data.location == FRAG_RESULT_DATA0);
- assert(var->data.index == 1);
- this->dual_src_output = reg;
- this->do_dual_src = true;
- } else if (var->data.location == FRAG_RESULT_COLOR) {
- /* Writing gl_FragColor outputs to all color regions. */
- for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
- this->outputs[i] = reg;
- this->output_components[i] = 4;
- }
- } else if (var->data.location == FRAG_RESULT_DEPTH) {
- this->frag_depth = reg;
- } else if (var->data.location == FRAG_RESULT_STENCIL) {
- this->frag_stencil = reg;
- } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) {
- this->sample_mask = reg;
- } else {
- int vector_elements = var->type->without_array()->vector_elements;
-
- /* gl_FragData or a user-defined FS output */
- assert(var->data.location >= FRAG_RESULT_DATA0 &&
- var->data.location < FRAG_RESULT_DATA0+BRW_MAX_DRAW_BUFFERS);
-
- /* General color output. */
- for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) {
- int output = var->data.location - FRAG_RESULT_DATA0 + i;
- this->outputs[output] = offset(reg, bld, vector_elements * i);
- this->output_components[output] = vector_elements;
- }
- }
- break;
- default:
- unreachable("unhandled shader stage");
+ const unsigned vec4s =
+ var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
+ : type_size_vec4(var->type);
+ fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s);
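+      /* Point each driver_location slot at the matching vec4 within the
+       * allocation, unless an earlier variable already claimed that slot.
+       */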
+ for (unsigned i = 0; i < vec4s; i++) {
+ if (outputs[var->data.driver_location + i].file == BAD_FILE)
+ outputs[var->data.driver_location + i] = offset(reg, bld, 4 * i);
}
}
}
void
fs_visitor::nir_setup_uniforms()
{
- if (dispatch_width != 8)
+ if (dispatch_width != min_dispatch_width)
return;
uniforms = nir->num_uniforms / 4;
*reg = *v->emit_samplemaskin_setup();
break;
- case nir_intrinsic_load_local_invocation_id:
- assert(v->stage == MESA_SHADER_COMPUTE);
- reg = &v->nir_system_values[SYSTEM_VALUE_LOCAL_INVOCATION_ID];
- if (reg->file == BAD_FILE)
- *reg = *v->emit_cs_local_invocation_id_setup();
- break;
-
case nir_intrinsic_load_work_group_id:
assert(v->stage == MESA_SHADER_COMPUTE);
reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
stride(byte_offset(retype(brw_vec1_grf(1, 0),
BRW_REGISTER_TYPE_UB), 28),
1, 8, 0),
- brw_imm_uv(0x76543210));
+ brw_imm_v(0x76543210));
/* A set bit in the pixel mask means the channel is enabled, but
* that is the opposite of gl_HelperInvocation so we need to invert
break;
case nir_instr_type_ssa_undef:
- nir_emit_undef(abld, nir_instr_as_ssa_undef(instr));
+ /* We create a new VGRF for undefs on every use (by handling
+ * them in get_nir_src()), rather than for each definition.
+ * This helps register coalescing eliminate MOVs from undef.
+ */
break;
case nir_instr_type_jump:
nir_const_value *element = nir_src_as_const_value(src0->src[1].src);
assert(element != NULL);
- enum opcode extract_op;
- if (src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16) {
- assert(element->u32[0] <= 1);
- extract_op = SHADER_OPCODE_EXTRACT_WORD;
- } else {
- assert(element->u32[0] <= 3);
- extract_op = SHADER_OPCODE_EXTRACT_BYTE;
- }
+   /* Element type to extract. */
+ const brw_reg_type type = brw_int_type(
+ src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
+ src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
fs_reg op0 = get_nir_src(src0->src[0].src);
op0.type = brw_type_for_nir_type(
op0 = offset(op0, bld, src0->src[0].swizzle[0]);
set_saturate(instr->dest.saturate,
- bld.emit(extract_op, result, op0, brw_imm_ud(element->u32[0])));
+ bld.MOV(result, subscript(op0, type, element->u32[0])));
return true;
}
g0.negate = true;
}
- tmp.type = BRW_REGISTER_TYPE_W;
- tmp.subreg_offset = 2;
- tmp.stride = 2;
-
- bld.OR(tmp, g0, brw_imm_uw(0x3f80));
-
- tmp.type = BRW_REGISTER_TYPE_D;
- tmp.subreg_offset = 0;
- tmp.stride = 1;
+ bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
+ g0, brw_imm_uw(0x3f80));
} else {
/* Bit 31 of g1.6 is 0 if the polygon is front facing. */
fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
return true;
}
+static void
+emit_find_msb_using_lzd(const fs_builder &bld,
+ const fs_reg &result,
+ const fs_reg &src,
+ bool is_signed)
+{
+ fs_inst *inst;
+ fs_reg temp = src;
+
+ if (is_signed) {
+ /* LZD of an absolute value source almost always does the right
+ * thing. There are two problem values:
+ *
+ * * 0x80000000. Since abs(0x80000000) == 0x80000000, LZD returns
+ * 0. However, findMSB(int(0x80000000)) == 30.
+ *
+ * * 0xffffffff. Since abs(0xffffffff) == 1, LZD returns
+ * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
+ *
+ * For a value of zero or negative one, -1 will be returned.
+ *
+ * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but
+ * findMSB(-(1<<x)) should return x-1.
+ *
+ * For all negative number cases, including 0x80000000 and
+ * 0xffffffff, the correct value is obtained from LZD if instead of
+ * negating the (already negative) value the logical-not is used. A
+       * conditional logical-not can be achieved in two instructions.
+ */
+ temp = bld.vgrf(BRW_REGISTER_TYPE_D);
+
+ bld.ASR(temp, src, brw_imm_d(31));
+ bld.XOR(temp, temp, src);
+ }
+
+ bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
+ retype(temp, BRW_REGISTER_TYPE_UD));
+
+ /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
+ * from the LSB side. Subtract the result from 31 to convert the MSB
+ * count into an LSB count. If no bits are set, LZD will return 32.
+ * 31-32 = -1, which is exactly what findMSB() is supposed to return.
+ */
+ inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
+ inst->src[0].negate = true;
+}
+
void
fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
{
case nir_op_u2f:
if (optimize_extract_to_float(instr, result))
return;
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ break;
case nir_op_f2d:
case nir_op_i2d:
case nir_op_u2d:
+ case nir_op_f2i64:
+ case nir_op_f2u64:
+ case nir_op_i2i64:
+ case nir_op_i2u64:
+ case nir_op_u2i64:
+ case nir_op_u2u64:
+ case nir_op_b2i64:
+ /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
+ *
+ * "When source or destination is 64b (...), regioning in Align1
+ * must follow these rules:
+ *
+ * 1. Source and destination horizontal stride must be aligned to
+ * the same qword.
+ * (...)"
+ *
+ * This means that 32-bit to 64-bit conversions need to have the 32-bit
+ * data elements aligned to 64-bit. This restriction does not apply to
+ * BDW and later.
+ */
+ if (nir_dest_bit_size(instr->dest.dest) == 64 &&
+ nir_src_bit_size(instr->src[0].src) == 32 &&
+ (devinfo->is_cherryview || devinfo->is_broxton)) {
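+         /* Allocate the temporary with the 64-bit destination type so that
+          * viewing it through the 32-bit source type below yields
+          * qword-aligned elements, then expand through it with two MOVs.
+          */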
+ fs_reg tmp = bld.vgrf(result.type, 1);
+ tmp = subscript(tmp, op[0].type, 0);
+ inst = bld.MOV(tmp, op[0]);
+ inst = bld.MOV(result, tmp);
+ inst->saturate = instr->dest.saturate;
+ break;
+ }
+ /* fallthrough */
case nir_op_d2f:
case nir_op_d2i:
case nir_op_d2u:
- inst = bld.MOV(result, op[0]);
- inst->saturate = instr->dest.saturate;
+ if (instr->op == nir_op_b2i64) {
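+         /* NIR booleans are 0/~0, so negating the source yields the 0/1
+          * integer value before conversion to 64-bit.
+          */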
+ bld.MOV(result, negate(op[0]));
+ } else {
+ inst = bld.MOV(result, op[0]);
+ inst->saturate = instr->dest.saturate;
+ }
break;
case nir_op_f2i:
* a register and compare with that.
*/
fs_reg tmp = vgrf(glsl_type::double_type);
- bld.MOV(tmp, brw_imm_df(0.0));
+ bld.MOV(tmp, setup_imm_df(bld, 0.0));
/* A direct DF CMP using the flag register (null dst) won't work in
* SIMD16 because the CMP will be split in two by lower_simd_width,
break;
case nir_op_fddy:
if (fs_key->high_quality_derivatives) {
- inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
- brw_imm_d(fs_key->render_to_fbo));
+ inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
} else {
- inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
- brw_imm_d(fs_key->render_to_fbo));
+ inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
}
inst->saturate = instr->dest.saturate;
break;
case nir_op_fddy_fine:
- inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
- brw_imm_d(fs_key->render_to_fbo));
+ inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
inst->saturate = instr->dest.saturate;
break;
case nir_op_fddy_coarse:
- inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
- brw_imm_d(fs_key->render_to_fbo));
+ inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
inst->saturate = instr->dest.saturate;
break;
case nir_op_d2b: {
/* two-argument instructions can't take 64-bit immediates */
fs_reg zero = vgrf(glsl_type::double_type);
- bld.MOV(zero, brw_imm_df(0.0));
+ bld.MOV(zero, setup_imm_df(bld, 0.0));
/* A SIMD16 execution needs to be split in two instructions, so use
* a vgrf instead of the flag register as dst so instruction splitting
* works
break;
}
+ case nir_op_pack_int_2x32_split:
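+      /* FS_OPCODE_PACK writes its two 32-bit sources to the low and high
+       * dwords of each 64-bit destination element.
+       */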
+ bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
+ break;
+
+ case nir_op_unpack_int_2x32_split_x:
+ case nir_op_unpack_int_2x32_split_y: {
+ if (instr->op == nir_op_unpack_int_2x32_split_x)
+ bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
+ else
+ bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
+ break;
+ }
+
case nir_op_fpow:
inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
inst->saturate = instr->dest.saturate;
bld.CBIT(result, op[0]);
break;
- case nir_op_ufind_msb:
+ case nir_op_ufind_msb: {
+ assert(nir_dest_bit_size(instr->dest.dest) < 64);
+ emit_find_msb_using_lzd(bld, result, op[0], false);
+ break;
+ }
+
case nir_op_ifind_msb: {
assert(nir_dest_bit_size(instr->dest.dest) < 64);
- bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
- /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
- * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
- * subtract the result from 31 to convert the MSB count into an LSB count.
- */
- bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
+ if (devinfo->gen < 7) {
+ emit_find_msb_using_lzd(bld, result, op[0], true);
+ } else {
+ bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
- inst = bld.ADD(result, result, brw_imm_d(31));
- inst->predicate = BRW_PREDICATE_NORMAL;
- inst->src[0].negate = true;
+ /* FBH counts from the MSB side, while GLSL's findMSB() wants the
+ * count from the LSB side. If FBH didn't return an error
+ * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
+ * count into an LSB count.
+ */
+ bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
+
+ inst = bld.ADD(result, result, brw_imm_d(31));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->src[0].negate = true;
+ }
break;
}
case nir_op_find_lsb:
assert(nir_dest_bit_size(instr->dest.dest) < 64);
- bld.FBL(result, op[0]);
+
+ if (devinfo->gen < 7) {
+ fs_reg temp = vgrf(glsl_type::int_type);
+
+ /* (x & -x) generates a value that consists of only the LSB of x.
+ * For all powers of 2, findMSB(y) == findLSB(y).
+ */
+ fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
+ fs_reg negated_src = src;
+
+ /* One must be negated, and the other must be non-negated. It
+ * doesn't matter which is which.
+ */
+ negated_src.negate = true;
+ src.negate = false;
+
+ bld.AND(temp, src, negated_src);
+ emit_find_msb_using_lzd(bld, result, temp, false);
+ } else {
+ bld.FBL(result, op[0]);
+ }
break;
case nir_op_ubitfield_extract:
case nir_op_extract_u8:
case nir_op_extract_i8: {
+ const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
- bld.emit(SHADER_OPCODE_EXTRACT_BYTE,
- result, op[0], brw_imm_ud(byte->u32[0]));
+ assert(byte != NULL);
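+      /* subscript() selects the byte directly via register regioning, so a
+       * plain MOV replaces the old SHADER_OPCODE_EXTRACT_BYTE.
+       */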
+ bld.MOV(result, subscript(op[0], type, byte->u32[0]));
break;
}
case nir_op_extract_u16:
case nir_op_extract_i16: {
+ const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
- bld.emit(SHADER_OPCODE_EXTRACT_WORD,
- result, op[0], brw_imm_ud(word->u32[0]));
+ assert(word != NULL);
+ bld.MOV(result, subscript(op[0], type, word->u32[0]));
break;
}
case 64:
for (unsigned i = 0; i < instr->def.num_components; i++)
- bld.MOV(offset(reg, bld, i), brw_imm_df(instr->value.f64[i]));
+ bld.MOV(offset(reg, bld, i),
+ setup_imm_df(bld, instr->value.f64[i]));
break;
default:
nir_ssa_values[instr->def.index] = reg;
}
-void
-fs_visitor::nir_emit_undef(const fs_builder &bld, nir_ssa_undef_instr *instr)
-{
- const brw_reg_type reg_type =
- instr->def.bit_size == 32 ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
- nir_ssa_values[instr->def.index] =
- bld.vgrf(reg_type, instr->def.num_components);
-}
-
fs_reg
-fs_visitor::get_nir_src(nir_src src)
+fs_visitor::get_nir_src(const nir_src &src)
{
fs_reg reg;
if (src.is_ssa) {
- reg = nir_ssa_values[src.ssa->index];
+ if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
+ const brw_reg_type reg_type = src.ssa->bit_size == 32 ?
+ BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
+ reg = bld.vgrf(reg_type, src.ssa->num_components);
+ } else {
+ reg = nir_ssa_values[src.ssa->index];
+ }
} else {
/* We don't handle indirects on locals */
assert(src.reg.indirect == NULL);
return retype(reg, BRW_REGISTER_TYPE_D);
}
+/**
+ * Return an IMM for constants; otherwise call get_nir_src() as normal.
+ */
+fs_reg
+fs_visitor::get_nir_src_imm(const nir_src &src)
+{
+ nir_const_value *val = nir_src_as_const_value(src);
+ return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
+}
+
fs_reg
-fs_visitor::get_nir_dest(nir_dest dest)
+fs_visitor::get_nir_dest(const nir_dest &dest)
{
if (dest.is_ssa) {
const brw_reg_type reg_type =
const fs_reg &dst,
const fs_reg &src,
const fs_reg &desc,
- glsl_interp_qualifier interpolation)
+ glsl_interp_mode interpolation)
{
+ struct brw_wm_prog_data *wm_prog_data =
+ brw_wm_prog_data(bld.shader->stage_prog_data);
fs_inst *inst;
fs_reg payload;
int mlen;
inst = bld.emit(opcode, dst, payload, desc);
inst->mlen = mlen;
/* 2 floats per slot returned */
- inst->regs_written = 2 * bld.dispatch_width() / 8;
- inst->pi_noperspective = interpolation == INTERP_QUALIFIER_NOPERSPECTIVE;
+ inst->size_written = 2 * dst.component_size(inst->exec_size);
+ inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
+
+ wm_prog_data->pulls_bary = true;
return inst;
}
{
assert(stage == MESA_SHADER_GEOMETRY);
- struct brw_gs_prog_data *gs_prog_data =
- (struct brw_gs_prog_data *) prog_data;
+ struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
+
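+   /* If the control data header is empty, EndPrimitive() is a no-op. */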
+ if (gs_compile->control_data_header_size_bits == 0)
+ return;
/* We can only do EndPrimitive() functionality when the control data
* consists of cut bits. Fortunately, the only time it isn't is when the
assert(stage == MESA_SHADER_GEOMETRY);
assert(gs_compile->control_data_bits_per_vertex != 0);
- struct brw_gs_prog_data *gs_prog_data =
- (struct brw_gs_prog_data *) prog_data;
+ struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
const fs_builder abld = bld.annotate("emit control data bits");
const fs_builder fwa_bld = bld.exec_all();
fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
unsigned log2_bits_per_vertex =
- _mesa_fls(gs_compile->control_data_bits_per_vertex);
+ util_last_bit(gs_compile->control_data_bits_per_vertex);
abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
if (per_slot_offset.file != BAD_FILE) {
{
assert(stage == MESA_SHADER_GEOMETRY);
- struct brw_gs_prog_data *gs_prog_data =
- (struct brw_gs_prog_data *) prog_data;
+ struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
vertex_count.type = BRW_REGISTER_TYPE_UD;
* be recorded by transform feedback, we can simply discard all geometry
* bound to these streams when transform feedback is disabled.
*/
- if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
+ if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
return;
/* If we're outputting 32 control data bits or less, then we can wait
const nir_src &vertex_src,
unsigned base_offset,
const nir_src &offset_src,
- unsigned num_components)
+ unsigned num_components,
+ unsigned first_component)
{
- struct brw_gs_prog_data *gs_prog_data = (struct brw_gs_prog_data *) prog_data;
+ struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
nir_const_value *offset_const = nir_src_as_const_value(offset_src);
} else {
for (unsigned i = 0; i < num_components; i++) {
bld.MOV(offset(dst, bld, i),
- fs_reg(ATTR, imm_offset + i, dst.type));
+ fs_reg(ATTR, imm_offset + i + first_component, dst.type));
}
}
return;
/* Use first_icp_handle as the base offset. There is one register
* of URB handles per vertex, so inform the register allocator that
- * we might read up to nir->info.gs.vertices_in registers.
+ * we might read up to nir->info->gs.vertices_in registers.
*/
bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
fs_reg(brw_vec8_grf(first_icp_handle, 0)),
fs_reg(icp_offset_bytes),
- brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
+ brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE));
}
} else {
assert(gs_prog_data->invocations > 1);
/* Use first_icp_handle as the base offset. There is one DWord
* of URB handles per vertex, so inform the register allocator that
- * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
+ * we might read up to ceil(nir->info->gs.vertices_in / 8) registers.
*/
bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
fs_reg(brw_vec8_grf(first_icp_handle, 0)),
fs_reg(icp_offset_bytes),
- brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
+ brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) *
REG_SIZE));
}
}
fs_inst *inst;
- if (offset_const) {
- /* Constant indexing - use global offset. */
- inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
- inst->offset = base_offset + offset_const->u32[0];
- inst->base_mrf = -1;
- inst->mlen = 1;
- inst->regs_written = num_components;
- } else {
- /* Indirect indexing - use per-slot offsets as well. */
- const fs_reg srcs[] = { icp_handle, get_nir_src(offset_src) };
- fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
-
- inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
- inst->offset = base_offset;
- inst->base_mrf = -1;
- inst->mlen = 2;
- inst->regs_written = num_components;
+
+ fs_reg tmp_dst = dst;
+ fs_reg indirect_offset = get_nir_src(offset_src);
+ unsigned num_iterations = 1;
+ unsigned orig_num_components = num_components;
+
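+   /* A single URB read message can return at most two double components, so
+    * dvec3/dvec4 loads take two messages, each staged through a temporary
+    * and then shuffled into proper 64-bit data.
+    */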
+ if (type_sz(dst.type) == 8) {
+ if (num_components > 2) {
+ num_iterations = 2;
+ num_components = 2;
+ }
+ fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
+ tmp_dst = tmp;
+ first_component = first_component / 2;
+ }
+
+ for (unsigned iter = 0; iter < num_iterations; iter++) {
+ if (offset_const) {
+ /* Constant indexing - use global offset. */
+ if (first_component != 0) {
+ unsigned read_components = num_components + first_component;
+ fs_reg tmp = bld.vgrf(dst.type, read_components);
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
+ inst->size_written = read_components *
+ tmp.component_size(inst->exec_size);
+ for (unsigned i = 0; i < num_components; i++) {
+ bld.MOV(offset(tmp_dst, bld, i),
+ offset(tmp, bld, i + first_component));
+ }
+ } else {
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
+ icp_handle);
+ inst->size_written = num_components *
+ tmp_dst.component_size(inst->exec_size);
+ }
+ inst->offset = base_offset + offset_const->u32[0];
+ inst->mlen = 1;
+ } else {
+ /* Indirect indexing - use per-slot offsets as well. */
+ const fs_reg srcs[] = { icp_handle, indirect_offset };
+ unsigned read_components = num_components + first_component;
+ fs_reg tmp = bld.vgrf(dst.type, read_components);
+ fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+ if (first_component != 0) {
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
+ payload);
+ inst->size_written = read_components *
+ tmp.component_size(inst->exec_size);
+ for (unsigned i = 0; i < num_components; i++) {
+ bld.MOV(offset(tmp_dst, bld, i),
+ offset(tmp, bld, i + first_component));
+ }
+ } else {
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
+ payload);
+ inst->size_written = num_components *
+ tmp_dst.component_size(inst->exec_size);
+ }
+ inst->offset = base_offset;
+ inst->mlen = 2;
+ }
+
+ if (type_sz(dst.type) == 8) {
+ shuffle_32bit_load_result_to_64bit_data(
+ bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
+
+ for (unsigned c = 0; c < num_components; c++)
+ bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
+ }
+
+ if (num_iterations > 1) {
+ num_components = orig_num_components - 2;
+         if (offset_const) {
+ base_offset++;
+ } else {
+ fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
+ indirect_offset = new_indirect;
+ }
+ }
}
if (is_point_size) {
/* Read the whole VUE header (because of alignment) and read .w. */
fs_reg tmp = bld.vgrf(dst.type, 4);
inst->dst = tmp;
- inst->regs_written = 4;
+ inst->size_written = 4 * REG_SIZE;
bld.MOV(dst, offset(tmp, bld, 3));
}
}
break;
}
+ case nir_intrinsic_load_input: {
+ fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
+ unsigned first_component = nir_intrinsic_component(instr);
+ unsigned num_components = instr->num_components;
+ enum brw_reg_type type = dest.type;
+
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+ assert(const_offset && "Indirect input loads not allowed");
+ src = offset(src, bld, const_offset->u32[0]);
+
+ for (unsigned j = 0; j < num_components; j++) {
+ bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
+ }
+
+ if (type == BRW_REGISTER_TYPE_DF) {
+         /* Once the double vector has been read, restore its original
+          * register type so normal execution can continue.
+ */
+ src = retype(src, type);
+ dest = retype(dest, type);
+ }
+
+ if (type_sz(src.type) == 8) {
+ shuffle_32bit_load_result_to_64bit_data(bld,
+ dest,
+ retype(dest, BRW_REGISTER_TYPE_F),
+ instr->num_components);
+ }
+ break;
+ }
+
default:
nir_emit_intrinsic(bld, instr);
break;
{
assert(stage == MESA_SHADER_TESS_CTRL);
struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
- struct brw_tcs_prog_data *tcs_prog_data =
- (struct brw_tcs_prog_data *) prog_data;
+ struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
fs_reg dst;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
break;
fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- fs_reg m0_2 = byte_offset(m0, 2 * sizeof(uint32_t));
+ fs_reg m0_2 = component(m0, 2);
- const fs_builder fwa_bld = bld.exec_all();
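+      /* One-channel builder for the scalar m0.2 header DWord writes. */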
+ const fs_builder chanbld = bld.exec_all().group(1, 0);
/* Zero the message header */
- fwa_bld.MOV(m0, brw_imm_ud(0u));
+ bld.exec_all().MOV(m0, brw_imm_ud(0u));
/* Copy "Barrier ID" from r0.2, bits 16:13 */
- fwa_bld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
+ chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
brw_imm_ud(INTEL_MASK(16, 13)));
/* Shift it up to bits 27:24. */
- fwa_bld.SHL(m0_2, m0_2, brw_imm_ud(11));
+ chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
/* Set the Barrier Count and the enable bit */
- fwa_bld.OR(m0_2, m0_2,
- brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
+ chanbld.OR(m0_2, m0_2,
+ brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
break;
*/
unsigned num_iterations = 1;
unsigned num_components = instr->num_components;
+ unsigned first_component = nir_intrinsic_component(instr);
fs_reg orig_dst = dst;
if (type_sz(dst.type) == 8) {
+ first_component = first_component / 2;
if (instr->num_components > 2) {
num_iterations = 2;
num_components = 2;
for (unsigned iter = 0; iter < num_iterations; iter++) {
if (indirect_offset.file == BAD_FILE) {
/* Constant indexing - use global offset. */
- inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
+ if (first_component != 0) {
+ unsigned read_components = num_components + first_component;
+ fs_reg tmp = bld.vgrf(dst.type, read_components);
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
+ for (unsigned i = 0; i < num_components; i++) {
+ bld.MOV(offset(dst, bld, i),
+ offset(tmp, bld, i + first_component));
+ }
+ } else {
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
+ }
inst->offset = imm_offset;
inst->mlen = 1;
- inst->base_mrf = -1;
} else {
/* Indirect indexing - use per-slot offsets as well. */
const fs_reg srcs[] = { icp_handle, indirect_offset };
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
-
- inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
+ if (first_component != 0) {
+ unsigned read_components = num_components + first_component;
+ fs_reg tmp = bld.vgrf(dst.type, read_components);
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
+ payload);
+ for (unsigned i = 0; i < num_components; i++) {
+ bld.MOV(offset(dst, bld, i),
+ offset(tmp, bld, i + first_component));
+ }
+ } else {
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
+ payload);
+ }
inst->offset = imm_offset;
- inst->base_mrf = -1;
inst->mlen = 2;
}
- inst->regs_written = num_components * type_sz(dst.type) / 4;
+ inst->size_written = (num_components + first_component) *
+ inst->dst.component_size(inst->exec_size);
         /* If we are reading 64-bit data using 32-bit read messages we need
          * to build proper 64-bit data elements by shuffling the low and high
if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
assert(type_sz(dst.type) < 8);
inst->dst = bld.vgrf(dst.type, 4);
- inst->regs_written = 4;
+ inst->size_written = 4 * REG_SIZE;
bld.MOV(dst, offset(inst->dst, bld, 3));
}
*/
if (num_iterations > 1) {
num_components = instr->num_components - 2;
- if (indirect_offset.file == BAD_FILE) {
- imm_offset++;
- } else {
- fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
- indirect_offset = new_indirect;
- }
+ imm_offset++;
}
}
break;
case nir_intrinsic_load_per_vertex_output: {
fs_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = instr->const_index[0];
+ unsigned first_component = nir_intrinsic_component(instr);
fs_inst *inst;
if (indirect_offset.file == BAD_FILE) {
bld.MOV(patch_handle,
retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
- if (imm_offset == 0) {
- /* This is a read of gl_TessLevelInner[], which lives in the
- * Patch URB header. The layout depends on the domain.
- */
- dst.type = BRW_REGISTER_TYPE_F;
- switch (tcs_key->tes_primitive_mode) {
- case GL_QUADS: {
- /* DWords 3-2 (reversed) */
- fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
-
- inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
- inst->offset = 0;
- inst->mlen = 1;
- inst->base_mrf = -1;
- inst->regs_written = 4;
-
- /* dst.xy = tmp.wz */
- bld.MOV(dst, offset(tmp, bld, 3));
- bld.MOV(offset(dst, bld, 1), offset(tmp, bld, 2));
- break;
- }
- case GL_TRIANGLES:
- /* DWord 4; hardcode offset = 1 and regs_written = 1 */
- inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle);
- inst->offset = 1;
- inst->mlen = 1;
- inst->base_mrf = -1;
- inst->regs_written = 1;
- break;
- case GL_ISOLINES:
- /* All channels are undefined. */
- break;
- default:
- unreachable("Bogus tessellation domain");
- }
- } else if (imm_offset == 1) {
- /* This is a read of gl_TessLevelOuter[], which lives in the
- * Patch URB header. The layout depends on the domain.
- */
- dst.type = BRW_REGISTER_TYPE_F;
-
- fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
- inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
- inst->offset = 1;
- inst->mlen = 1;
- inst->base_mrf = -1;
- inst->regs_written = 4;
-
- /* Reswizzle: WZYX */
- fs_reg srcs[4] = {
- offset(tmp, bld, 3),
- offset(tmp, bld, 2),
- offset(tmp, bld, 1),
- offset(tmp, bld, 0),
- };
-
- unsigned num_components;
- switch (tcs_key->tes_primitive_mode) {
- case GL_QUADS:
- num_components = 4;
- break;
- case GL_TRIANGLES:
- num_components = 3;
- break;
- case GL_ISOLINES:
- /* Isolines are not reversed; swizzle .zw -> .xy */
- srcs[0] = offset(tmp, bld, 2);
- srcs[1] = offset(tmp, bld, 3);
- num_components = 2;
- break;
- default:
- unreachable("Bogus tessellation domain");
+ {
+ if (first_component != 0) {
+ unsigned read_components =
+ instr->num_components + first_component;
+ fs_reg tmp = bld.vgrf(dst.type, read_components);
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
+ patch_handle);
+ inst->size_written = read_components * REG_SIZE;
+ for (unsigned i = 0; i < instr->num_components; i++) {
+ bld.MOV(offset(dst, bld, i),
+ offset(tmp, bld, i + first_component));
+ }
+ } else {
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
+ patch_handle);
+ inst->size_written = instr->num_components * REG_SIZE;
}
- bld.LOAD_PAYLOAD(dst, srcs, num_components, 0);
- } else {
- inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle);
inst->offset = imm_offset;
inst->mlen = 1;
- inst->base_mrf = -1;
- inst->regs_written = instr->num_components;
}
} else {
/* Indirect indexing - use per-slot offsets as well. */
};
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
-
- inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
+ if (first_component != 0) {
+ unsigned read_components =
+ instr->num_components + first_component;
+ fs_reg tmp = bld.vgrf(dst.type, read_components);
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
+ payload);
+ inst->size_written = read_components * REG_SIZE;
+ for (unsigned i = 0; i < instr->num_components; i++) {
+ bld.MOV(offset(dst, bld, i),
+ offset(tmp, bld, i + first_component));
+ }
+ } else {
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
+ payload);
+ inst->size_written = instr->num_components * REG_SIZE;
+ }
inst->offset = imm_offset;
inst->mlen = 2;
- inst->base_mrf = -1;
- inst->regs_written = instr->num_components;
}
break;
}
if (indirect_offset.file != BAD_FILE) {
srcs[header_regs++] = indirect_offset;
- } else if (!is_passthrough_shader) {
- if (imm_offset == 0) {
- value.type = BRW_REGISTER_TYPE_F;
-
- mask &= (1 << tesslevel_inner_components(tcs_key->tes_primitive_mode)) - 1;
-
- /* This is a write to gl_TessLevelInner[], which lives in the
- * Patch URB header. The layout depends on the domain.
- */
- switch (tcs_key->tes_primitive_mode) {
- case GL_QUADS:
- /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
- * We use an XXYX swizzle to reverse put .xy in the .wz
- * channels, and use a .zw writemask.
- */
- mask = writemask_for_backwards_vector(mask);
- swiz = BRW_SWIZZLE4(0, 0, 1, 0);
- break;
- case GL_TRIANGLES:
- /* gl_TessLevelInner[].x lives at DWord 4, so we set the
- * writemask to X and bump the URB offset by 1.
- */
- imm_offset = 1;
- break;
- case GL_ISOLINES:
- /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
- return;
- default:
- unreachable("Bogus tessellation domain");
- }
- } else if (imm_offset == 1) {
- /* This is a write to gl_TessLevelOuter[] which lives in the
- * Patch URB Header at DWords 4-7. However, it's reversed, so
- * instead of .xyzw we have .wzyx.
- */
- value.type = BRW_REGISTER_TYPE_F;
-
- mask &= (1 << tesslevel_outer_components(tcs_key->tes_primitive_mode)) - 1;
-
- if (tcs_key->tes_primitive_mode == GL_ISOLINES) {
- /* Isolines .xy should be stored in .zw, in order. */
- swiz = BRW_SWIZZLE4(0, 0, 0, 1);
- mask <<= 2;
- } else {
- /* Other domains are reversed; store .wzyx instead of .xyzw */
- swiz = BRW_SWIZZLE_WZYX;
- mask = writemask_for_backwards_vector(mask);
- }
- }
}
if (mask == 0)
break;
- unsigned num_components = _mesa_fls(mask);
+ unsigned num_components = util_last_bit(mask);
enum opcode opcode;
/* We can only pack two 64-bit components in a single message, so send
*/
unsigned num_iterations = 1;
unsigned iter_components = num_components;
- if (is_64bit && instr->num_components > 2) {
- num_iterations = 2;
- iter_components = 2;
+ unsigned first_component = nir_intrinsic_component(instr);
+ if (is_64bit) {
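+         /* The component qualifier is counted in 32-bit units; each 64-bit
+          * component occupies two of them.
+          */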
+ first_component = first_component / 2;
+ if (instr->num_components > 2) {
+ num_iterations = 2;
+ iter_components = 2;
+ }
}
      /* 64-bit data needs to be shuffled before we can write it to the URB.
fs_reg tmp =
fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
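+      /* The URB channel mask is expressed relative to the start of the vec4
+       * slot, so shift it by the component offset of this store.
+       */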
+ mask = mask << first_component;
+
for (unsigned iter = 0; iter < num_iterations; iter++) {
if (!is_64bit && mask != WRITEMASK_XYZW) {
srcs[header_regs++] = brw_imm_ud(mask << 16);
}
for (unsigned i = 0; i < iter_components; i++) {
- if (!(mask & (1 << i)))
+ if (!(mask & (1 << (i + first_component))))
continue;
if (!is_64bit) {
- srcs[header_regs + i] = offset(value, bld, BRW_GET_SWZ(swiz, i));
+ srcs[header_regs + i + first_component] =
+ offset(value, bld, BRW_GET_SWZ(swiz, i));
} else {
/* We need to shuffle the 64-bit data to match the layout
* expected by our 32-bit URB write messages. We use a temporary
unsigned idx = 2 * i;
bld.MOV(dest, offset(tmp, bld, idx));
bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
- srcs[header_regs + idx] = dest;
- srcs[header_regs + idx + 1] = offset(dest, bld, 1);
+ srcs[header_regs + idx + first_component * 2] = dest;
+ srcs[header_regs + idx + 1 + first_component * 2] =
+ offset(dest, bld, 1);
}
}
unsigned mlen =
- header_regs + (is_64bit ? 2 * iter_components : iter_components);
+ header_regs + (is_64bit ? 2 * iter_components : iter_components) +
+ (is_64bit ? 2 * first_component : first_component);
fs_reg payload =
bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
inst->offset = imm_offset;
inst->mlen = mlen;
- inst->base_mrf = -1;
/* If this is a 64-bit attribute, select the next two 64-bit channels
* to be handled in the next iteration.
nir_intrinsic_instr *instr)
{
assert(stage == MESA_SHADER_TESS_EVAL);
- struct brw_tes_prog_data *tes_prog_data = (struct brw_tes_prog_data *) prog_data;
+ struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
}
break;
- case nir_intrinsic_load_tess_level_outer:
- /* When the TES reads gl_TessLevelOuter, we ensure that the patch header
- * appears as a push-model input. So, we can simply use the ATTR file
- * rather than issuing URB read messages. The data is stored in the
- * high DWords in reverse order - DWord 7 contains .x, DWord 6 contains
- * .y, and so on.
- */
- switch (tes_prog_data->domain) {
- case BRW_TESS_DOMAIN_QUAD:
- for (unsigned i = 0; i < 4; i++)
- bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
- break;
- case BRW_TESS_DOMAIN_TRI:
- for (unsigned i = 0; i < 3; i++)
- bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
- break;
- case BRW_TESS_DOMAIN_ISOLINE:
- for (unsigned i = 0; i < 2; i++)
- bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
- break;
- }
- break;
-
- case nir_intrinsic_load_tess_level_inner:
- /* When the TES reads gl_TessLevelInner, we ensure that the patch header
- * appears as a push-model input. So, we can simply use the ATTR file
- * rather than issuing URB read messages.
- */
- switch (tes_prog_data->domain) {
- case BRW_TESS_DOMAIN_QUAD:
- bld.MOV(dest, component(fs_reg(ATTR, 0), 3));
- bld.MOV(offset(dest, bld, 1), component(fs_reg(ATTR, 0), 2));
- break;
- case BRW_TESS_DOMAIN_TRI:
- bld.MOV(dest, component(fs_reg(ATTR, 0), 4));
- break;
- case BRW_TESS_DOMAIN_ISOLINE:
- /* ignore - value is undefined */
- break;
- }
- break;
-
case nir_intrinsic_load_input:
case nir_intrinsic_load_per_vertex_input: {
fs_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = instr->const_index[0];
+ unsigned first_component = nir_intrinsic_component(instr);
+
+ if (type_sz(dest.type) == 8) {
+ first_component = first_component / 2;
+ }
fs_inst *inst;
if (indirect_offset.file == BAD_FILE) {
if (imm_offset < max_push_slots) {
fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
for (int i = 0; i < instr->num_components; i++) {
- bld.MOV(offset(dest, bld, i),
- component(src, 4 * (imm_offset % 2) + i));
+ unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
+ i + first_component;
+ bld.MOV(offset(dest, bld, i), component(src, comp));
}
tes_prog_data->base.urb_read_length =
MAX2(tes_prog_data->base.urb_read_length,
fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
- inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle);
+ if (first_component != 0) {
+ unsigned read_components =
+ instr->num_components + first_component;
+ fs_reg tmp = bld.vgrf(dest.type, read_components);
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
+ patch_handle);
+ inst->size_written = read_components * REG_SIZE;
+ for (unsigned i = 0; i < instr->num_components; i++) {
+ bld.MOV(offset(dest, bld, i),
+ offset(tmp, bld, i + first_component));
+ }
+ } else {
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
+ patch_handle);
+ inst->size_written = instr->num_components * REG_SIZE;
+ }
inst->mlen = 1;
inst->offset = imm_offset;
- inst->base_mrf = -1;
- inst->regs_written = instr->num_components;
}
} else {
/* Indirect indexing - use per-slot offsets as well. */
- const fs_reg srcs[] = {
- retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
- indirect_offset
- };
- fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
- inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, payload);
- inst->mlen = 2;
- inst->offset = imm_offset;
- inst->base_mrf = -1;
- inst->regs_written = instr->num_components;
- }
- break;
- }
- default:
- nir_emit_intrinsic(bld, instr);
- break;
- }
-}
+ /* We can only read two double components with each URB read, so
+ * we send two read messages in that case, each one loading up to
+ * two double components.
+ */
+ unsigned num_iterations = 1;
+ unsigned num_components = instr->num_components;
+ fs_reg orig_dest = dest;
+ if (type_sz(dest.type) == 8) {
+ if (instr->num_components > 2) {
+ num_iterations = 2;
+ num_components = 2;
+ }
+ fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
+ dest = tmp;
+ }
+
+ for (unsigned iter = 0; iter < num_iterations; iter++) {
+ const fs_reg srcs[] = {
+ retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
+ indirect_offset
+ };
+ fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+ if (first_component != 0) {
+ unsigned read_components =
+ num_components + first_component;
+ fs_reg tmp = bld.vgrf(dest.type, read_components);
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
+ payload);
+ for (unsigned i = 0; i < num_components; i++) {
+ bld.MOV(offset(dest, bld, i),
+ offset(tmp, bld, i + first_component));
+ }
+ } else {
+ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
+ payload);
+ }
+ inst->mlen = 2;
+ inst->offset = imm_offset;
+ inst->size_written = (num_components + first_component) *
+ inst->dst.component_size(inst->exec_size);
+
+            /* If we are reading 64-bit data using 32-bit read messages we
+             * need to build proper 64-bit data elements by shuffling the low
+             * and high
+ * 32-bit components around like we do for other things like UBOs
+ * or SSBOs.
+ */
+ if (type_sz(dest.type) == 8) {
+ shuffle_32bit_load_result_to_64bit_data(
+ bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);
+
+ for (unsigned c = 0; c < num_components; c++) {
+ bld.MOV(offset(orig_dest, bld, iter * 2 + c),
+ offset(dest, bld, c));
+ }
+ }
+
+            /* If we are loading double data and need a second read message,
+             * adjust the offset.
+ */
+ if (num_iterations > 1) {
+ num_components = instr->num_components - 2;
+ imm_offset++;
+ }
+ }
+ }
+ break;
+ }
+ default:
+ nir_emit_intrinsic(bld, instr);
+ break;
+ }
+}
void
fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
switch (instr->intrinsic) {
case nir_intrinsic_load_primitive_id:
assert(stage == MESA_SHADER_GEOMETRY);
- assert(((struct brw_gs_prog_data *)prog_data)->include_primitive_id);
+ assert(brw_gs_prog_data(prog_data)->include_primitive_id);
bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
break;
case nir_intrinsic_load_per_vertex_input:
emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
- instr->src[1], instr->num_components);
+ instr->src[1], instr->num_components,
+ nir_intrinsic_component(instr));
break;
case nir_intrinsic_emit_vertex_with_counter:
}
}
+/**
+ * Fetch the current render target layer index.
+ */
+static fs_reg
+fetch_render_target_array_index(const fs_builder &bld)
+{
+ if (bld.shader->devinfo->gen >= 6) {
+ /* The render target array index is provided in the thread payload as
+ * bits 26:16 of r0.0.
+ */
+ const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
+ brw_imm_uw(0x7ff));
+ return idx;
+ } else {
+ /* Pre-SNB we only ever render into the first layer of the framebuffer
+ * since layered rendering is not implemented.
+ */
+ return brw_imm_ud(0);
+ }
+}
+
+/**
+ * Fake non-coherent framebuffer read implemented using TXF to fetch from the
+ * framebuffer at the current fragment coordinates and sample index.
+ */
+fs_inst *
+fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
+ unsigned target)
+{
+ const struct gen_device_info *devinfo = bld.shader->devinfo;
+
+ assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
+ const brw_wm_prog_key *wm_key =
+ reinterpret_cast<const brw_wm_prog_key *>(key);
+ assert(!wm_key->coherent_fb_fetch);
+ const struct brw_wm_prog_data *wm_prog_data =
+ brw_wm_prog_data(stage_prog_data);
+
+ /* Calculate the surface index relative to the start of the texture binding
+ * table block, since that's what the texturing messages expect.
+ */
+ const unsigned surface = target +
+ wm_prog_data->binding_table.render_target_read_start -
+ wm_prog_data->base.binding_table.texture_start;
+
+ brw_mark_surface_used(
+ bld.shader->stage_prog_data,
+ wm_prog_data->binding_table.render_target_read_start + target);
+
+ /* Calculate the fragment coordinates. */
+ const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
+ bld.MOV(offset(coords, bld, 0), pixel_x);
+ bld.MOV(offset(coords, bld, 1), pixel_y);
+ bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
+
+ /* Calculate the sample index and MCS payload when multisampling. Luckily
+ * the MCS fetch message behaves deterministically for UMS surfaces, so it
+ * shouldn't be necessary to recompile based on whether the framebuffer is
+ * CMS or UMS.
+ */
+ if (wm_key->multisample_fbo &&
+ nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
+ nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
+
+ const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
+ const fs_reg mcs = wm_key->multisample_fbo ?
+ emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
+
+ /* Use either a normal or a CMS texel fetch message depending on whether
+ * the framebuffer is single or multisample. On SKL+ use the wide CMS
+ * message just in case the framebuffer uses 16x multisampling, it should
+ * be equivalent to the normal CMS fetch for lower multisampling modes.
+ */
+ const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
+ devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
+ SHADER_OPCODE_TXF_CMS_LOGICAL;
+
+ /* Emit the instruction. */
+ const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(),
+ sample, mcs,
+ brw_imm_ud(surface), brw_imm_ud(0),
+ fs_reg(), brw_imm_ud(3), brw_imm_ud(0) };
+ STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS);
+
+ fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
+ inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
+
+ return inst;
+}
+
+/**
+ * Actual coherent framebuffer read implemented using the native render target
+ * read message. Requires SKL+.
+ */
+static fs_inst *
+emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
+{
+ assert(bld.shader->devinfo->gen >= 9);
+ fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
+ inst->target = target;
+ inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
+
+ return inst;
+}
+
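+/**
+ * Return the register already allocated for this fragment output if there is
+ * one; otherwise allocate a temporary of the given size and point all n
+ * entries of regs at it.
+ */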
+static fs_reg
+alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
+{
+ if (n && regs[0].file != BAD_FILE) {
+ return regs[0];
+
+ } else {
+ const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
+
+ for (unsigned i = 0; i < n; i++)
+ regs[i] = tmp;
+
+ return tmp;
+ }
+}
+
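+/**
+ * Return (allocating if necessary) the fs_visitor register that backs the
+ * fragment output with the given packed BRW_NIR_FRAG_OUTPUT location and
+ * index.
+ */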
+static fs_reg
+alloc_frag_output(fs_visitor *v, unsigned location)
+{
+ assert(v->stage == MESA_SHADER_FRAGMENT);
+ const brw_wm_prog_key *const key =
+ reinterpret_cast<const brw_wm_prog_key *>(v->key);
+ const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
+ const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
+
+ if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
+ return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
+
+ else if (l == FRAG_RESULT_COLOR)
+ return alloc_temporary(v->bld, 4, v->outputs,
+ MAX2(key->nr_color_regions, 1));
+
+ else if (l == FRAG_RESULT_DEPTH)
+ return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
+
+ else if (l == FRAG_RESULT_STENCIL)
+ return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
+
+ else if (l == FRAG_RESULT_SAMPLE_MASK)
+ return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
+
+ else if (l >= FRAG_RESULT_DATA0 &&
+ l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
+ return alloc_temporary(v->bld, 4,
+ &v->outputs[l - FRAG_RESULT_DATA0], 1);
+
+ else
+ unreachable("Invalid location");
+}
+
void
fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr)
{
assert(stage == MESA_SHADER_FRAGMENT);
- struct brw_wm_prog_data *wm_prog_data =
- (struct brw_wm_prog_data *) prog_data;
- const struct brw_wm_prog_key *wm_key = (const struct brw_wm_prog_key *) key;
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
break;
}
+ case nir_intrinsic_load_layer_id:
+ dest.type = BRW_REGISTER_TYPE_UD;
+ bld.MOV(dest, fetch_render_target_array_index(bld));
+ break;
+
case nir_intrinsic_load_helper_invocation:
case nir_intrinsic_load_sample_mask_in:
case nir_intrinsic_load_sample_id: {
break;
}
+ case nir_intrinsic_store_output: {
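+      /* Fragment outputs are staged in per-location temporaries handed out
+       * by alloc_frag_output(); the actual framebuffer writes happen at the
+       * end of the thread.
+       */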
+ const fs_reg src = get_nir_src(instr->src[0]);
+ const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+ assert(const_offset && "Indirect output stores not allowed");
+ const unsigned location = nir_intrinsic_base(instr) +
+ SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION);
+ const fs_reg new_dest = retype(alloc_frag_output(this, location),
+ src.type);
+
+ for (unsigned j = 0; j < instr->num_components; j++)
+ bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
+ offset(src, bld, j));
+
+ break;
+ }
+
+ case nir_intrinsic_load_output: {
+ const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
+ BRW_NIR_FRAG_OUTPUT_LOCATION);
+ assert(l >= FRAG_RESULT_DATA0);
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+ assert(const_offset && "Indirect output loads not allowed");
+ const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0];
+ const fs_reg tmp = bld.vgrf(dest.type, 4);
+
+ if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
+ emit_coherent_fb_read(bld, tmp, target);
+ else
+ emit_non_coherent_fb_read(bld, tmp, target);
+
+ for (unsigned j = 0; j < instr->num_components; j++) {
+ bld.MOV(offset(dest, bld, j),
+ offset(tmp, bld, nir_intrinsic_component(instr) + j));
+ }
+
+ break;
+ }
+
case nir_intrinsic_discard:
case nir_intrinsic_discard_if: {
/* We track our discarded pixels in f0.1. By predicating on it, we can
break;
}
- case nir_intrinsic_interp_var_at_centroid:
- case nir_intrinsic_interp_var_at_sample:
- case nir_intrinsic_interp_var_at_offset: {
- /* Handle ARB_gpu_shader5 interpolation intrinsics
- *
- * It's worth a quick word of explanation as to why we handle the full
- * variable-based interpolation intrinsic rather than a lowered version
- * with like we do for other inputs. We have to do that because the way
- * we set up inputs doesn't allow us to use the already setup inputs for
- * interpolation. At the beginning of the shader, we go through all of
- * the input variables and do the initial interpolation and put it in
- * the nir_inputs array based on its location as determined in
- * nir_lower_io. If the input isn't used, dead code cleans up and
- * everything works fine. However, when we get to the ARB_gpu_shader5
- * interpolation intrinsics, we need to reinterpolate the input
- * differently. If we used an intrinsic that just had an index it would
- * only give us the offset into the nir_inputs array. However, this is
- * useless because that value is post-interpolation and we need
- * pre-interpolation. In order to get the actual location of the bits
- * we get from the vertex fetching hardware, we need the variable.
- */
- wm_prog_data->pulls_bary = true;
+ case nir_intrinsic_load_input: {
+ /* load_input is only used for flat inputs */
+ unsigned base = nir_intrinsic_base(instr);
+ unsigned component = nir_intrinsic_component(instr);
+ unsigned num_components = instr->num_components;
+ enum brw_reg_type type = dest.type;
+
+ /* Special case fields in the VUE header */
+ if (base == VARYING_SLOT_LAYER)
+ component = 1;
+ else if (base == VARYING_SLOT_VIEWPORT)
+ component = 2;
+
+ if (nir_dest_bit_size(instr->dest) == 64) {
+         /* const_index is in 32-bit units and may not be 64-bit aligned for
+          * DF data. Read the double vector as if it were a float vector of
+          * twice the number of components to fetch the right data.
+ */
+ type = BRW_REGISTER_TYPE_F;
+ num_components *= 2;
+ }
- fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
- const glsl_interp_qualifier interpolation =
- (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation;
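+      /* Flat inputs don't need PLN interpolation: CINTERP simply copies the
+       * constant value, which the setup data holds at element 3 of each
+       * attribute slot.
+       */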
+ for (unsigned int i = 0; i < num_components; i++) {
+ struct brw_reg interp = interp_reg(base, component + i);
+ interp = suboffset(interp, 3);
+ bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
+ retype(fs_reg(interp), type));
+ }
- switch (instr->intrinsic) {
- case nir_intrinsic_interp_var_at_centroid:
- emit_pixel_interpolater_send(bld,
- FS_OPCODE_INTERPOLATE_AT_CENTROID,
- dst_xy,
- fs_reg(), /* src */
- brw_imm_ud(0u),
- interpolation);
- break;
+ if (nir_dest_bit_size(instr->dest) == 64) {
+ shuffle_32bit_load_result_to_64bit_data(bld,
+ dest,
+ retype(dest, type),
+ instr->num_components);
+ }
+ break;
+ }
- case nir_intrinsic_interp_var_at_sample: {
- if (!wm_key->multisample_fbo) {
- /* From the ARB_gpu_shader5 specification:
- * "If multisample buffers are not available, the input varying
- * will be evaluated at the center of the pixel."
- */
- emit_pixel_interpolater_send(bld,
- FS_OPCODE_INTERPOLATE_AT_CENTROID,
- dst_xy,
- fs_reg(), /* src */
- brw_imm_ud(0u),
- interpolation);
- break;
- }
+ case nir_intrinsic_load_barycentric_pixel:
+ case nir_intrinsic_load_barycentric_centroid:
+ case nir_intrinsic_load_barycentric_sample:
+ /* Do nothing - load_interpolated_input handling will handle it later. */
+ break;
- nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
+ case nir_intrinsic_load_barycentric_at_sample: {
+ const glsl_interp_mode interpolation =
+ (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
- if (const_sample) {
- unsigned msg_data = const_sample->i32[0] << 4;
+ nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
+ if (const_sample) {
+ unsigned msg_data = const_sample->i32[0] << 4;
+
+ emit_pixel_interpolater_send(bld,
+ FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+ dest,
+ fs_reg(), /* src */
+ brw_imm_ud(msg_data),
+ interpolation);
+ } else {
+ const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
+ BRW_REGISTER_TYPE_UD);
+
+ if (nir_src_is_dynamically_uniform(instr->src[0])) {
+ const fs_reg sample_id = bld.emit_uniformize(sample_src);
+ const fs_reg msg_data = vgrf(glsl_type::uint_type);
+ bld.exec_all().group(1, 0)
+ .SHL(msg_data, sample_id, brw_imm_ud(4u));
emit_pixel_interpolater_send(bld,
FS_OPCODE_INTERPOLATE_AT_SAMPLE,
- dst_xy,
+ dest,
fs_reg(), /* src */
- brw_imm_ud(msg_data),
+ msg_data,
interpolation);
} else {
- const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
- BRW_REGISTER_TYPE_UD);
-
- if (nir_src_is_dynamically_uniform(instr->src[0])) {
- const fs_reg sample_id = bld.emit_uniformize(sample_src);
- const fs_reg msg_data = vgrf(glsl_type::uint_type);
- bld.exec_all().group(1, 0)
- .SHL(msg_data, sample_id, brw_imm_ud(4u));
+ /* Make a loop that sends a message to the pixel interpolater
+ * for the sample number in each live channel. If there are
+ * multiple channels with the same sample number then these
+             * will be handled simultaneously with a single iteration of
+ * the loop.
+ */
+ bld.emit(BRW_OPCODE_DO);
+
+ /* Get the next live sample number into sample_id_reg */
+ const fs_reg sample_id = bld.emit_uniformize(sample_src);
+
+ /* Set the flag register so that we can perform the send
+ * message on all channels that have the same sample number
+ */
+ bld.CMP(bld.null_reg_ud(),
+ sample_src, sample_id,
+ BRW_CONDITIONAL_EQ);
+ const fs_reg msg_data = vgrf(glsl_type::uint_type);
+ bld.exec_all().group(1, 0)
+ .SHL(msg_data, sample_id, brw_imm_ud(4u));
+ fs_inst *inst =
emit_pixel_interpolater_send(bld,
FS_OPCODE_INTERPOLATE_AT_SAMPLE,
- dst_xy,
+ dest,
fs_reg(), /* src */
msg_data,
interpolation);
- } else {
- /* Make a loop that sends a message to the pixel interpolater
- * for the sample number in each live channel. If there are
- * multiple channels with the same sample number then these
- * will be handled simultaneously with a single interation of
- * the loop.
- */
- bld.emit(BRW_OPCODE_DO);
-
- /* Get the next live sample number into sample_id_reg */
- const fs_reg sample_id = bld.emit_uniformize(sample_src);
+ set_predicate(BRW_PREDICATE_NORMAL, inst);
- /* Set the flag register so that we can perform the send
- * message on all channels that have the same sample number
- */
- bld.CMP(bld.null_reg_ud(),
- sample_src, sample_id,
- BRW_CONDITIONAL_EQ);
- const fs_reg msg_data = vgrf(glsl_type::uint_type);
- bld.exec_all().group(1, 0)
- .SHL(msg_data, sample_id, brw_imm_ud(4u));
- fs_inst *inst =
- emit_pixel_interpolater_send(bld,
- FS_OPCODE_INTERPOLATE_AT_SAMPLE,
- dst_xy,
- fs_reg(), /* src */
- msg_data,
- interpolation);
- set_predicate(BRW_PREDICATE_NORMAL, inst);
-
- /* Continue the loop if there are any live channels left */
- set_predicate_inv(BRW_PREDICATE_NORMAL,
- true, /* inverse */
- bld.emit(BRW_OPCODE_WHILE));
- }
+ /* Continue the loop if there are any live channels left */
+ set_predicate_inv(BRW_PREDICATE_NORMAL,
+ true, /* inverse */
+ bld.emit(BRW_OPCODE_WHILE));
}
-
- break;
}
+ break;
+ }
- case nir_intrinsic_interp_var_at_offset: {
- nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
-
- const bool flip = !wm_key->render_to_fbo;
+ case nir_intrinsic_load_barycentric_at_offset: {
+ const glsl_interp_mode interpolation =
+ (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
- if (const_offset) {
- unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
- unsigned off_y = MIN2((int)(const_offset->f32[1] * 16 *
- (flip ? -1 : 1)), 7) & 0xf;
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
- emit_pixel_interpolater_send(bld,
- FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
- dst_xy,
- fs_reg(), /* src */
- brw_imm_ud(off_x | (off_y << 4)),
- interpolation);
- } else {
- fs_reg src = vgrf(glsl_type::ivec2_type);
- fs_reg offset_src = retype(get_nir_src(instr->src[0]),
- BRW_REGISTER_TYPE_F);
- for (int i = 0; i < 2; i++) {
- fs_reg temp = vgrf(glsl_type::float_type);
- bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
- fs_reg itemp = vgrf(glsl_type::int_type);
- /* float to int */
- bld.MOV(itemp, (i == 1 && flip) ? negate(temp) : temp);
-
- /* Clamp the upper end of the range to +7/16.
- * ARB_gpu_shader5 requires that we support a maximum offset
- * of +0.5, which isn't representable in a S0.4 value -- if
- * we didn't clamp it, we'd end up with -8/16, which is the
- * opposite of what the shader author wanted.
- *
- * This is legal due to ARB_gpu_shader5's quantization
- * rules:
- *
- * "Not all values of <offset> may be supported; x and y
- * offsets may be rounded to fixed-point values with the
- * number of fraction bits given by the
- * implementation-dependent constant
- * FRAGMENT_INTERPOLATION_OFFSET_BITS"
- */
- set_condmod(BRW_CONDITIONAL_L,
- bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
- }
+ if (const_offset) {
+ unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
+ unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
- const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
- emit_pixel_interpolater_send(bld,
- opcode,
- dst_xy,
- src,
- brw_imm_ud(0u),
- interpolation);
+ emit_pixel_interpolater_send(bld,
+ FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
+ dest,
+ fs_reg(), /* src */
+ brw_imm_ud(off_x | (off_y << 4)),
+ interpolation);
+ } else {
+ fs_reg src = vgrf(glsl_type::ivec2_type);
+ fs_reg offset_src = retype(get_nir_src(instr->src[0]),
+ BRW_REGISTER_TYPE_F);
+ for (int i = 0; i < 2; i++) {
+ fs_reg temp = vgrf(glsl_type::float_type);
+ bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
+ fs_reg itemp = vgrf(glsl_type::int_type);
+ /* float to int */
+ bld.MOV(itemp, temp);
+
+ /* Clamp the upper end of the range to +7/16.
+ * ARB_gpu_shader5 requires that we support a maximum offset
+ * of +0.5, which isn't representable in a S0.4 value -- if
+ * we didn't clamp it, we'd end up with -8/16, which is the
+ * opposite of what the shader author wanted.
+ *
+ * This is legal due to ARB_gpu_shader5's quantization
+ * rules:
+ *
+ * "Not all values of <offset> may be supported; x and y
+ * offsets may be rounded to fixed-point values with the
+ * number of fraction bits given by the
+ * implementation-dependent constant
+ * FRAGMENT_INTERPOLATION_OFFSET_BITS"
+ */
+ set_condmod(BRW_CONDITIONAL_L,
+ bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
}
- break;
+
+ const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
+ emit_pixel_interpolater_send(bld,
+ opcode,
+ dest,
+ src,
+ brw_imm_ud(0u),
+ interpolation);
}
+ break;
+ }
- default:
- unreachable("Invalid intrinsic");
+ case nir_intrinsic_load_interpolated_input: {
+ if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
+ emit_fragcoord_interpolation(dest);
+ break;
}
- for (unsigned j = 0; j < instr->num_components; j++) {
- fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
- src.type = dest.type;
+ assert(instr->src[0].ssa &&
+ instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
+ nir_intrinsic_instr *bary_intrinsic =
+ nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
+ nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
+ enum glsl_interp_mode interp_mode =
+ (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
+ fs_reg dst_xy;
+
+ if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
+ bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
+ /* Use the result of the PI message */
+ dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
+ } else {
+ /* Use the delta_xy values computed from the payload */
+ enum brw_barycentric_mode bary =
+ brw_barycentric_mode(interp_mode, bary_intrin);
- bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
- dest = offset(dest, bld, 1);
+ dst_xy = this->delta_xy[bary];
+ }
+
+ for (unsigned int i = 0; i < instr->num_components; i++) {
+ fs_reg interp =
+ fs_reg(interp_reg(nir_intrinsic_base(instr),
+ nir_intrinsic_component(instr) + i));
+ interp.type = BRW_REGISTER_TYPE_F;
+ dest.type = BRW_REGISTER_TYPE_F;
+
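+         /* On gen4-5 perspective interpolation works on attributes that
+          * were pre-divided by w, so multiply the linearly interpolated
+          * value back by the pixel's w to recover the perspective-correct
+          * result.
+          */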
+ if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
+ fs_reg tmp = vgrf(glsl_type::float_type);
+ bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
+ bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
+ } else {
+ bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
+ }
}
break;
}
+
default:
nir_emit_intrinsic(bld, instr);
break;
nir_intrinsic_instr *instr)
{
assert(stage == MESA_SHADER_COMPUTE);
- struct brw_cs_prog_data *cs_prog_data =
- (struct brw_cs_prog_data *) prog_data;
+ struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
switch (instr->intrinsic) {
case nir_intrinsic_atomic_counter_inc:
case nir_intrinsic_atomic_counter_dec:
- case nir_intrinsic_atomic_counter_read: {
+ case nir_intrinsic_atomic_counter_read:
+ case nir_intrinsic_atomic_counter_add:
+ case nir_intrinsic_atomic_counter_min:
+ case nir_intrinsic_atomic_counter_max:
+ case nir_intrinsic_atomic_counter_and:
+ case nir_intrinsic_atomic_counter_or:
+ case nir_intrinsic_atomic_counter_xor:
+ case nir_intrinsic_atomic_counter_exchange:
+ case nir_intrinsic_atomic_counter_comp_swap: {
+ if (stage == MESA_SHADER_FRAGMENT &&
+ instr->intrinsic != nir_intrinsic_atomic_counter_read)
+ brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+      /* Get some metadata from the atomic intrinsic. */
+ const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+
/* Get the arguments of the atomic intrinsic. */
const fs_reg offset = get_nir_src(instr->src[0]);
const unsigned surface = (stage_prog_data->binding_table.abo_start +
instr->const_index[0]);
+ const fs_reg src0 = (info->num_srcs >= 2
+ ? get_nir_src(instr->src[1]) : fs_reg());
+ const fs_reg src1 = (info->num_srcs >= 3
+ ? get_nir_src(instr->src[2]) : fs_reg());
fs_reg tmp;
+ assert(info->num_srcs <= 3);
+
/* Emit a surface read or atomic op. */
- switch (instr->intrinsic) {
- case nir_intrinsic_atomic_counter_read:
+ if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
- break;
-
- case nir_intrinsic_atomic_counter_inc:
- tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, fs_reg(),
- fs_reg(), 1, 1, BRW_AOP_INC);
- break;
-
- case nir_intrinsic_atomic_counter_dec:
- tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, fs_reg(),
- fs_reg(), 1, 1, BRW_AOP_PREDEC);
- break;
-
- default:
- unreachable("Unreachable");
+ } else {
+ tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0,
+ src1, 1, 1,
+ get_atomic_counter_op(instr->intrinsic));
}
/* Assign the result. */
case nir_intrinsic_image_atomic_comp_swap: {
using namespace image_access;
+ if (stage == MESA_SHADER_FRAGMENT &&
+ instr->intrinsic != nir_intrinsic_image_load)
+ brw_wm_prog_data(prog_data)->has_side_effects = true;
+
/* Get the referenced image variable and type. */
const nir_variable *var = instr->variables[0]->var;
const glsl_type *type = var->type->without_array();
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier: {
- const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width);
- bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
- ->regs_written = 2;
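+      /* The fence message and its commit writeback are independent of the
+       * dispatch width, so emit them through a fixed SIMD8 builder.
+       */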
+ const fs_builder ubld = bld.group(8, 0);
+ const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
+ ->size_written = 2 * REG_SIZE;
break;
}
case nir_intrinsic_shader_clock: {
/* We cannot do anything if there is an event, so ignore it for now */
- fs_reg shader_clock = get_timestamp(bld);
- const fs_reg srcs[] = { shader_clock.set_smear(0), shader_clock.set_smear(1) };
-
+ const fs_reg shader_clock = get_timestamp(bld);
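+      /* The timestamp payload delivers the 64-bit clock as two 32-bit
+       * halves in components 0 and 1; pack both into the destination.
+       */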
+ const fs_reg srcs[] = { component(shader_clock, 0),
+ component(shader_clock, 1) };
bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
break;
}
if (const_offset) {
/* Offsets are in bytes but they should always be multiples of 4 */
assert(const_offset->u32[0] % 4 == 0);
- src.reg_offset = const_offset->u32[0] / 4;
+ src.offset = const_offset->u32[0];
for (unsigned j = 0; j < instr->num_components; j++) {
bld.MOV(offset(dest, bld, j), offset(src, bld, j));
unsigned read_size = instr->const_index[1] -
(instr->num_components - 1) * type_sz(dest.type);
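+      /* Cherryview and Broxton cannot use 64-bit types in indirect moves,
+       * so read each 64-bit component as two 32-bit halves, using a second
+       * indirect address offset by 4 bytes for the high dword.
+       */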
+ fs_reg indirect_chv_high_32bit;
+ bool is_chv_bxt_64bit =
+ (devinfo->is_cherryview || devinfo->is_broxton) &&
+ type_sz(dest.type) == 8;
+ if (is_chv_bxt_64bit) {
+ indirect_chv_high_32bit = vgrf(glsl_type::uint_type);
+ /* Calculate indirect address to read high 32 bits */
+ bld.ADD(indirect_chv_high_32bit, indirect, brw_imm_ud(4));
+ }
+
for (unsigned j = 0; j < instr->num_components; j++) {
- bld.emit(SHADER_OPCODE_MOV_INDIRECT,
- offset(dest, bld, j), offset(src, bld, j),
- indirect, brw_imm_ud(read_size));
+ if (!is_chv_bxt_64bit) {
+ bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+ offset(dest, bld, j), offset(src, bld, j),
+ indirect, brw_imm_ud(read_size));
+ } else {
+ bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+ subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, 0),
+ offset(src, bld, j),
+ indirect, brw_imm_ud(read_size));
+
+ bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+ subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, 1),
+ offset(src, bld, j),
+ indirect_chv_high_32bit, brw_imm_ud(read_size));
+ }
}
}
break;
*/
brw_mark_surface_used(prog_data,
stage_prog_data->binding_table.ubo_start +
- nir->info.num_ubos - 1);
+ nir->info->num_ubos - 1);
}
- /* Number of 32-bit slots in the type */
- unsigned type_slots = MAX2(1, type_sz(dest.type) / 4);
-
nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
if (const_offset == NULL) {
fs_reg base_offset = retype(get_nir_src(instr->src[1]),
* we let CSE deal with duplicate loads. Here we see a vector access
* and we have to split it if necessary.
*/
- fs_reg packed_consts = vgrf(glsl_type::float_type);
- packed_consts.type = dest.type;
+ const unsigned type_size = type_sz(dest.type);
+ const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+ const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
+ const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
- unsigned const_offset_aligned = const_offset->u32[0] & ~15;
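+      /* Walk the components in block-aligned chunks. For example, a dvec4
+       * starting 48 bytes into a 64-byte block gets two doubles from the
+       * first load and the remaining two from the next one.
+       */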
+ for (unsigned c = 0; c < instr->num_components;) {
+ const unsigned base = const_offset->u32[0] + c * type_size;
+ /* Number of usable components in the next block-aligned load. */
+ const unsigned count = MIN2(instr->num_components - c,
+ (block_sz - base % block_sz) / type_size);
- /* A vec4 only contains half of a dvec4, if we need more than 2
- * components of a dvec4 we will have to issue another load for
- * components z and w.
- */
- int num_components;
- if (type_slots == 1)
- num_components = instr->num_components;
- else
- num_components = MIN2(2, instr->num_components);
+ ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+ packed_consts, surf_index,
+ brw_imm_ud(base & ~(block_sz - 1)));
- /* The computation of num_components doesn't take into account
- * misalignment, which should be okay according to std140 vector
- * alignment rules.
- */
- assert(const_offset->u32[0] % 16 +
- type_sz(dest.type) * num_components <= 16);
-
- int remaining_components = instr->num_components;
- while (remaining_components > 0) {
- /* Read the vec4 from a 16-byte aligned offset */
- struct brw_reg const_offset_reg = brw_imm_ud(const_offset_aligned);
- bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
- retype(packed_consts, BRW_REGISTER_TYPE_F),
- surf_index, const_offset_reg);
-
- const fs_reg consts = byte_offset(packed_consts, (const_offset->u32[0] % 16));
- unsigned dest_offset = instr->num_components - remaining_components;
-
- /* XXX: This doesn't update the sub-16B offset across iterations of
- * the loop, which should work for std140 vector alignment rules.
- */
- assert(dest_offset == 0 || const_offset->u32[0] % 16 == 0);
+ const fs_reg consts =
+ retype(byte_offset(packed_consts, base & (block_sz - 1)),
+ dest.type);
- for (int i = 0; i < num_components; i++)
- bld.MOV(offset(dest, bld, i + dest_offset), component(consts, i));
+ for (unsigned d = 0; d < count; d++)
+ bld.MOV(offset(dest, bld, c + d), component(consts, d));
- /* If this is a large enough 64-bit load, we will need to emit
- * another message
- */
- remaining_components -= num_components;
- assert(remaining_components == 0 ||
- (remaining_components <= 2 && type_slots == 2));
- num_components = remaining_components;
- const_offset_aligned += 16;
+ c += count;
}
}
break;
*/
brw_mark_surface_used(prog_data,
stage_prog_data->binding_table.ssbo_start +
- nir->info.num_ssbos - 1);
+ nir->info->num_ssbos - 1);
}
fs_reg offset_reg;
break;
}
- case nir_intrinsic_load_input: {
- fs_reg src;
- if (stage == MESA_SHADER_VERTEX) {
- src = fs_reg(ATTR, instr->const_index[0], dest.type);
- } else {
- src = offset(retype(nir_inputs, dest.type), bld,
- instr->const_index[0]);
- }
-
- nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
- assert(const_offset && "Indirect input loads not allowed");
- src = offset(src, bld, const_offset->u32[0]);
-
- for (unsigned j = 0; j < instr->num_components; j++) {
- bld.MOV(offset(dest, bld, j), offset(src, bld, j));
- }
- break;
- }
-
case nir_intrinsic_store_ssbo: {
assert(devinfo->gen >= 7);
+ if (stage == MESA_SHADER_FRAGMENT)
+ brw_wm_prog_data(prog_data)->has_side_effects = true;
+
/* Block index */
fs_reg surf_index;
nir_const_value *const_uniform_block =
brw_mark_surface_used(prog_data,
stage_prog_data->binding_table.ssbo_start +
- nir->info.num_ssbos - 1);
+ nir->info->num_ssbos - 1);
}
/* Value */
case nir_intrinsic_store_output: {
fs_reg src = get_nir_src(instr->src[0]);
- fs_reg new_dest = offset(retype(nir_outputs, src.type), bld,
- instr->const_index[0]);
nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
assert(const_offset && "Indirect output stores not allowed");
- new_dest = offset(new_dest, bld, const_offset->u32[0]);
+ fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
+ 4 * const_offset->u32[0]), src.type);
unsigned num_components = instr->num_components;
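+      /* The store may begin at a non-zero channel of the output slot;
+       * nir_intrinsic_component() gives that first channel.
+       */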
+ unsigned first_component = nir_intrinsic_component(instr);
unsigned bit_size = instr->src[0].is_ssa ?
instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
if (bit_size == 64) {
}
for (unsigned j = 0; j < num_components; j++) {
- bld.MOV(offset(new_dest, bld, j), offset(src, bld, j));
+ bld.MOV(offset(new_dest, bld, j + first_component),
+ offset(src, bld, j));
}
break;
}
case nir_intrinsic_get_buffer_size: {
nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
- int reg_width = dispatch_width / 8;
- /* Set LOD = 0 */
- fs_reg source = brw_imm_d(0);
+      /* A resinfo's sampler message is used to get the buffer size. The
+       * SIMD8 writeback message consists of four registers and the SIMD16
+       * writeback message consists of eight destination registers (two per
+       * component). Because we are only interested in the first channel of
+       * the first returned component, where resinfo returns the buffer size
+       * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
+       * the dispatch width.
+       */
+ const fs_builder ubld = bld.exec_all().group(8, 0);
+ fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+ fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
- int mlen = 1 * reg_width;
+ /* Set LOD = 0 */
+ ubld.MOV(src_payload, brw_imm_d(0));
- /* A resinfo's sampler message is used to get the buffer size.
- * The SIMD8's writeback message consists of four registers and
- * SIMD16's writeback message consists of 8 destination registers
- * (two per each component), although we are only interested on the
- * first component, where resinfo returns the buffer size for
- * SURFTYPE_BUFFER.
- */
- int regs_written = 4 * mlen;
- fs_reg src_payload = fs_reg(VGRF, alloc.allocate(mlen),
- BRW_REGISTER_TYPE_UD);
- bld.LOAD_PAYLOAD(src_payload, &source, 1, 0);
- fs_reg buffer_size = fs_reg(VGRF, alloc.allocate(regs_written),
- BRW_REGISTER_TYPE_UD);
const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
- fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, buffer_size,
- src_payload, brw_imm_ud(index));
+ fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload,
+ src_payload, brw_imm_ud(index));
inst->header_size = 0;
- inst->mlen = mlen;
- inst->regs_written = regs_written;
- bld.emit(inst);
- bld.MOV(retype(dest, buffer_size.type), buffer_size);
+ inst->mlen = 1;
+ inst->size_written = 4 * REG_SIZE;
+ bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
brw_mark_surface_used(prog_data, index);
break;
}
+ case nir_intrinsic_load_channel_num: {
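+      /* Build the channel index (0..dispatch_width-1) into each slot: the
+       * immediate vector 0x76543210 yields 0-7 for the first eight channels
+       * and the remaining groups of eight are derived by adding 8 and 16 to
+       * that base.
+       */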
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
+ dest = retype(dest, BRW_REGISTER_TYPE_UD);
+ const fs_builder allbld8 = bld.group(8, 0).exec_all();
+ allbld8.MOV(tmp, brw_imm_v(0x76543210));
+ if (dispatch_width > 8)
+ allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
+ if (dispatch_width > 16) {
+ const fs_builder allbld16 = bld.group(16, 0).exec_all();
+ allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
+ }
+ bld.MOV(dest, tmp);
+ break;
+ }
+
default:
unreachable("unknown intrinsic");
}
fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
int op, nir_intrinsic_instr *instr)
{
+ if (stage == MESA_SHADER_FRAGMENT)
+ brw_wm_prog_data(prog_data)->has_side_effects = true;
+
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_dest(instr->dest);
*/
brw_mark_surface_used(prog_data,
stage_prog_data->binding_table.ssbo_start +
- nir->info.num_ssbos - 1);
+ nir->info->num_ssbos - 1);
}
fs_reg offset = get_nir_src(instr->src[1]);
if (op == BRW_AOP_CMPWR)
data2 = get_nir_src(instr->src[3]);
- /* Emit the actual atomic operation operation */
+ /* Emit the actual atomic operation */
fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
data1, data2,
dest = get_nir_dest(instr->dest);
fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
- fs_reg offset = get_nir_src(instr->src[0]);
+ fs_reg offset;
fs_reg data1 = get_nir_src(instr->src[1]);
fs_reg data2;
if (op == BRW_AOP_CMPWR)
data2 = get_nir_src(instr->src[2]);
+   /* Get the offset: fold a constant offset source into the immediate,
+    * otherwise add it to the base at run time.
+    */
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+ if (const_offset) {
+ offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
+ } else {
+ offset = vgrf(glsl_type::uint_type);
+ bld.ADD(offset,
+ retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(instr->const_index[0]));
+ }
+
-   /* Emit the actual atomic operation operation */
+   /* Emit the actual atomic operation */
fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
+ uint32_t header_bits = 0;
for (unsigned i = 0; i < instr->num_srcs; i++) {
fs_reg src = get_nir_src(instr->src[i].src);
switch (instr->src[i].src_type) {
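+      /* get_nir_src_imm() yields an immediate when the source is constant,
+       * so constant bias/LOD values avoid a round-trip through a VGRF.
+       */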
case nir_tex_src_bias:
- srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
+ srcs[TEX_LOGICAL_SRC_LOD] =
+ retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
break;
- case nir_tex_src_comparitor:
+ case nir_tex_src_comparator:
srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
break;
case nir_tex_src_coord:
case nir_tex_src_lod:
switch (instr->op) {
case nir_texop_txs:
- srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_UD);
+ srcs[TEX_LOGICAL_SRC_LOD] =
+ retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
break;
case nir_texop_txf:
- srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_D);
+ srcs[TEX_LOGICAL_SRC_LOD] =
+ retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
break;
default:
- srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
+ srcs[TEX_LOGICAL_SRC_LOD] =
+ retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
break;
}
break;
case nir_tex_src_offset: {
nir_const_value *const_offset =
nir_src_as_const_value(instr->src[i].src);
- if (const_offset) {
- unsigned header_bits = brw_texture_offset(const_offset->i32, 3);
- if (header_bits != 0)
- srcs[TEX_LOGICAL_SRC_OFFSET_VALUE] = brw_imm_ud(header_bits);
+ unsigned offset_bits = 0;
+ if (const_offset &&
+ brw_texture_offset(const_offset->i32,
+ nir_tex_instr_src_size(instr, i),
+ &offset_bits)) {
+ header_bits |= offset_bits;
} else {
- srcs[TEX_LOGICAL_SRC_OFFSET_VALUE] =
+ srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
retype(src, BRW_REGISTER_TYPE_D);
}
break;
srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
break;
+ case nir_tex_src_plane: {
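+      /* Multi-plane (e.g. YUV) surfaces occupy extra binding table slots;
+       * remap the texture index to the requested plane's slot.
+       */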
+ nir_const_value *const_plane =
+ nir_src_as_const_value(instr->src[i].src);
+ const uint32_t plane = const_plane->u32[0];
+ const uint32_t texture_index =
+ instr->texture_index +
+ stage_prog_data->binding_table.plane_start[plane] -
+ stage_prog_data->binding_table.texture_start;
+
+ srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
+ break;
+ }
+
default:
unreachable("unknown texture source");
}
srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
- if (instr->op == nir_texop_query_levels) {
- /* textureQueryLevels() is implemented in terms of TXS so we need to
- * pass a valid LOD argument.
+ if (instr->op == nir_texop_query_levels ||
+ (instr->op == nir_texop_tex && stage != MESA_SHADER_FRAGMENT)) {
+ /* textureQueryLevels() and texture() are implemented in terms of TXS
+ * and TXL respectively, so we need to pass a valid LOD argument.
*/
assert(srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE);
srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u);
enum opcode opcode;
switch (instr->op) {
case nir_texop_tex:
- opcode = SHADER_OPCODE_TEX_LOGICAL;
+ opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
+ SHADER_OPCODE_TXL_LOGICAL);
break;
case nir_texop_txb:
opcode = FS_OPCODE_TXB_LOGICAL;
opcode = SHADER_OPCODE_LOD_LOGICAL;
break;
case nir_texop_tg4:
- if (srcs[TEX_LOGICAL_SRC_OFFSET_VALUE].file != BAD_FILE &&
- srcs[TEX_LOGICAL_SRC_OFFSET_VALUE].file != IMM)
+ if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
else
opcode = SHADER_OPCODE_TG4_LOGICAL;
break;
- case nir_texop_texture_samples: {
- fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
-
- fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D, 4);
- fs_inst *inst = bld.emit(SHADER_OPCODE_SAMPLEINFO, tmp,
- bld.vgrf(BRW_REGISTER_TYPE_D, 1),
- srcs[TEX_LOGICAL_SRC_SURFACE],
- srcs[TEX_LOGICAL_SRC_SURFACE]);
- inst->mlen = 1;
- inst->header_size = 1;
- inst->base_mrf = -1;
- inst->regs_written = 4 * (dispatch_width / 8);
-
- /* Pick off the one component we care about */
- bld.MOV(dst, tmp);
- return;
- }
+ case nir_texop_texture_samples:
+ opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
+ break;
case nir_texop_samples_identical: {
fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
unreachable("unknown texture opcode");
}
+ if (instr->op == nir_texop_tg4) {
+ if (instr->component == 1 &&
+ key_tex->gather_channel_quirk_mask & (1 << texture)) {
+ /* gather4 sampler is broken for green channel on RG32F --
+ * we must ask for blue instead.
+ */
+ header_bits |= 2 << 16;
+ } else {
+ header_bits |= instr->component << 16;
+ }
+ }
+
fs_reg dst = bld.vgrf(brw_type_for_nir_type(instr->dest_type), 4);
fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
+ inst->offset = header_bits;
const unsigned dest_size = nir_tex_instr_dest_size(instr);
if (devinfo->gen >= 9 &&
nir_ssa_def_components_read(&instr->dest.ssa):
(1 << dest_size) - 1;
assert(write_mask != 0); /* dead code should have been eliminated */
- inst->regs_written = _mesa_fls(write_mask) * dispatch_width / 8;
+ inst->size_written = util_last_bit(write_mask) *
+ inst->dst.component_size(inst->exec_size);
} else {
- inst->regs_written = 4 * dispatch_width / 8;
+ inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
}
if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
inst->shadow_compare = true;
- if (srcs[TEX_LOGICAL_SRC_OFFSET_VALUE].file == IMM)
- inst->offset = srcs[TEX_LOGICAL_SRC_OFFSET_VALUE].ud;
-
- if (instr->op == nir_texop_tg4) {
- if (instr->component == 1 &&
- key_tex->gather_channel_quirk_mask & (1 << texture)) {
- /* gather4 sampler is broken for green channel on RG32F --
- * we must ask for blue instead.
- */
- inst->offset |= 2 << 16;
- } else {
- inst->offset |= instr->component << 16;
- }
-
- if (devinfo->gen == 6)
- emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
- }
+ if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
+ emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
fs_reg nir_dest[4];
for (unsigned i = 0; i < dest_size; i++)
nir_dest[i] = offset(dst, bld, i);
- bool is_cube_array = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
- instr->is_array;
-
if (instr->op == nir_texop_query_levels) {
/* # levels is in .w */
nir_dest[0] = offset(dst, bld, 3);
- } else if (instr->op == nir_texop_txs && dest_size >= 3 &&
- (devinfo->gen < 7 || is_cube_array)) {
+ } else if (instr->op == nir_texop_txs &&
+ dest_size >= 3 && devinfo->gen < 7) {
+ /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
fs_reg depth = offset(dst, bld, 2);
- fs_reg fixed_depth = vgrf(glsl_type::int_type);
-
- if (is_cube_array) {
- /* fixup #layers for cube map arrays */
- bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, brw_imm_d(6));
- } else if (devinfo->gen < 7) {
- /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
- bld.emit_minmax(fixed_depth, depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
- }
-
- nir_dest[2] = fixed_depth;
+ nir_dest[2] = vgrf(glsl_type::int_type);
+ bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
}
bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
assert(type_sz(src.type) == 8);
assert(type_sz(dst.type) == 4);
- assert(!src.in_range(dst, 2 * components * bld.dispatch_width() / 8));
+ assert(!regions_overlap(
+ dst, 2 * components * dst.component_size(bld.dispatch_width()),
+ src, components * src.component_size(bld.dispatch_width())));
for (unsigned i = 0; i < components; i++) {
const fs_reg component_i = offset(src, bld, i);
bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1));
}
}
+
+fs_reg
+setup_imm_df(const fs_builder &bld, double v)
+{
+ const struct gen_device_info *devinfo = bld.shader->devinfo;
+ assert(devinfo->gen >= 7);
+
+ if (devinfo->gen >= 8)
+ return brw_imm_df(v);
+
+   /* gen7.5 does not support DF immediates directly, but the DIM
+    * instruction allows setting the 64-bit immediate value.
+    */
+ if (devinfo->is_haswell) {
+ const fs_builder ubld = bld.exec_all().group(1, 0);
+ fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
+ ubld.DIM(dst, brw_imm_df(v));
+ return component(dst, 0);
+ }
+
+   /* gen7 does not support DF immediates, so we generate a 64-bit constant
+    * by writing the low 32 bits of the constant to suboffset 0 of a VGRF,
+    * the high 32 bits to suboffset 4, and then applying a stride of 0.
+    *
+    * Alternatively, we could produce a normal VGRF (without stride 0) by
+    * writing to all the channels in the VGRF. However, that would hit the
+    * gen7 bug where writes that span more than one register have to be
+    * split into instructions of width 4; otherwise the write to the second
+    * register runs into an execmask hardware bug, which isn't very nice.
+ */
+ union {
+ double d;
+ struct {
+ uint32_t i1;
+ uint32_t i2;
+ };
+ } di;
+
+ di.d = v;
+
+ const fs_builder ubld = bld.exec_all().group(1, 0);
+ const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ ubld.MOV(tmp, brw_imm_ud(di.i1));
+ ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
+
+ return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
+}
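+
+/* A hypothetical usage sketch (names illustrative, not from this patch):
+ * materialize a DF constant once and feed it as a regular ALU source:
+ *
+ *    const fs_reg half = setup_imm_df(bld, 0.5);
+ *    bld.MUL(dst, src, half);
+ */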