i965/vec4: Simplify opt_reduce_swizzle() using the swizzle utils.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_nir.cpp
index e88c71bc0e87cc3bf8002880b7fc6325707c96b4..69f296ce8c041de12f5e4a9f099e24dea8a11e30 100644 (file)
@@ -25,6 +25,7 @@
 #include "glsl/ir_optimization.h"
 #include "glsl/nir/glsl_to_nir.h"
 #include "brw_fs.h"
+#include "brw_nir.h"
 
 static void
 nir_optimize(nir_shader *nir)
@@ -52,6 +53,8 @@ nir_optimize(nir_shader *nir)
       nir_validate_shader(nir);
       progress |= nir_opt_constant_folding(nir);
       nir_validate_shader(nir);
+      progress |= nir_opt_remove_phis(nir);
+      nir_validate_shader(nir);
    } while (progress);
 }
 
@@ -80,9 +83,12 @@ count_nir_instrs(nir_shader *nir)
 void
 fs_visitor::emit_nir_code()
 {
+   const nir_shader_compiler_options *options =
+      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
+
    /* first, lower the GLSL IR shader to NIR */
    lower_output_reads(shader->base.ir);
-   nir_shader *nir = glsl_to_nir(shader->base.ir, NULL, true);
+   nir_shader *nir = glsl_to_nir(&shader->base, options);
    nir_validate_shader(nir);
 
    nir_lower_global_vars_to_local(nir);
@@ -97,10 +103,16 @@ fs_visitor::emit_nir_code()
    nir_lower_var_copies(nir);
    nir_validate_shader(nir);
 
-   nir_lower_io(nir);
-   nir_validate_shader(nir);
+   /* Get rid of split copies */
+   nir_optimize(nir);
 
-   nir_lower_locals_to_regs(nir);
+   nir_assign_var_locations_scalar_direct_first(nir, &nir->uniforms,
+                                                &num_direct_uniforms,
+                                                &nir->num_uniforms);
+   nir_assign_var_locations_scalar(&nir->inputs, &nir->num_inputs);
+   nir_assign_var_locations_scalar(&nir->outputs, &nir->num_outputs);
+
+   nir_lower_io(nir);
    nir_validate_shader(nir);
 
    nir_remove_dead_variables(nir);
@@ -117,13 +129,16 @@ fs_visitor::emit_nir_code()
 
    nir_optimize(nir);
 
+   nir_lower_locals_to_regs(nir);
+   nir_validate_shader(nir);
+
    nir_lower_to_source_mods(nir);
    nir_validate_shader(nir);
    nir_copy_prop(nir);
    nir_validate_shader(nir);
 
-   if (INTEL_DEBUG & DEBUG_WM) {
-      fprintf(stderr, "NIR (SSA form) for fragment shader:\n");
+   if (unlikely(debug_enabled)) {
+      fprintf(stderr, "NIR (SSA form) for %s shader:\n", stage_name);
       nir_print_shader(nir, stderr);
    }
 
@@ -133,14 +148,21 @@ fs_visitor::emit_nir_code()
                      MESA_DEBUG_SOURCE_SHADER_COMPILER,
                      MESA_DEBUG_TYPE_OTHER,
                      MESA_DEBUG_SEVERITY_NOTIFICATION,
-                     "FS NIR shader: %d inst\n",
+                     "%s NIR shader: %d inst\n",
+                     stage_abbrev,
                      count_nir_instrs(nir));
    }
 
    nir_convert_from_ssa(nir);
    nir_validate_shader(nir);
-   nir_lower_vec_to_movs(nir);
-   nir_validate_shader(nir);
+
+   /* This is the last pass we run before we start emitting stuff.  It
+    * determines when we need to insert boolean resolves on Gen <= 5.  We
+    * run it last because it stashes data in instr->pass_flags and we don't
+    * want that to be squashed by other NIR passes.
+    */
+   if (brw->gen <= 5)
+      brw_nir_analyze_boolean_resolves(nir);
 
    /* emit the arrays used for inputs and outputs - load/store intrinsics will
     * be converted to reads/writes of these arrays
@@ -157,7 +179,6 @@ fs_visitor::emit_nir_code()
    }
 
    if (nir->num_uniforms > 0) {
-      nir_uniforms = fs_reg(UNIFORM, 0);
       nir_setup_uniforms(nir);
    }
 
@@ -178,8 +199,8 @@ fs_visitor::emit_nir_code()
       nir_emit_impl(overload->impl);
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
-      fprintf(stderr, "NIR (final form) for fragment shader:\n");
+   if (unlikely(debug_enabled)) {
+      fprintf(stderr, "NIR (final form) for %s shader:\n", stage_name);
       nir_print_shader(nir, stderr);
    }
 
@@ -189,24 +210,49 @@ fs_visitor::emit_nir_code()
 void
 fs_visitor::nir_setup_inputs(nir_shader *shader)
 {
-   struct hash_entry *entry;
-   hash_table_foreach(shader->inputs, entry) {
-      nir_variable *var = (nir_variable *) entry->data;
-      fs_reg varying = offset(nir_inputs, var->data.driver_location);
+   foreach_list_typed(nir_variable, var, node, &shader->inputs) {
+      enum brw_reg_type type = brw_type_for_base_type(var->type);
+      fs_reg input = offset(nir_inputs, var->data.driver_location);
 
       fs_reg reg;
-      if (!strcmp(var->name, "gl_FragCoord")) {
-         reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
-                                             var->data.origin_upper_left);
-         emit_percomp(MOV(varying, reg), 0xF);
-      } else if (!strcmp(var->name, "gl_FrontFacing")) {
-         reg = *emit_frontfacing_interpolation();
-         emit(MOV(retype(varying, BRW_REGISTER_TYPE_UD), reg));
-      } else {
-         emit_general_interpolation(varying, var->name, var->type,
-                                    (glsl_interp_qualifier) var->data.interpolation,
-                                    var->data.location, var->data.centroid,
-                                    var->data.sample);
+      switch (stage) {
+      case MESA_SHADER_VERTEX: {
+         /* Our ATTR file is indexed by VERT_ATTRIB_*, which is the value
+          * stored in nir_variable::data.location.
+          *
+          * However, NIR's load_input intrinsics use a different index - an
+          * offset into a single contiguous array containing all inputs.
+          * This index is stored in nir_variable::data.driver_location.
+          *
+          * So, we need to copy from fs_reg(ATTR, var->data.location) to
+          * offset(nir_inputs, var->data.driver_location).
+          */
+         unsigned components = var->type->without_array()->components();
+         unsigned array_length = var->type->is_array() ? var->type->length : 1;
+         for (unsigned i = 0; i < array_length; i++) {
+            for (unsigned j = 0; j < components; j++) {
+               emit(MOV(retype(offset(input, components * i + j), type),
+                        offset(fs_reg(ATTR, var->data.location + i, type), j)));
+            }
+         }
+         break;
+      }
+      case MESA_SHADER_GEOMETRY:
+      case MESA_SHADER_COMPUTE:
+         unreachable("fs_visitor not used for these stages yet.");
+         break;
+      case MESA_SHADER_FRAGMENT:
+         if (var->data.location == VARYING_SLOT_POS) {
+            reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
+                                                var->data.origin_upper_left);
+            emit_percomp(MOV(input, reg), 0xF);
+         } else {
+            emit_general_interpolation(input, var->name, var->type,
+                                       (glsl_interp_qualifier) var->data.interpolation,
+                                       var->data.location, var->data.centroid,
+                                       var->data.sample);
+         }
+         break;
       }
    }
 }
@@ -216,12 +262,20 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
 {
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
-   struct hash_entry *entry;
-   hash_table_foreach(shader->outputs, entry) {
-      nir_variable *var = (nir_variable *) entry->data;
+   foreach_list_typed(nir_variable, var, node, &shader->outputs) {
       fs_reg reg = offset(nir_outputs, var->data.driver_location);
 
-      if (var->data.index > 0) {
+      int vector_elements =
+         var->type->is_array() ? var->type->fields.array->vector_elements
+                               : var->type->vector_elements;
+
+      if (stage == MESA_SHADER_VERTEX) {
+         for (int i = 0; i < ALIGN(type_size(var->type), 4) / 4; i++) {
+            int output = var->data.location + i;
+            this->outputs[output] = offset(reg, 4 * i);
+            this->output_components[output] = vector_elements;
+         }
+      } else if (var->data.index > 0) {
          assert(var->data.location == FRAG_RESULT_DATA0);
          assert(var->data.index == 1);
          this->dual_src_output = reg;
@@ -241,10 +295,6 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
          assert(var->data.location >= FRAG_RESULT_DATA0 &&
                 var->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
 
-         int vector_elements =
-            var->type->is_array() ? var->type->fields.array->vector_elements
-                                  : var->type->vector_elements;
-
          /* General color output. */
          for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) {
             int output = var->data.location - FRAG_RESULT_DATA0 + i;
@@ -259,15 +309,18 @@ void
 fs_visitor::nir_setup_uniforms(nir_shader *shader)
 {
    uniforms = shader->num_uniforms;
-   param_size[0] = shader->num_uniforms;
+
+   /* We split the uniform register file in two.  The first part is
+    * entirely direct uniforms.  The second part is indirect.
+    */
+   param_size[0] = num_direct_uniforms;
+   if (shader->num_uniforms > num_direct_uniforms)
+      param_size[num_direct_uniforms] = shader->num_uniforms - num_direct_uniforms;
 
    if (dispatch_width != 8)
       return;
 
-   struct hash_entry *entry;
-   hash_table_foreach(shader->uniforms, entry) {
-      nir_variable *var = (nir_variable *) entry->data;
-
+   foreach_list_typed(nir_variable, var, node, &shader->uniforms) {
       /* UBO's and atomics don't take up space in the uniform file */
 
       if (var->interface_type != NULL || var->type->contains_atomic())
@@ -359,6 +412,30 @@ emit_system_values_block(nir_block *block, void *void_visitor)
 
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
       switch (intrin->intrinsic) {
+      case nir_intrinsic_load_vertex_id:
+         unreachable("should be lowered by lower_vertex_id().");
+
+      case nir_intrinsic_load_vertex_id_zero_base:
+         assert(v->stage == MESA_SHADER_VERTEX);
+         reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
+         break;
+
+      case nir_intrinsic_load_base_vertex:
+         assert(v->stage == MESA_SHADER_VERTEX);
+         reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
+         break;
+
+      case nir_intrinsic_load_instance_id:
+         assert(v->stage == MESA_SHADER_VERTEX);
+         reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
+         break;
+
       case nir_intrinsic_load_sample_pos:
          assert(v->stage == MESA_SHADER_FRAGMENT);
          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
@@ -418,6 +495,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
 void
 fs_visitor::nir_emit_cf_list(exec_list *list)
 {
+   exec_list_validate(list);
    foreach_list_typed(nir_cf_node, node, node, list) {
       switch (node->type) {
       case nir_cf_node_if:
@@ -444,7 +522,7 @@ fs_visitor::nir_emit_if(nir_if *if_stmt)
    /* first, put the condition into f0 */
    fs_inst *inst = emit(MOV(reg_null_d,
                             retype(get_nir_src(if_stmt->condition),
-                                   BRW_REGISTER_TYPE_UD)));
+                                   BRW_REGISTER_TYPE_D)));
    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 
    emit(IF(BRW_PREDICATE_NORMAL));
@@ -520,9 +598,9 @@ static brw_reg_type
 brw_type_for_nir_type(nir_alu_type type)
 {
    switch (type) {
-   case nir_type_bool:
    case nir_type_unsigned:
       return BRW_REGISTER_TYPE_UD;
+   case nir_type_bool:
    case nir_type_int:
       return BRW_REGISTER_TYPE_D;
    case nir_type_float:
@@ -534,40 +612,187 @@ brw_type_for_nir_type(nir_alu_type type)
    return BRW_REGISTER_TYPE_F;
 }
 
+bool
+fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
+                                         const fs_reg &result)
+{
+   if (instr->src[0].src.is_ssa ||
+       !instr->src[0].src.reg.reg ||
+       !instr->src[0].src.reg.reg->parent_instr)
+      return false;
+
+   if (instr->src[0].src.reg.reg->parent_instr->type !=
+       nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *src0 =
+      nir_instr_as_intrinsic(instr->src[0].src.reg.reg->parent_instr);
+
+   if (src0->intrinsic != nir_intrinsic_load_front_face)
+      return false;
+
+   nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
+   if (!value1 || fabsf(value1->f[0]) != 1.0f)
+      return false;
+
+   nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
+   if (!value2 || fabsf(value2->f[0]) != 1.0f)
+      return false;
+
+   fs_reg tmp = vgrf(glsl_type::int_type);
+
+   if (brw->gen >= 6) {
+      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
+      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
+       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
+       *
+       * This negation looks like it's safe in practice, because bits 0:4 will
+       * surely be TRIANGLES
+       */
+
+      if (value1->f[0] == -1.0f) {
+         g0.negate = true;
+      }
+
+      tmp.type = BRW_REGISTER_TYPE_W;
+      tmp.subreg_offset = 2;
+      tmp.stride = 2;
+
+      fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
+      or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
+
+      tmp.type = BRW_REGISTER_TYPE_D;
+      tmp.subreg_offset = 0;
+      tmp.stride = 1;
+   } else {
+      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
+      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
+       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
+       *
+       * This negation looks like it's safe in practice, because bits 0:4 will
+       * surely be TRIANGLES
+       */
+
+      if (value1->f[0] == -1.0f) {
+         g1_6.negate = true;
+      }
+
+      emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
+   }
+   emit(AND(retype(result, BRW_REGISTER_TYPE_D), tmp, fs_reg(0xbf800000)));
+
+   return true;
+}
+
 void
 fs_visitor::nir_emit_alu(nir_alu_instr *instr)
 {
    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
    fs_inst *inst;
 
-   fs_reg op[3];
    fs_reg result = get_nir_dest(instr->dest.dest);
    result.type = brw_type_for_nir_type(nir_op_infos[instr->op].output_type);
 
-   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
-      op[i] = get_nir_alu_src(instr, i);
+   fs_reg op[4];
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+      op[i] = get_nir_src(instr->src[i].src);
+      op[i].type = brw_type_for_nir_type(nir_op_infos[instr->op].input_types[i]);
+      op[i].abs = instr->src[i].abs;
+      op[i].negate = instr->src[i].negate;
+   }
+
+   /* We get a bunch of movs out of the from_ssa pass and they may still
+    * be vectorized.  We'll handle them as a special case.  We'll also
+    * handle vecN here because it's basically the same thing.
+    */
+   switch (instr->op) {
+   case nir_op_imov:
+   case nir_op_fmov:
+   case nir_op_vec2:
+   case nir_op_vec3:
+   case nir_op_vec4: {
+      fs_reg temp = result;
+      bool need_extra_copy = false;
+      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+         if (!instr->src[i].src.is_ssa &&
+             instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
+            need_extra_copy = true;
+            temp = retype(vgrf(4), result.type);
+            break;
+         }
+      }
+
+      for (unsigned i = 0; i < 4; i++) {
+         if (!(instr->dest.write_mask & (1 << i)))
+            continue;
+
+         if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
+            inst = emit(MOV(offset(temp, i),
+                        offset(op[0], instr->src[0].swizzle[i])));
+         } else {
+            inst = emit(MOV(offset(temp, i),
+                        offset(op[i], instr->src[i].swizzle[0])));
+         }
+         inst->saturate = instr->dest.saturate;
+      }
+
+      /* In this case the source and destination registers were the same,
+       * so we need to insert an extra set of moves in order to deal with
+       * any swizzling.
+       */
+      if (need_extra_copy) {
+         for (unsigned i = 0; i < 4; i++) {
+            if (!(instr->dest.write_mask & (1 << i)))
+               continue;
 
+            emit(MOV(offset(result, i), offset(temp, i)));
+         }
+      }
+      return;
+   }
+   default:
+      break;
+   }
+
+   /* At this point, we have dealt with any instruction that operates on
+    * more than a single channel.  Therefore, we can just adjust the source
+    * and destination registers for that channel and emit the instruction.
+    */
+   unsigned channel = 0;
    if (nir_op_infos[instr->op].output_size == 0) {
-      /* We've already scalarized, so we know that we only have one
-       * channel.  The only question is which channel.
+      /* Since NIR is doing the scalarizing for us, we should only ever see
+       * vectorized operations with a single channel.
        */
       assert(_mesa_bitcount(instr->dest.write_mask) == 1);
-      unsigned off = ffs(instr->dest.write_mask) - 1;
-      result = offset(result, off);
+      channel = ffs(instr->dest.write_mask) - 1;
+
+      result = offset(result, channel);
+   }
 
-      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
-         op[i] = offset(op[i], off);
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
+      op[i] = offset(op[i], instr->src[i].swizzle[channel]);
    }
 
    switch (instr->op) {
-   case nir_op_fmov:
    case nir_op_i2f:
    case nir_op_u2f:
       inst = emit(MOV(result, op[0]));
       inst->saturate = instr->dest.saturate;
       break;
 
-   case nir_op_imov:
    case nir_op_f2i:
    case nir_op_f2u:
       emit(MOV(result, op[0]));
@@ -686,7 +911,29 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_imul: {
-      /* TODO put in the 16-bit constant optimization once we have SSA */
+      if (brw->gen >= 8) {
+         emit(MUL(result, op[0], op[1]));
+         break;
+      } else {
+         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
+         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
+
+         if (value0 && value0->u[0] < (1 << 16)) {
+            if (brw->gen < 7) {
+               emit(MUL(result, op[0], op[1]));
+            } else {
+               emit(MUL(result, op[1], op[0]));
+            }
+            break;
+         } else if (value1 && value1->u[0] < (1 << 16)) {
+            if (brw->gen < 7) {
+               emit(MUL(result, op[1], op[0]));
+            } else {
+               emit(MUL(result, op[0], op[1]));
+            }
+            break;
+         }
+      }
 
       if (brw->gen >= 7)
          no16("SIMD16 explicit accumulator operands unsupported\n");
@@ -767,15 +1014,30 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_inot:
+      if (brw->gen >= 8) {
+         resolve_source_modifiers(&op[0]);
+      }
       emit(NOT(result, op[0]));
       break;
    case nir_op_ixor:
+      if (brw->gen >= 8) {
+         resolve_source_modifiers(&op[0]);
+         resolve_source_modifiers(&op[1]);
+      }
       emit(XOR(result, op[0], op[1]));
       break;
    case nir_op_ior:
+      if (brw->gen >= 8) {
+         resolve_source_modifiers(&op[0]);
+         resolve_source_modifiers(&op[1]);
+      }
       emit(OR(result, op[0], op[1]));
       break;
    case nir_op_iand:
+      if (brw->gen >= 8) {
+         resolve_source_modifiers(&op[0]);
+         resolve_source_modifiers(&op[1]);
+      }
       emit(AND(result, op[0], op[1]));
       break;
 
@@ -820,11 +1082,6 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_fnoise4_4:
       unreachable("not reached: should be handled by lower_noise");
 
-   case nir_op_vec2:
-   case nir_op_vec3:
-   case nir_op_vec4:
-      unreachable("not reached: should be handled by lower_quadop_vector");
-
    case nir_op_ldexp:
       unreachable("not reached: should be handled by ldexp_to_arith()");
 
@@ -888,6 +1145,7 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       } else {
          emit(CMP(reg_null_d, op[0], op[1], BRW_CONDITIONAL_L));
          inst = emit(SEL(result, op[0], op[1]));
+         inst->predicate = BRW_PREDICATE_NORMAL;
       }
       inst->saturate = instr->dest.saturate;
       break;
@@ -901,6 +1159,7 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       } else {
          emit(CMP(reg_null_d, op[0], op[1], BRW_CONDITIONAL_GE));
          inst = emit(SEL(result, op[0], op[1]));
+         inst->predicate = BRW_PREDICATE_NORMAL;
       }
       inst->saturate = instr->dest.saturate;
       break;
@@ -927,7 +1186,7 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_fpow:
-      inst = emit(SHADER_OPCODE_POW, result, op[0], op[1]);
+      inst = emit_math(SHADER_OPCODE_POW, result, op[0], op[1]);
       inst->saturate = instr->dest.saturate;
       break;
 
@@ -990,15 +1249,19 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_ffma:
-      emit(MAD(result, op[2], op[1], op[0]));
+      inst = emit(MAD(result, op[2], op[1], op[0]));
+      inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_flrp:
-      /* TODO emulate for gen < 6 */
-      emit(LRP(result, op[2], op[1], op[0]));
+      inst = emit_lrp(result, op[0], op[1], op[2]);
+      inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_bcsel:
+      if (optimize_frontfacing_ternary(instr, result))
+         return;
+
       emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
       inst = emit(SEL(result, op[1], op[2]));
       inst->predicate = BRW_PREDICATE_NORMAL;
@@ -1007,6 +1270,17 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
    default:
       unreachable("unhandled instruction");
    }
+
+   /* If we need to do a boolean resolve, replace the result with -(x & 1)
+    * to sign extend the low bit to 0/~0
+    */
+   if (brw->gen <= 5 &&
+       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
+      fs_reg masked = vgrf(glsl_type::int_type);
+      emit(AND(masked, result, fs_reg(1)));
+      masked.negate = true;
+      emit(MOV(retype(result, BRW_REGISTER_TYPE_D), masked));
+   }
 }
 
 fs_reg
@@ -1044,46 +1318,6 @@ fs_visitor::get_nir_src(nir_src src)
    }
 }
 
-fs_reg
-fs_visitor::get_nir_alu_src(nir_alu_instr *instr, unsigned src)
-{
-   fs_reg reg = get_nir_src(instr->src[src].src);
-
-   reg.type = brw_type_for_nir_type(nir_op_infos[instr->op].input_types[src]);
-   reg.abs = instr->src[src].abs;
-   reg.negate = instr->src[src].negate;
-
-   bool needs_swizzle = false;
-   unsigned num_components = 0;
-   for (unsigned i = 0; i < 4; i++) {
-      if (!nir_alu_instr_channel_used(instr, src, i))
-         continue;
-
-      if (instr->src[src].swizzle[i] != i)
-         needs_swizzle = true;
-
-      num_components = i + 1;
-   }
-
-   if (needs_swizzle) {
-      /* resolve the swizzle through MOV's */
-      fs_reg new_reg = vgrf(num_components);
-      new_reg.type = reg.type;
-
-      for (unsigned i = 0; i < 4; i++) {
-         if (!nir_alu_instr_channel_used(instr, src, i))
-            continue;
-
-         emit(MOV(offset(new_reg, i),
-                  offset(reg, instr->src[src].swizzle[i])));
-      }
-
-      return new_reg;
-   }
-
-   return reg;
-}
-
 fs_reg
 fs_visitor::get_nir_dest(nir_dest dest)
 {
@@ -1130,32 +1364,28 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    bool has_indirect = false;
 
    switch (instr->intrinsic) {
-   case nir_intrinsic_discard: {
+   case nir_intrinsic_discard:
+   case nir_intrinsic_discard_if: {
       /* We track our discarded pixels in f0.1.  By predicating on it, we can
-       * update just the flag bits that aren't yet discarded.  By emitting a
-       * CMP of g0 != g0, all our currently executing channels will get turned
-       * off.
+       * update just the flag bits that aren't yet discarded.  If there's no
+       * condition, we emit a CMP of g0 != g0, so all currently executing
+       * channels will get turned off.
        */
-      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
-                                    BRW_REGISTER_TYPE_UW));
-      fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg,
-                              BRW_CONDITIONAL_NZ));
+      fs_inst *cmp;
+      if (instr->intrinsic == nir_intrinsic_discard_if) {
+         cmp = emit(CMP(reg_null_f, get_nir_src(instr->src[0]),
+                        fs_reg(0), BRW_CONDITIONAL_Z));
+      } else {
+         fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
+                                       BRW_REGISTER_TYPE_UW));
+         cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
+      }
       cmp->predicate = BRW_PREDICATE_NORMAL;
       cmp->flag_subreg = 1;
 
       if (brw->gen >= 6) {
-         /* For performance, after a discard, jump to the end of the shader.
-         * Only jump if all relevant channels have been discarded.
-         */
-         fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
-         discard_jump->flag_subreg = 1;
-
-         discard_jump->predicate = (dispatch_width == 8)
-                                 ? BRW_PREDICATE_ALIGN1_ANY8H
-                                 : BRW_PREDICATE_ALIGN1_ANY16H;
-         discard_jump->predicate_inverse = true;
+         emit_discard_jump();
       }
-
       break;
    }
 
@@ -1185,7 +1415,36 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    }
 
    case nir_intrinsic_load_front_face:
-      assert(!"TODO");
+      emit(MOV(retype(dest, BRW_REGISTER_TYPE_D),
+               *emit_frontfacing_interpolation()));
+      break;
+
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id()");
+
+   case nir_intrinsic_load_vertex_id_zero_base: {
+      fs_reg vertex_id = nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
+      assert(vertex_id.file != BAD_FILE);
+      dest.type = vertex_id.type;
+      emit(MOV(dest, vertex_id));
+      break;
+   }
+
+   case nir_intrinsic_load_base_vertex: {
+      fs_reg base_vertex = nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
+      assert(base_vertex.file != BAD_FILE);
+      dest.type = base_vertex.type;
+      emit(MOV(dest, base_vertex));
+      break;
+   }
+
+   case nir_intrinsic_load_instance_id: {
+      fs_reg instance_id = nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
+      assert(instance_id.file != BAD_FILE);
+      dest.type = instance_id.type;
+      emit(MOV(dest, instance_id));
+      break;
+   }
 
    case nir_intrinsic_load_sample_mask_in: {
       fs_reg sample_mask_in = nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
@@ -1215,11 +1474,19 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_load_uniform_indirect:
       has_indirect = true;
    case nir_intrinsic_load_uniform: {
-      unsigned index = 0;
+      unsigned index = instr->const_index[0];
+
+      fs_reg uniform_reg;
+      if (index < num_direct_uniforms) {
+         uniform_reg = fs_reg(UNIFORM, 0);
+      } else {
+         uniform_reg = fs_reg(UNIFORM, num_direct_uniforms);
+         index -= num_direct_uniforms;
+      }
+
       for (int i = 0; i < instr->const_index[1]; i++) {
          for (unsigned j = 0; j < instr->num_components; j++) {
-            fs_reg src = offset(retype(nir_uniforms, dest.type),
-                                instr->const_index[0] + index);
+            fs_reg src = offset(retype(uniform_reg, dest.type), index);
             if (has_indirect)
                src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
             index++;
@@ -1233,6 +1500,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
    case nir_intrinsic_load_ubo_indirect:
       has_indirect = true;
+      /* fallthrough */
    case nir_intrinsic_load_ubo: {
       nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
       fs_reg surf_index;
@@ -1294,6 +1562,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
    case nir_intrinsic_load_input_indirect:
       has_indirect = true;
+      /* fallthrough */
    case nir_intrinsic_load_input: {
       unsigned index = 0;
       for (int i = 0; i < instr->const_index[1]; i++) {
@@ -1456,7 +1725,6 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 void
 fs_visitor::nir_emit_texture(nir_tex_instr *instr)
 {
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
    unsigned sampler = instr->sampler_index;
    fs_reg sampler_reg(sampler);
 
@@ -1473,7 +1741,7 @@ fs_visitor::nir_emit_texture(nir_tex_instr *instr)
    bool is_cube_array = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
                         instr->is_array;
 
-   int lod_components, offset_components = 0;
+   int lod_components = 0, offset_components = 0;
 
    fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, offset;
 
@@ -1553,10 +1821,12 @@ fs_visitor::nir_emit_texture(nir_tex_instr *instr)
    }
 
    if (instr->op == nir_texop_txf_ms) {
-      if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
+      if (brw->gen >= 7 &&
+          key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
          mcs = emit_mcs_fetch(coordinate, instr->coord_components, sampler_reg);
-      else
+      } else {
          mcs = fs_reg(0u);
+      }
    }
 
    for (unsigned i = 0; i < 3; i++) {
@@ -1604,7 +1874,7 @@ fs_visitor::nir_emit_texture(nir_tex_instr *instr)
 
    emit_texture(op, dest_type, coordinate, instr->coord_components,
                 shadow_comparitor, lod, lod2, lod_components, sample_index,
-                offset, offset_components, mcs, gather_component,
+                offset, mcs, gather_component,
                 is_cube_array, is_rect, sampler, sampler_reg, texunit);
 
    fs_reg dest = get_nir_dest(instr->dest);