intel/compiler: Heap-allocate temporary storage
[mesa.git] / src / intel / compiler / brw_vec4_nir.cpp
index 64371a16de57c3b7b59669e56ff61a8df735c092..f8d4dfb46829a31b6cc8add82ab06b58d367f832 100644 (file)
@@ -37,89 +37,7 @@ vec4_visitor::emit_nir_code()
    if (nir->num_uniforms > 0)
       nir_setup_uniforms();
 
-   nir_setup_system_values();
-
-   /* get the main function and emit it */
-   nir_foreach_function(function, nir) {
-      assert(strcmp(function->name, "main") == 0);
-      assert(function->impl);
-      nir_emit_impl(function->impl);
-   }
-}
-
-void
-vec4_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
-{
-   dst_reg *reg;
-
-   switch (instr->intrinsic) {
-   case nir_intrinsic_load_vertex_id:
-      unreachable("should be lowered by lower_vertex_id().");
-
-   case nir_intrinsic_load_vertex_id_zero_base:
-      reg = &nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
-      if (reg->file == BAD_FILE)
-         *reg = *make_reg_for_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
-      break;
-
-   case nir_intrinsic_load_base_vertex:
-      reg = &nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
-      if (reg->file == BAD_FILE)
-         *reg = *make_reg_for_system_value(SYSTEM_VALUE_BASE_VERTEX);
-      break;
-
-   case nir_intrinsic_load_instance_id:
-      reg = &nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
-      if (reg->file == BAD_FILE)
-         *reg = *make_reg_for_system_value(SYSTEM_VALUE_INSTANCE_ID);
-      break;
-
-   case nir_intrinsic_load_base_instance:
-      reg = &nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
-      if (reg->file == BAD_FILE)
-         *reg = *make_reg_for_system_value(SYSTEM_VALUE_BASE_INSTANCE);
-      break;
-
-   case nir_intrinsic_load_draw_id:
-      reg = &nir_system_values[SYSTEM_VALUE_DRAW_ID];
-      if (reg->file == BAD_FILE)
-         *reg = *make_reg_for_system_value(SYSTEM_VALUE_DRAW_ID);
-      break;
-
-   default:
-      break;
-   }
-}
-
-static bool
-setup_system_values_block(nir_block *block, vec4_visitor *v)
-{
-   nir_foreach_instr(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-      v->nir_setup_system_value_intrinsic(intrin);
-   }
-
-   return true;
-}
-
-void
-vec4_visitor::nir_setup_system_values()
-{
-   nir_system_values = ralloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX);
-   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
-      nir_system_values[i] = dst_reg();
-   }
-
-   nir_foreach_function(function, nir) {
-      assert(strcmp(function->name, "main") == 0);
-      assert(function->impl);
-      nir_foreach_block(block, function->impl) {
-         setup_system_values_block(block, this);
-      }
-   }
+   nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
 }
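For reference, nir_shader_get_entrypoint() stands in for the removed per-function loop. Assuming the single-"main"-function invariant the deleted asserts encoded, the lookup reduces to roughly this sketch (helper name hypothetical, not the real NIR API):

static nir_function_impl *
get_single_entrypoint_impl(nir_shader *shader)
{
   nir_foreach_function(function, shader) {
      assert(strcmp(function->name, "main") == 0);
      assert(function->impl);
      return function->impl;
   }
   unreachable("shader has no functions");
}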
 
 void
@@ -246,8 +164,7 @@ vec4_visitor::nir_emit_instr(nir_instr *instr)
       break;
 
    default:
-      fprintf(stderr, "VS instruction not yet implemented by NIR->vec4\n");
-      break;
+      unreachable("VS instruction not yet implemented by NIR->vec4");
    }
 }
 
@@ -335,24 +252,79 @@ vec4_visitor::get_nir_src(const nir_src &src, unsigned num_components)
    return get_nir_src(src, nir_type_int32, num_components);
 }
 
+src_reg
+vec4_visitor::get_nir_src_imm(const nir_src &src)
+{
+   assert(nir_src_num_components(src) == 1);
+   assert(nir_src_bit_size(src) == 32);
+   return nir_src_is_const(src) ? src_reg(brw_imm_d(nir_src_as_int(src))) :
+                                  get_nir_src(src, 1);
+}
+
 src_reg
 vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
 {
    nir_src *offset_src = nir_get_io_offset_src(instr);
-   nir_const_value *const_value = nir_src_as_const_value(*offset_src);
 
-   if (const_value) {
+   if (nir_src_is_const(*offset_src)) {
       /* The only constant offset we should find is 0.  brw_nir.c's
        * add_const_offset_to_base() will fold other constant offsets
        * into instr->const_index[0].
        */
-      assert(const_value->u32[0] == 0);
+      assert(nir_src_as_uint(*offset_src) == 0);
       return src_reg();
    }
 
    return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1);
 }
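The invariant asserted above comes from brw_nir.c's add_const_offset_to_base(): constant offsets are folded into const_index[0] ahead of time, so only zero survives here. A hedged model of that folding, with hypothetical names:

static void
fold_const_offset(unsigned *base, unsigned *offset, bool offset_is_const)
{
   if (offset_is_const) {
      *base += *offset;   /* what add_const_offset_to_base() does */
      *offset = 0;        /* the only constant get_indirect_offset() sees */
   }
}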
 
+static src_reg
+setup_imm_df(const vec4_builder &bld, double v)
+{
+   const gen_device_info *devinfo = bld.shader->devinfo;
+   assert(devinfo->gen >= 7);
+
+   if (devinfo->gen >= 8)
+      return brw_imm_df(v);
+
+   /* gen7.5 does not support DF immediates directly, but the DIM
+    * instruction allows setting the 64-bit immediate value.
+    */
+   if (devinfo->is_haswell) {
+      const vec4_builder ubld = bld.exec_all();
+      const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_DF);
+      ubld.DIM(dst, brw_imm_df(v));
+      return swizzle(src_reg(dst), BRW_SWIZZLE_XXXX);
+   }
+
+   /* gen7 does not support DF immediates */
+   union {
+      double d;
+      struct {
+         uint32_t i1;
+         uint32_t i2;
+      };
+   } di;
+
+   di.d = v;
+
+   /* Write the low 32 bits of the constant to the X:UD channel and the
+    * high 32 bits to the Y:UD channel to build the constant in a VGRF.
+    * We have to do this twice (offset 0 and offset 1), since a DF VGRF takes
+    * two SIMD8 registers in SIMD4x2 execution. Finally, return a swizzle
+    * XXXX so any access to the VGRF only reads the constant data in these
+    * channels.
+    */
+   const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+   for (unsigned n = 0; n < 2; n++) {
+      const vec4_builder ubld = bld.exec_all().group(4, n);
+      ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_X), brw_imm_ud(di.i1));
+      ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_Y), brw_imm_ud(di.i2));
+   }
+
+   return swizzle(src_reg(retype(tmp, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX);
+}
+
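As a standalone illustration of the di union above (assuming a little-endian host, the case these GPUs are driven from), the two 32-bit halves written to the X/Y channels fall out like this:

#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
   double v = 1.5;   /* bit pattern 0x3ff8000000000000 */
   uint32_t i1, i2;
   std::memcpy(&i1, &v, 4);                               /* low dword  -> X:UD */
   std::memcpy(&i2, reinterpret_cast<char *>(&v) + 4, 4); /* high dword -> Y:UD */
   assert(i1 == 0x00000000u && i2 == 0x3ff80000u);
   return 0;
}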
 void
 vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
 {
@@ -366,6 +338,7 @@ vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
       reg.type = BRW_REGISTER_TYPE_D;
    }
 
+   const vec4_builder ibld = vec4_builder(this).at_end();
    unsigned remaining = brw_writemask_for_size(instr->def.num_components);
 
    /* @FIXME: consider emitting vector operations to save some MOVs in
@@ -389,7 +362,7 @@ vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
 
       reg.writemask = writemask;
       if (instr->def.bit_size == 64) {
-         emit(MOV(reg, setup_imm_df(instr->value.f64[i])));
+         emit(MOV(reg, setup_imm_df(ibld, instr->value.f64[i])));
       } else {
          emit(MOV(reg, brw_imm_d(instr->value.i32[i])));
       }
@@ -403,6 +376,32 @@ vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
    nir_ssa_values[instr->def.index] = reg;
 }
 
+src_reg
+vec4_visitor::get_nir_ssbo_intrinsic_index(nir_intrinsic_instr *instr)
+{
+   /* SSBO stores are weird in that their index is in src[1] */
+   const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0;
+
+   src_reg surf_index;
+   if (nir_src_is_const(instr->src[src])) {
+      unsigned index = prog_data->base.binding_table.ssbo_start +
+                       nir_src_as_uint(instr->src[src]);
+      surf_index = brw_imm_ud(index);
+      brw_mark_surface_used(&prog_data->base, index);
+   } else {
+      surf_index = src_reg(this, glsl_type::uint_type);
+      emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[src], 1),
+               brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
+      surf_index = emit_uniformize(surf_index);
+
+      brw_mark_surface_used(&prog_data->base,
+                            prog_data->base.binding_table.ssbo_start +
+                            nir->info.num_ssbos - 1);
+   }
+
+   return surf_index;
+}
+
 void
 vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 {
@@ -412,15 +411,13 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    switch (instr->intrinsic) {
 
    case nir_intrinsic_load_input: {
-      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
-
       /* We set EmitNoIndirectInput for VS */
-      assert(const_offset);
+      unsigned load_offset = nir_src_as_uint(instr->src[0]);
 
       dest = get_nir_dest(instr->dest);
       dest.writemask = brw_writemask_for_size(instr->num_components);
 
-      src = src_reg(ATTR, instr->const_index[0] + const_offset->u32[0],
+      src = src_reg(ATTR, instr->const_index[0] + load_offset,
                     glsl_type::uvec4_type);
       src = retype(src, dest.type);
 
@@ -439,10 +436,8 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    }
 
    case nir_intrinsic_store_output: {
-      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
-      assert(const_offset);
-
-      int varying = instr->const_index[0] + const_offset->u32[0];
+      unsigned store_offset = nir_src_as_uint(instr->src[1]);
+      int varying = instr->const_index[0] + store_offset;
 
       bool is_64bit = nir_src_bit_size(instr->src[0]) == 64;
       if (is_64bit) {
@@ -477,14 +472,14 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    }
 
    case nir_intrinsic_get_buffer_size: {
-      nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
-      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
+      unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
+                            nir_src_as_uint(instr->src[0]) : 0;
 
       const unsigned index =
          prog_data->base.binding_table.ssbo_start + ssbo_index;
       dst_reg result_dst = get_nir_dest(instr->dest);
       vec4_instruction *inst = new(mem_ctx)
-         vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst);
+         vec4_instruction(SHADER_OPCODE_GET_BUFFER_SIZE, result_dst);
 
       inst->base_mrf = 2;
       inst->mlen = 1; /* always at least one */
@@ -505,41 +500,18 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_store_ssbo: {
       assert(devinfo->gen >= 7);
 
-      /* Block index */
-      src_reg surf_index;
-      nir_const_value *const_uniform_block =
-         nir_src_as_const_value(instr->src[1]);
-      if (const_uniform_block) {
-         unsigned index = prog_data->base.binding_table.ssbo_start +
-                          const_uniform_block->u32[0];
-         surf_index = brw_imm_ud(index);
-         brw_mark_surface_used(&prog_data->base, index);
-      } else {
-         surf_index = src_reg(this, glsl_type::uint_type);
-         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[1], 1),
-                  brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
-         surf_index = emit_uniformize(surf_index);
-
-         brw_mark_surface_used(&prog_data->base,
-                               prog_data->base.binding_table.ssbo_start +
-                               nir->info->num_ssbos - 1);
-      }
+      /* brw_nir_lower_mem_access_bit_sizes takes care of this */
+      assert(nir_src_bit_size(instr->src[0]) == 32);
+      assert(nir_intrinsic_write_mask(instr) ==
+             (1u << instr->num_components) - 1);
 
-      /* Offset */
-      src_reg offset_reg;
-      nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
-      if (const_offset) {
-         offset_reg = brw_imm_ud(const_offset->u32[0]);
-      } else {
-         offset_reg = get_nir_src(instr->src[2], 1);
-      }
+      src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
+      src_reg offset_reg = retype(get_nir_src_imm(instr->src[2]),
+                                  BRW_REGISTER_TYPE_UD);
 
       /* Value */
       src_reg val_reg = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, 4);
 
-      /* Writemask */
-      unsigned write_mask = instr->const_index[0];
-
       /* IvyBridge does not have a native SIMD4x2 untyped write message so untyped
        * writes will use SIMD8 mode. In order to hide this and keep symmetry across
        * typed and untyped messages and across hardware platforms, the
@@ -581,167 +553,50 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       const vec4_builder bld = vec4_builder(this).at_end()
                                .annotate(current_annotation, base_ir);
 
-      unsigned type_slots = nir_src_bit_size(instr->src[0]) / 32;
-      if (type_slots == 2) {
-         dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
-         shuffle_64bit_data(tmp, retype(val_reg, tmp.type), true);
-         val_reg = src_reg(retype(tmp, BRW_REGISTER_TYPE_F));
-      }
-
-      uint8_t swizzle[4] = { 0, 0, 0, 0};
-      int num_channels = 0;
-      unsigned skipped_channels = 0;
-      int num_components = instr->num_components;
-      for (int i = 0; i < num_components; i++) {
-         /* Read components Z/W of a dvec from the appropriate place. We will
-          * also have to adjust the swizzle (we do that with the '% 4' below)
-          */
-         if (i == 2 && type_slots == 2)
-            val_reg = byte_offset(val_reg, REG_SIZE);
-
-         /* Check if this channel needs to be written. If so, record the
-          * channel we need to take the data from in the swizzle array
-          */
-         int component_mask = 1 << i;
-         int write_test = write_mask & component_mask;
-         if (write_test) {
-            /* If we are writing doubles we have to write 2 channels worth of
-             * of data (64 bits) for each double component.
-             */
-            swizzle[num_channels++] = (i * type_slots) % 4;
-            if (type_slots == 2)
-               swizzle[num_channels++] = (i * type_slots + 1) % 4;
-         }
-
-         /* If we don't have to write this channel it means we have a gap in the
-          * vector, so write the channels we accumulated until now, if any. Do
-          * the same if this was the last component in the vector, if we have
-          * enough channels for a full vec4 write or if we have processed
-          * components XY of a dvec (since components ZW are not in the same
-          * SIMD register)
-          */
-         if (!write_test || i == num_components - 1 || num_channels == 4 ||
-             (i == 1 && type_slots == 2)) {
-            if (num_channels > 0) {
-               /* We have channels to write, so update the offset we need to
-                * write at to skip the channels we skipped, if any.
-                */
-               if (skipped_channels > 0) {
-                  if (offset_reg.file == IMM) {
-                     offset_reg.ud += 4 * skipped_channels;
-                  } else {
-                     emit(ADD(dst_reg(offset_reg), offset_reg,
-                              brw_imm_ud(4 * skipped_channels)));
-                  }
-               }
-
-               /* Swizzle the data register so we take the data from the channels
-                * we need to write and send the write message. This will write
-                * num_channels consecutive dwords starting at offset.
-                */
-               val_reg.swizzle =
-                  BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
-               emit_untyped_write(bld, surf_index, offset_reg, val_reg,
-                                  1 /* dims */, num_channels /* size */,
-                                  BRW_PREDICATE_NONE);
-
-               /* If we have to do a second write we will have to update the
-                * offset so that we jump over the channels we have just written
-                * now.
-                */
-               skipped_channels = num_channels;
-
-               /* Restart the count for the next write message */
-               num_channels = 0;
-            }
-
-            /* If we didn't write the channel, increase skipped count */
-            if (!write_test)
-               skipped_channels += type_slots;
-         }
-      }
-
+      emit_untyped_write(bld, surf_index, offset_reg, val_reg,
+                         1 /* dims */, instr->num_components /* size */,
+                         BRW_PREDICATE_NONE);
       break;
    }
 
    case nir_intrinsic_load_ssbo: {
       assert(devinfo->gen >= 7);
 
-      nir_const_value *const_uniform_block =
-         nir_src_as_const_value(instr->src[0]);
-
-      src_reg surf_index;
-      if (const_uniform_block) {
-         unsigned index = prog_data->base.binding_table.ssbo_start +
-                          const_uniform_block->u32[0];
-         surf_index = brw_imm_ud(index);
-
-         brw_mark_surface_used(&prog_data->base, index);
-      } else {
-         surf_index = src_reg(this, glsl_type::uint_type);
-         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], 1),
-                  brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
-         surf_index = emit_uniformize(surf_index);
-
-         /* Assume this may touch any UBO. It would be nice to provide
-          * a tighter bound, but the array information is already lowered away.
-          */
-         brw_mark_surface_used(&prog_data->base,
-                               prog_data->base.binding_table.ssbo_start +
-                               nir->info->num_ssbos - 1);
-      }
+      /* brw_nir_lower_mem_access_bit_sizes takes care of this */
+      assert(nir_dest_bit_size(instr->dest) == 32);
 
-      src_reg offset_reg;
-      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
-      if (const_offset) {
-         offset_reg = brw_imm_ud(const_offset->u32[0]);
-      } else {
-         offset_reg = get_nir_src(instr->src[1], 1);
-      }
+      src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
+      src_reg offset_reg = retype(get_nir_src_imm(instr->src[1]),
+                                  BRW_REGISTER_TYPE_UD);
 
       /* Read the vector */
       const vec4_builder bld = vec4_builder(this).at_end()
          .annotate(current_annotation, base_ir);
 
-      src_reg read_result;
+      src_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
+                                              1 /* dims */, 4 /* size */,
+                                              BRW_PREDICATE_NONE);
       dst_reg dest = get_nir_dest(instr->dest);
-      if (type_sz(dest.type) < 8) {
-         read_result = emit_untyped_read(bld, surf_index, offset_reg,
-                                         1 /* dims */, 4 /* size*/,
-                                         BRW_PREDICATE_NONE);
-      } else {
-         src_reg shuffled = src_reg(this, glsl_type::dvec4_type);
-
-         src_reg temp;
-         temp = emit_untyped_read(bld, surf_index, offset_reg,
-                                  1 /* dims */, 4 /* size*/,
-                                  BRW_PREDICATE_NONE);
-         emit(MOV(dst_reg(retype(shuffled, temp.type)), temp));
-
-         if (offset_reg.file == IMM)
-            offset_reg.ud += 16;
-         else
-            emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16)));
-
-         temp = emit_untyped_read(bld, surf_index, offset_reg,
-                                  1 /* dims */, 4 /* size*/,
-                                  BRW_PREDICATE_NONE);
-         emit(MOV(dst_reg(retype(byte_offset(shuffled, REG_SIZE), temp.type)),
-                  temp));
-
-         read_result = src_reg(this, glsl_type::dvec4_type);
-         shuffle_64bit_data(dst_reg(read_result), shuffled, false);
-      }
-
       read_result.type = dest.type;
       read_result.swizzle = brw_swizzle_for_size(instr->num_components);
       emit(MOV(dest, read_result));
       break;
    }
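The read above always fetches a full vec4 and then narrows it with brw_swizzle_for_size(), which yields a swizzle whose upper channels stay within the first num_components (the exact repeat pattern is the helper's choice). A hedged stand-in that repeats the last valid channel, assuming the BRW_SWIZZLE4 encoding a | b<<2 | c<<4 | d<<6:

#include <cassert>

static unsigned swizzle_for_size(unsigned size)
{
   unsigned swz = 0;
   for (unsigned i = 0; i < 4; i++) {
      const unsigned chan = (i < size) ? i : size - 1; /* repeat last channel */
      swz |= chan << (2 * i);
   }
   return swz;
}

int main()
{
   assert(swizzle_for_size(2) == 0x54); /* XYYY: 0 | 1<<2 | 1<<4 | 1<<6 */
   return 0;
}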
 
-   case nir_intrinsic_ssbo_atomic_add:
-      nir_emit_ssbo_atomic(BRW_AOP_ADD, instr);
+   case nir_intrinsic_ssbo_atomic_add: {
+      int op = BRW_AOP_ADD;
+
+      if (nir_src_is_const(instr->src[2])) {
+         int add_val = nir_src_as_int(instr->src[2]);
+         if (add_val == 1)
+            op = BRW_AOP_INC;
+         else if (add_val == -1)
+            op = BRW_AOP_DEC;
+      }
+
+      nir_emit_ssbo_atomic(op, instr);
       break;
+   }
    case nir_intrinsic_ssbo_atomic_imin:
       nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr);
       break;
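BRW_AOP_INC and BRW_AOP_DEC need no data payload, which is why nir_emit_ssbo_atomic() further down only fetches src[2] for the other ops. The constant-fold above reduces to this standalone sketch (enum values illustrative):

enum aop { AOP_ADD, AOP_INC, AOP_DEC };

static aop
pick_atomic_add_op(bool data_is_const, int add_val)
{
   if (data_is_const && add_val == 1)
      return AOP_INC;  /* no data operand needed */
   if (data_is_const && add_val == -1)
      return AOP_DEC;  /* no data operand needed */
   return AOP_ADD;
}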
@@ -778,14 +633,8 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_load_instance_id:
    case nir_intrinsic_load_base_instance:
    case nir_intrinsic_load_draw_id:
-   case nir_intrinsic_load_invocation_id: {
-      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
-      src_reg val = src_reg(nir_system_values[sv]);
-      assert(val.file != BAD_FILE);
-      dest = get_nir_dest(instr->dest, val.type);
-      emit(MOV(dest, val));
-      break;
-   }
+   case nir_intrinsic_load_invocation_id:
+      unreachable("should be lowered by brw_nir_lower_vs_inputs()");
 
    case nir_intrinsic_load_uniform: {
       /* Offsets are in bytes but they should always be multiples of 4 */
@@ -804,22 +653,29 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
        * The swizzle also works in the indirect case as the generator adds
        * the swizzle to the offset for us.
        */
-      unsigned shift = (nir_intrinsic_base(instr) % 16) / 4;
+      const int type_size = type_sz(src.type);
+      unsigned shift = (nir_intrinsic_base(instr) % 16) / type_size;
       assert(shift + instr->num_components <= 4);
 
-      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
-      if (const_offset) {
+      if (nir_src_is_const(instr->src[0])) {
+         const unsigned load_offset = nir_src_as_uint(instr->src[0]);
          /* Offsets are in bytes but they should always be multiples of 4 */
-         assert(const_offset->u32[0] % 4 == 0);
+         assert(load_offset % 4 == 0);
 
-         unsigned offset = const_offset->u32[0] + shift * 4;
+         src.swizzle = brw_swizzle_for_size(instr->num_components);
+         dest.writemask = brw_writemask_for_size(instr->num_components);
+         unsigned offset = load_offset + shift * type_size;
          src.offset = ROUND_DOWN_TO(offset, 16);
-         shift = (offset % 16) / 4;
+         shift = (offset % 16) / type_size;
+         assert(shift + instr->num_components <= 4);
          src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
 
          emit(MOV(dest, src));
       } else {
-         src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
+         /* Uniform arrays are vec4 aligned because of std140 alignment
+          * rules.
+          */
+         assert(shift == 0);
 
          src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
 
@@ -832,56 +688,17 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       break;
    }
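A worked example of the offset arithmetic above, with hypothetical numbers: a float uniform at base byte 24 with a constant load offset of 8 lands on channel X of the second 16-byte slot:

#include <cassert>

int main()
{
   const unsigned base = 24, load_offset = 8, type_size = 4;
   unsigned shift = (base % 16) / type_size;          /* byte 8 -> channel Z */
   assert(shift == 2);
   unsigned offset = load_offset + shift * type_size; /* 16 */
   unsigned reg_offset = offset & ~15u;               /* ROUND_DOWN_TO(offset, 16) */
   shift = (offset % 16) / type_size;                 /* channel X of that slot */
   assert(reg_offset == 16 && shift == 0);
   return 0;
}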
 
-   case nir_intrinsic_atomic_counter_read:
-   case nir_intrinsic_atomic_counter_inc:
-   case nir_intrinsic_atomic_counter_dec: {
-      unsigned surf_index = prog_data->base.binding_table.abo_start +
-         (unsigned) instr->const_index[0];
-      const vec4_builder bld =
-         vec4_builder(this).at_end().annotate(current_annotation, base_ir);
-
-      /* Get some metadata from the image intrinsic. */
-      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
-
-      /* Get the arguments of the atomic intrinsic. */
-      src_reg offset = get_nir_src(instr->src[0], nir_type_int32,
-                                   instr->num_components);
-      const src_reg surface = brw_imm_ud(surf_index);
-      const src_reg src0 = (info->num_srcs >= 2
-                           ? get_nir_src(instr->src[1]) : src_reg());
-      const src_reg src1 = (info->num_srcs >= 3
-                           ? get_nir_src(instr->src[2]) : src_reg());
-
-      src_reg tmp;
-
-      dest = get_nir_dest(instr->dest);
-
-      if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
-         tmp = emit_untyped_read(bld, surface, offset, 1, 1);
-      } else {
-         tmp = emit_untyped_atomic(bld, surface, offset,
-                                   src0, src1,
-                                   1, 1,
-                                   get_atomic_counter_op(instr->intrinsic));
-      }
-
-      bld.MOV(retype(dest, tmp.type), tmp);
-      brw_mark_surface_used(stage_prog_data, surf_index);
-      break;
-   }
-
    case nir_intrinsic_load_ubo: {
-      nir_const_value *const_block_index = nir_src_as_const_value(instr->src[0]);
       src_reg surf_index;
 
       dest = get_nir_dest(instr->dest);
 
-      if (const_block_index) {
+      if (nir_src_is_const(instr->src[0])) {
          /* The block index is a constant, so just emit the binding table entry
           * as an immediate.
           */
          const unsigned index = prog_data->base.binding_table.ubo_start +
-                                const_block_index->u32[0];
+                                nir_src_as_uint(instr->src[0]);
          surf_index = brw_imm_ud(index);
          brw_mark_surface_used(&prog_data->base, index);
       } else {
@@ -900,15 +717,17 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
           */
          brw_mark_surface_used(&prog_data->base,
                                prog_data->base.binding_table.ubo_start +
-                               nir->info->num_ubos - 1);
+                               nir->info.num_ubos - 1);
       }
 
       src_reg offset_reg;
-      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
-      if (const_offset) {
-         offset_reg = brw_imm_ud(const_offset->u32[0] & ~15);
+      if (nir_src_is_const(instr->src[1])) {
+         unsigned load_offset = nir_src_as_uint(instr->src[1]);
+         offset_reg = brw_imm_ud(load_offset & ~15);
       } else {
-         offset_reg = get_nir_src(instr->src[1], nir_type_uint32, 1);
+         offset_reg = src_reg(this, glsl_type::uint_type);
+         emit(MOV(dst_reg(offset_reg),
+                  get_nir_src(instr->src[1], nir_type_uint32, 1)));
       }
 
       src_reg packed_consts;
@@ -936,13 +755,14 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       }
 
       packed_consts.swizzle = brw_swizzle_for_size(instr->num_components);
-      if (const_offset) {
+      if (nir_src_is_const(instr->src[1])) {
+         unsigned load_offset = nir_src_as_uint(instr->src[1]);
          unsigned type_size = type_sz(dest.type);
          packed_consts.swizzle +=
-            BRW_SWIZZLE4(const_offset->u32[0] % 16 / type_size,
-                         const_offset->u32[0] % 16 / type_size,
-                         const_offset->u32[0] % 16 / type_size,
-                         const_offset->u32[0] % 16 / type_size);
+            BRW_SWIZZLE4(load_offset % 16 / type_size,
+                         load_offset % 16 / type_size,
+                         load_offset % 16 / type_size,
+                         load_offset % 16 / type_size);
       }
 
       emit(MOV(dest, retype(packed_consts, dest.type)));
@@ -981,28 +801,11 @@ vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
       dest = get_nir_dest(instr->dest);
 
-   src_reg surface;
-   nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
-   if (const_surface) {
-      unsigned surf_index = prog_data->base.binding_table.ssbo_start +
-                            const_surface->u32[0];
-      surface = brw_imm_ud(surf_index);
-      brw_mark_surface_used(&prog_data->base, surf_index);
-   } else {
-      surface = src_reg(this, glsl_type::uint_type);
-      emit(ADD(dst_reg(surface), get_nir_src(instr->src[0]),
-               brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
-
-      /* Assume this may touch any UBO. This is the same we do for other
-       * UBO/SSBO accesses with non-constant surface.
-       */
-      brw_mark_surface_used(&prog_data->base,
-                            prog_data->base.binding_table.ssbo_start +
-                            nir->info->num_ssbos - 1);
-   }
-
+   src_reg surface = get_nir_ssbo_intrinsic_index(instr);
    src_reg offset = get_nir_src(instr->src[1], 1);
-   src_reg data1 = get_nir_src(instr->src[2], 1);
+   src_reg data1;
+   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
+      data1 = get_nir_src(instr->src[2], 1);
    src_reg data2;
    if (op == BRW_AOP_CMPWR)
       data2 = get_nir_src(instr->src[3], 1);
@@ -1030,34 +833,34 @@ static enum brw_conditional_mod
 brw_conditional_for_nir_comparison(nir_op op)
 {
    switch (op) {
-   case nir_op_flt:
-   case nir_op_ilt:
-   case nir_op_ult:
+   case nir_op_flt32:
+   case nir_op_ilt32:
+   case nir_op_ult32:
       return BRW_CONDITIONAL_L;
 
-   case nir_op_fge:
-   case nir_op_ige:
-   case nir_op_uge:
+   case nir_op_fge32:
+   case nir_op_ige32:
+   case nir_op_uge32:
       return BRW_CONDITIONAL_GE;
 
-   case nir_op_feq:
-   case nir_op_ieq:
-   case nir_op_ball_fequal2:
-   case nir_op_ball_iequal2:
-   case nir_op_ball_fequal3:
-   case nir_op_ball_iequal3:
-   case nir_op_ball_fequal4:
-   case nir_op_ball_iequal4:
+   case nir_op_feq32:
+   case nir_op_ieq32:
+   case nir_op_b32all_fequal2:
+   case nir_op_b32all_iequal2:
+   case nir_op_b32all_fequal3:
+   case nir_op_b32all_iequal3:
+   case nir_op_b32all_fequal4:
+   case nir_op_b32all_iequal4:
       return BRW_CONDITIONAL_Z;
 
-   case nir_op_fne:
-   case nir_op_ine:
-   case nir_op_bany_fnequal2:
-   case nir_op_bany_inequal2:
-   case nir_op_bany_fnequal3:
-   case nir_op_bany_inequal3:
-   case nir_op_bany_fnequal4:
-   case nir_op_bany_inequal4:
+   case nir_op_fne32:
+   case nir_op_ine32:
+   case nir_op_b32any_fnequal2:
+   case nir_op_b32any_inequal2:
+   case nir_op_b32any_fnequal3:
+   case nir_op_b32any_inequal3:
+   case nir_op_b32any_fnequal4:
+   case nir_op_b32any_inequal4:
       return BRW_CONDITIONAL_NZ;
 
    default:
@@ -1077,20 +880,20 @@ vec4_visitor::optimize_predicate(nir_alu_instr *instr,
       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
 
    switch (cmp_instr->op) {
-   case nir_op_bany_fnequal2:
-   case nir_op_bany_inequal2:
-   case nir_op_bany_fnequal3:
-   case nir_op_bany_inequal3:
-   case nir_op_bany_fnequal4:
-   case nir_op_bany_inequal4:
+   case nir_op_b32any_fnequal2:
+   case nir_op_b32any_inequal2:
+   case nir_op_b32any_fnequal3:
+   case nir_op_b32any_inequal3:
+   case nir_op_b32any_fnequal4:
+   case nir_op_b32any_inequal4:
       *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
       break;
-   case nir_op_ball_fequal2:
-   case nir_op_ball_iequal2:
-   case nir_op_ball_fequal3:
-   case nir_op_ball_iequal3:
-   case nir_op_ball_fequal4:
-   case nir_op_ball_iequal4:
+   case nir_op_b32all_fequal2:
+   case nir_op_b32all_iequal2:
+   case nir_op_b32all_fequal3:
+   case nir_op_b32all_iequal3:
+   case nir_op_b32all_fequal4:
+   case nir_op_b32all_iequal4:
       *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
       break;
    default:
@@ -1183,16 +986,28 @@ vec4_visitor::emit_conversion_from_double(dst_reg dst, src_reg src,
       return;
    }
 
+   enum opcode op;
+   switch (dst.type) {
+   case BRW_REGISTER_TYPE_D:
+      op = VEC4_OPCODE_DOUBLE_TO_D32;
+      break;
+   case BRW_REGISTER_TYPE_UD:
+      op = VEC4_OPCODE_DOUBLE_TO_U32;
+      break;
+   case BRW_REGISTER_TYPE_F:
+      op = VEC4_OPCODE_DOUBLE_TO_F32;
+      break;
+   default:
+      unreachable("Unknown conversion");
+   }
+
    dst_reg temp = dst_reg(this, glsl_type::dvec4_type);
    emit(MOV(temp, src));
-
    dst_reg temp2 = dst_reg(this, glsl_type::dvec4_type);
-   temp2 = retype(temp2, dst.type);
-   emit(VEC4_OPCODE_FROM_DOUBLE, temp2, src_reg(temp))
-      ->size_written = 2 * REG_SIZE;
+   emit(op, temp2, src_reg(temp));
 
-   emit(VEC4_OPCODE_PICK_LOW_32BIT, temp2, src_reg(retype(temp2, BRW_REGISTER_TYPE_DF)));
-   vec4_instruction *inst = emit(MOV(dst, src_reg(temp2)));
+   emit(VEC4_OPCODE_PICK_LOW_32BIT, retype(temp2, dst.type), src_reg(temp2));
+   vec4_instruction *inst = emit(MOV(dst, src_reg(retype(temp2, dst.type))));
    inst->saturate = saturate;
 }
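The DOUBLE_TO_* opcodes leave each 32-bit result in a 64-bit-wide slot of the dvec4 temporary, and PICK_LOW_32BIT then gathers the low dwords, as the names suggest. A little-endian sketch of that layout (illustrative only, not the generator's actual register walk):

#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
   const int32_t results[4] = {-2, 7, 0, 42};
   uint64_t slots[4] = {};                    /* one 64-bit slot per channel */
   for (int i = 0; i < 4; i++)
      std::memcpy(&slots[i], &results[i], 4); /* value sits in the low dword */

   int32_t packed[4];                         /* the PICK_LOW_32BIT step */
   for (int i = 0; i < 4; i++)
      std::memcpy(&packed[i], &slots[i], 4);

   for (int i = 0; i < 4; i++)
      assert(packed[i] == results[i]);
   return 0;
}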
 
@@ -1208,53 +1023,6 @@ vec4_visitor::emit_conversion_to_double(dst_reg dst, src_reg src,
    inst->saturate = saturate;
 }
 
-src_reg
-vec4_visitor::setup_imm_df(double v)
-{
-   assert(devinfo->gen >= 7);
-
-   if (devinfo->gen >= 8)
-      return brw_imm_df(v);
-
-   /* gen7.5 does not support DF immediates straighforward but the DIM
-    * instruction allows to set the 64-bit immediate value.
-    */
-   if (devinfo->is_haswell) {
-      dst_reg dst = retype(dst_reg(VGRF, alloc.allocate(2)), BRW_REGISTER_TYPE_DF);
-      emit(DIM(dst, brw_imm_df(v)))->force_writemask_all = true;
-      return swizzle(src_reg(retype(dst, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX);
-   }
-
-   /* gen7 does not support DF immediates */
-   union {
-      double d;
-      struct {
-         uint32_t i1;
-         uint32_t i2;
-      };
-   } di;
-
-   di.d = v;
-
-   /* Write the low 32-bit of the constant to the X:UD channel and the
-    * high 32-bit to the Y:UD channel to build the constant in a VGRF.
-    * We have to do this twice (offset 0 and offset 1), since a DF VGRF takes
-    * two SIMD8 registers in SIMD4x2 execution. Finally, return a swizzle
-    * XXXX so any access to the VGRF only reads the constant data in these
-    * channels.
-    */
-   const dst_reg tmp =
-      retype(dst_reg(VGRF, alloc.allocate(2)), BRW_REGISTER_TYPE_UD);
-   for (int n = 0; n < 2; n++) {
-      emit(MOV(writemask(offset(tmp, 8, n), WRITEMASK_X), brw_imm_ud(di.i1)))
-         ->force_writemask_all = true;
-      emit(MOV(writemask(offset(tmp, 8, n), WRITEMASK_Y), brw_imm_ud(di.i2)))
-         ->force_writemask_all = true;
-   }
-
-   return swizzle(src_reg(retype(tmp, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX);
-}
-
 void
 vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
 {
@@ -1317,6 +1085,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_uadd_sat:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      inst = emit(ADD(dst, op[0], op[1]));
+      inst->saturate = true;
+      break;
+
    case nir_op_fmul:
       inst = emit(MUL(dst, op[0], op[1]));
       inst->saturate = instr->dest.saturate;
@@ -1325,21 +1099,22 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_imul: {
       assert(nir_dest_bit_size(instr->dest.dest) < 64);
       if (devinfo->gen < 8) {
-         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
-         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
-
          /* For integer multiplication, the MUL uses the low 16 bits of one of
           * the operands (src0 through SNB, src1 on IVB and later). The MACH
           * accumulates in the contribution of the upper 16 bits of that
           * operand. If we can determine that one of the args is in the low
           * 16 bits, though, we can just emit a single MUL.
           */
-         if (value0 && value0->u32[0] < (1 << 16)) {
+         if (nir_src_is_const(instr->src[0].src) &&
+             nir_alu_instr_src_read_mask(instr, 0) == 1 &&
+             nir_src_comp_as_uint(instr->src[0].src, 0) < (1 << 16)) {
             if (devinfo->gen < 7)
                emit(MUL(dst, op[0], op[1]));
             else
                emit(MUL(dst, op[1], op[0]));
-         } else if (value1 && value1->u32[0] < (1 << 16)) {
+         } else if (nir_src_is_const(instr->src[1].src) &&
+                    nir_alu_instr_src_read_mask(instr, 1) == 1 &&
+                    nir_src_comp_as_uint(instr->src[1].src, 0) < (1 << 16)) {
             if (devinfo->gen < 7)
                emit(MUL(dst, op[1], op[0]));
             else
@@ -1565,18 +1340,18 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_fddy_fine:
       unreachable("derivatives are not valid in vertex shaders");
 
-   case nir_op_ilt:
-   case nir_op_ult:
-   case nir_op_ige:
-   case nir_op_uge:
-   case nir_op_ieq:
-   case nir_op_ine:
+   case nir_op_ilt32:
+   case nir_op_ult32:
+   case nir_op_ige32:
+   case nir_op_uge32:
+   case nir_op_ieq32:
+   case nir_op_ine32:
       assert(nir_dest_bit_size(instr->dest.dest) < 64);
       /* Fallthrough */
-   case nir_op_flt:
-   case nir_op_fge:
-   case nir_op_feq:
-   case nir_op_fne: {
+   case nir_op_flt32:
+   case nir_op_fge32:
+   case nir_op_feq32:
+   case nir_op_fne32: {
       enum brw_conditional_mod conditional_mod =
          brw_conditional_for_nir_comparison(instr->op);
 
@@ -1597,14 +1372,14 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
    }
 
-   case nir_op_ball_iequal2:
-   case nir_op_ball_iequal3:
-   case nir_op_ball_iequal4:
+   case nir_op_b32all_iequal2:
+   case nir_op_b32all_iequal3:
+   case nir_op_b32all_iequal4:
       assert(nir_dest_bit_size(instr->dest.dest) < 64);
       /* Fallthrough */
-   case nir_op_ball_fequal2:
-   case nir_op_ball_fequal3:
-   case nir_op_ball_fequal4: {
+   case nir_op_b32all_fequal2:
+   case nir_op_b32all_fequal3:
+   case nir_op_b32all_fequal4: {
       unsigned swiz =
          brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
 
@@ -1616,14 +1391,14 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
    }
 
-   case nir_op_bany_inequal2:
-   case nir_op_bany_inequal3:
-   case nir_op_bany_inequal4:
+   case nir_op_b32any_inequal2:
+   case nir_op_b32any_inequal3:
+   case nir_op_b32any_inequal4:
       assert(nir_dest_bit_size(instr->dest.dest) < 64);
       /* Fallthrough */
-   case nir_op_bany_fnequal2:
-   case nir_op_bany_fnequal3:
-   case nir_op_bany_fnequal4: {
+   case nir_op_b32any_fnequal2:
+   case nir_op_b32any_fnequal3:
+   case nir_op_b32any_fnequal4: {
       unsigned swiz =
          brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
 
@@ -1671,12 +1446,18 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       emit(AND(dst, op[0], op[1]));
       break;
 
-   case nir_op_b2i:
-   case nir_op_b2f:
-      emit(MOV(dst, negate(op[0])));
+   case nir_op_b2i32:
+   case nir_op_b2f32:
+   case nir_op_b2f64:
+      if (nir_dest_bit_size(instr->dest.dest) > 32) {
+         assert(dst.type == BRW_REGISTER_TYPE_DF);
+         emit_conversion_to_double(dst, negate(op[0]), false);
+      } else {
+         emit(MOV(dst, negate(op[0])));
+      }
       break;
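The negated MOV works because NIR's 32-bit booleans are all-zeros or all-ones, and the integer negation of ~0 (i.e. -1) is exactly 1. A standalone check:

#include <cassert>
#include <cstdint>

int main()
{
   const int32_t nir_true = ~0, nir_false = 0;
   assert(-nir_true == 1);   /* b2i32(true)  == 1 */
   assert(-nir_false == 0);  /* b2i32(false) == 0 */
   return 0;
}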
 
-   case nir_op_f2b:
+   case nir_op_f2b32:
       if (nir_src_bit_size(instr->src[0].src) == 64) {
          /* We use a MOV with conditional_mod to check if the provided value is
           * 0.0. We want this to flush denormalized numbers to zero, so we set a
@@ -1697,7 +1478,7 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       }
       break;
 
-   case nir_op_i2b:
+   case nir_op_i2b32:
       emit(CMP(dst, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
       break;
 
@@ -1905,7 +1686,20 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       unreachable("not reached: should have been lowered");
 
    case nir_op_fsign:
-      if (type_sz(op[0].type) < 8) {
+      assert(!instr->dest.saturate);
+      if (op[0].abs) {
+         /* Straightforward since the source can be assumed to be either
+          * strictly >= 0 or strictly <= 0 depending on the setting of the
+          * negate flag.
+          */
+         inst = emit(MOV(dst, op[0]));
+         inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+         inst = (op[0].negate)
+            ? emit(MOV(dst, brw_imm_f(-1.0f)))
+            : emit(MOV(dst, brw_imm_f(1.0f)));
+         inst->predicate = BRW_PREDICATE_NORMAL;
+      } else if (type_sz(op[0].type) < 8) {
          /* AND(val, 0x80000000) gives the sign bit.
           *
           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
@@ -1920,11 +1714,6 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
          inst = emit(OR(dst, src_reg(dst), brw_imm_ud(0x3f800000u)));
          inst->predicate = BRW_PREDICATE_NORMAL;
          dst.type = BRW_REGISTER_TYPE_F;
-
-         if (instr->dest.saturate) {
-            inst = emit(MOV(dst, src_reg(dst)));
-            inst->saturate = true;
-         }
       } else {
          /* For doubles we do the same but we need to consider:
           *
@@ -1957,7 +1746,7 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
          /* Now convert the result from float to double */
          emit_conversion_to_double(dst, retype(src_reg(tmp),
                                                BRW_REGISTER_TYPE_F),
-                                   instr->dest.saturate);
+                                   false);
       }
       break;
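A standalone model of the 32-bit fsign path (the AND/OR bit trick described in the comment above), with the zero test standing in for the predicated CMP:

#include <cstdint>
#include <cstring>

static float fsign32(float x)
{
   uint32_t u;
   std::memcpy(&u, &x, 4);
   if ((u & 0x7fffffffu) == 0)
      return 0.0f;                      /* the predicated path leaves 0 */
   const uint32_t s = (u & 0x80000000u) /* sign bit */
                    | 0x3f800000u;      /* bit pattern of 1.0f */
   float r;
   std::memcpy(&r, &s, 4);
   return r;                            /* +/-1.0f */
}

int main()
{
   return (fsign32(-2.5f) == -1.0f && fsign32(3.0f) == 1.0f) ? 0 : 1;
}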
 
@@ -2009,7 +1798,7 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
-   case nir_op_bcsel:
+   case nir_op_b32csel:
       enum brw_predicate predicate;
       if (!optimize_predicate(instr, &predicate)) {
          emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
@@ -2112,7 +1901,7 @@ vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
    }
 }
 
-enum ir_texture_opcode
+static enum ir_texture_opcode
 ir_texture_opcode_for_nir_texop(nir_texop texop)
 {
    enum ir_texture_opcode op;
@@ -2136,7 +1925,8 @@ ir_texture_opcode_for_nir_texop(nir_texop texop)
 
    return op;
 }
-const glsl_type *
+
+static const glsl_type *
 glsl_type_for_nir_alu_type(nir_alu_type alu_type,
                            unsigned components)
 {
@@ -2229,6 +2019,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
       case nir_tex_src_offset: {
          nir_const_value *const_offset =
             nir_src_as_const_value(instr->src[i].src);
+         assert(nir_src_bit_size(instr->src[i].src) == 32);
          if (!const_offset ||
              !brw_texture_offset(const_offset->i32,
                                  nir_tex_instr_src_size(instr, i),