struct hash_table *defs;
struct hash_table *phis;
struct hash_table *vars;
+ struct hash_table *verified_interp;
LLVMValueRef main_function;
LLVMBasicBlockRef continue_block;
LLVMValueRef *locals;
};
+static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx,
+ nir_deref_instr *deref_instr,
+ const nir_instr *instr,
+ bool image);
+
static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
nir_deref_instr *deref_instr,
enum ac_descriptor_type desc_type,
const nir_instr *instr,
+ LLVMValueRef index,
bool image, bool write);
static void
LLVMIntPredicate pred, LLVMValueRef src0,
LLVMValueRef src1)
{
+ LLVMTypeRef src0_type = LLVMTypeOf(src0);
+ LLVMTypeRef src1_type = LLVMTypeOf(src1);
+
+ if (LLVMGetTypeKind(src0_type) == LLVMPointerTypeKind &&
+ LLVMGetTypeKind(src1_type) != LLVMPointerTypeKind) {
+ src1 = LLVMBuildIntToPtr(ctx->builder, src1, src0_type, "");
+ } else if (LLVMGetTypeKind(src1_type) == LLVMPointerTypeKind &&
+ LLVMGetTypeKind(src0_type) != LLVMPointerTypeKind) {
+ src0 = LLVMBuildIntToPtr(ctx->builder, src0, src1_type, "");
+ }
+
LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, "");
return LLVMBuildSelect(ctx->builder, result,
LLVMConstInt(ctx->i32, 0xFFFFFFFF, false),
LLVMTypeRef result_type,
LLVMValueRef src0)
{
- char name[64];
+ char name[64], type[64];
LLVMValueRef params[] = {
ac_to_float(ctx, src0),
};
- ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
- ac_get_elem_bits(ctx, result_type));
+ ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
+ ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
assert(length < sizeof(name));
return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE);
}
LLVMTypeRef result_type,
LLVMValueRef src0, LLVMValueRef src1)
{
- char name[64];
+ char name[64], type[64];
LLVMValueRef params[] = {
ac_to_float(ctx, src0),
ac_to_float(ctx, src1),
};
- ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
- ac_get_elem_bits(ctx, result_type));
+ ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
+ ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
assert(length < sizeof(name));
return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE);
}
LLVMTypeRef result_type,
LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2)
{
- char name[64];
+ char name[64], type[64];
LLVMValueRef params[] = {
ac_to_float(ctx, src0),
ac_to_float(ctx, src1),
ac_to_float(ctx, src2),
};
- ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
- ac_get_elem_bits(ctx, result_type));
+ ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
+ ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
assert(length < sizeof(name));
return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE);
}
return result;
}
+/* State shared between enter_waterfall() and exit_waterfall(). */
+struct waterfall_context {
+ /* Predecessors for the exit phis: [0] = block before the "active" if
+  * (reached by lanes skipping this iteration), [1] = last block of the
+  * if body (lanes handled this iteration). */
+ LLVMBasicBlockRef phi_bb[2];
+ /* False when the value was uniform (or NULL) and no loop was emitted;
+  * exit_waterfall() then just passes the value through. */
+ bool use_waterfall;
+};
+
+/* To deal with divergent descriptors we can create a loop that handles all
+ * lanes with the same descriptor on a given iteration (henceforth a
+ * waterfall loop).
+ *
+ * These helpers create the begin and end of the loop, leaving the caller
+ * to implement the body.
+ *
+ * params:
+ * - ctx is the usual nir context
+ * - wctx is a temporary struct containing some loop info. Can be left uninitialized.
+ * - value is the possibly divergent value for which we build the loop
+ * - divergent is whether value is actually divergent. If false we just pass
+ * things through.
+ */
+/* Open a waterfall loop around a possibly-divergent value and return the
+ * per-iteration uniform value. Must be paired with exit_waterfall(). */
+static LLVMValueRef enter_waterfall(struct ac_nir_context *ctx,
+ struct waterfall_context *wctx,
+ LLVMValueRef value, bool divergent)
+{
+ /* If the app claims the value is divergent but it is constant we can
+ * end up with a dynamic index of NULL. */
+ if (!value)
+ divergent = false;
+
+ wctx->use_waterfall = divergent;
+ if (!divergent)
+ return value;
+
+ ac_build_bgnloop(&ctx->ac, 6000);
+
+ /* Broadcast one lane's value; this is the uniform value handled on this
+ * loop iteration. */
+ LLVMValueRef scalar_value = ac_build_readlane(&ctx->ac, value, NULL);
+
+ /* Lanes whose value matches the broadcast value participate in this
+ * iteration; the rest wait for a later one. */
+ LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, value,
+ scalar_value, "uniform_active");
+
+ /* Record the block feeding the inactive-lane edge of the exit phis. */
+ wctx->phi_bb[0] = LLVMGetInsertBlock(ctx->ac.builder);
+ ac_build_ifcc(&ctx->ac, active, 6001);
+
+ return scalar_value;
+}
+
+/* Close a waterfall loop opened by enter_waterfall(). 'value' is the result
+ * computed inside the loop body (may be NULL if there is none); returns the
+ * value merged across iterations via a phi, or 'value' unchanged when no
+ * loop was emitted. */
+static LLVMValueRef exit_waterfall(struct ac_nir_context *ctx,
+ struct waterfall_context *wctx,
+ LLVMValueRef value)
+{
+ LLVMValueRef ret = NULL;
+ LLVMValueRef phi_src[2];
+ /* 0 on the inactive edge, ~0 on the active edge: becomes the
+ * "did this lane run the body" exit condition below. */
+ LLVMValueRef cc_phi_src[2] = {
+ LLVMConstInt(ctx->ac.i32, 0, false),
+ LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
+ };
+
+ if (!wctx->use_waterfall)
+ return value;
+
+ /* Record the block feeding the active-lane edge of the exit phis. */
+ wctx->phi_bb[1] = LLVMGetInsertBlock(ctx->ac.builder);
+
+ ac_build_endif(&ctx->ac, 6001);
+
+ if (value) {
+ /* Inactive lanes contribute undef; only lanes that executed the body
+ * produced a real value. */
+ phi_src[0] = LLVMGetUndef(LLVMTypeOf(value));
+ phi_src[1] = value;
+
+ ret = ac_build_phi(&ctx->ac, LLVMTypeOf(value), 2, phi_src, wctx->phi_bb);
+ }
+
+ /*
+ * By using the optimization barrier on the exit decision, we decouple
+ * the operations from the break, and hence avoid LLVM hoisting the
+ * operation into the break block.
+ */
+ LLVMValueRef cc = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, cc_phi_src, wctx->phi_bb);
+ ac_build_optimization_barrier(&ctx->ac, &cc);
+
+ /* Lanes that ran the body this iteration break out of the loop. */
+ LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, cc, ctx->ac.i32_0, "uniform_active2");
+ ac_build_ifcc(&ctx->ac, active, 6002);
+ ac_build_break(&ctx->ac);
+ ac_build_endif(&ctx->ac, 6002);
+
+ ac_build_endloop(&ctx->ac, 6000);
+ return ret;
+}
+
static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
{
LLVMValueRef src[4], result = NULL;
unsigned num_components = instr->dest.dest.ssa.num_components;
unsigned src_components;
LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa);
+ bool saved_inexact = false;
+
+ if (instr->exact)
+ saved_inexact = ac_disable_inexact_math(ctx->ac.builder);
assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src));
switch (instr->op) {
result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], "");
break;
case nir_op_frcp:
- src[0] = ac_to_float(&ctx->ac, src[0]);
- result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(src[0]), 1.0), src[0]);
+ /* For doubles, we need precise division to pass GLCTS. */
+ if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL &&
+ ac_get_type_size(def_type) == 8) {
+ result = LLVMBuildFDiv(ctx->ac.builder, ctx->ac.f64_1,
+ ac_to_float(&ctx->ac, src[0]), "");
+ } else {
+ result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rcp",
+ ac_to_float_type(&ctx->ac, def_type), src[0]);
+ }
break;
case nir_op_iand:
result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], "");
ac_to_float_type(&ctx->ac, def_type), src[0]);
break;
case nir_op_frsq:
- result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
- ac_to_float_type(&ctx->ac, def_type), src[0]);
- result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(result), 1.0), result);
+ result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rsq",
+ ac_to_float_type(&ctx->ac, def_type), src[0]);
break;
case nir_op_frexp_exp:
src[0] = ac_to_float(&ctx->ac, src[0]);
result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
break;
case nir_op_f2f16_rtz:
+ case nir_op_f2f16:
+ case nir_op_f2fmp:
src[0] = ac_to_float(&ctx->ac, src[0]);
- if (LLVMTypeOf(src[0]) == ctx->ac.f64)
- src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
- LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 };
- result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
- result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
+
+ /* For OpenGL, we want fast packing with v_cvt_pkrtz_f16, but if we use it,
+ * all f32->f16 conversions have to round towards zero, because both scalar
+ * and vec2 down-conversions have to round equally.
+ */
+ if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL ||
+ instr->op == nir_op_f2f16_rtz) {
+ src[0] = ac_to_float(&ctx->ac, src[0]);
+
+ if (LLVMTypeOf(src[0]) == ctx->ac.f64)
+ src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
+
+ /* Fast path conversion. This only works if NIR is vectorized
+ * to vec2 16.
+ */
+ if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) {
+ LLVMValueRef args[] = {
+ ac_llvm_extract_elem(&ctx->ac, src[0], 0),
+ ac_llvm_extract_elem(&ctx->ac, src[0], 1),
+ };
+ result = ac_build_cvt_pkrtz_f16(&ctx->ac, args);
+ break;
+ }
+
+ assert(ac_get_llvm_num_components(src[0]) == 1);
+ LLVMValueRef param[2] = { src[0], LLVMGetUndef(ctx->ac.f32) };
+ result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
+ result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
+ } else {
+ if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
+ result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
+ else
+ result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
+ }
break;
case nir_op_f2f16_rtne:
- case nir_op_f2f16:
case nir_op_f2f32:
case nir_op_f2f64:
src[0] = ac_to_float(&ctx->ac, src[0]);
break;
case nir_op_u2u8:
case nir_op_u2u16:
+ case nir_op_u2ump:
case nir_op_u2u32:
case nir_op_u2u64:
if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
break;
case nir_op_i2i8:
case nir_op_i2i16:
+ case nir_op_i2imp:
case nir_op_i2i32:
case nir_op_i2i64:
if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
result = ac_to_integer_or_pointer(&ctx->ac, result);
ctx->ssa_defs[instr->dest.dest.ssa.index] = result;
}
+
+ if (instr->exact)
+ ac_restore_inexact_math(ctx->ac.builder, saved_inexact);
}
static void visit_load_const(struct ac_nir_context *ctx,
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
+ assert(instr->dest.is_ssa);
return ac_build_buffer_load_format(&ctx->ac,
args->resource,
args->coords[0],
ctx->ac.i32_0,
util_last_bit(mask),
- 0, true);
+ 0, true,
+ instr->dest.ssa.bit_size == 16);
}
args->opcode = ac_image_sample;
break;
case nir_texop_tg4:
args->opcode = ac_image_gather4;
- args->level_zero = true;
+ if (!args->lod && !args->bias)
+ args->level_zero = true;
break;
case nir_texop_lod:
args->opcode = ac_image_get_lod;
break;
+ case nir_texop_fragment_fetch:
+ case nir_texop_fragment_mask_fetch:
+ args->opcode = ac_image_load;
+ args->level_zero = false;
+ break;
default:
break;
}
if (instr->dest.ssa.bit_size == 8) {
unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1;
- LLVMTypeRef vec_type = LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 4 * load_dwords);
+ LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i8, 4 * load_dwords);
ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
LLVMValueRef params[3];
if (load_dwords > 1) {
- LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i32, 2), "");
+ LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.v2i32, "");
params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 1, false), "");
params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 0, false), "");
} else {
res = LLVMBuildTrunc(ctx->ac.builder, res, LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), "");
if (instr->dest.ssa.num_components > 1)
- res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), instr->dest.ssa.num_components), "");
+ res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i8, instr->dest.ssa.num_components), "");
return res;
} else if (instr->dest.ssa.bit_size == 16) {
unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1;
- LLVMTypeRef vec_type = LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords);
+ LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i16, 2 * load_dwords);
ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
res = LLVMBuildBitCast(ctx->ac.builder, res, vec_type, "");
}
if (access & ACCESS_STREAM_CACHE_POLICY)
- cache_policy |= ac_slc;
+ cache_policy |= ac_slc | ac_glc;
return cache_policy;
}
+/* Waterfall-loop entry for an SSBO access: loops over the buffer index in
+ * 'src' only when the intrinsic is marked ACCESS_NON_UNIFORM. */
+static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx,
+ struct waterfall_context *wctx,
+ const nir_intrinsic_instr *instr,
+ nir_src src)
+{
+ return enter_waterfall(ctx, wctx, get_src(ctx, src),
+ nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
+}
+
static void visit_store_ssbo(struct ac_nir_context *ctx,
nir_intrinsic_instr *instr)
{
+ if (ctx->ac.postponed_kill) {
+ LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
+ ctx->ac.postponed_kill, "");
+ ac_build_ifcc(&ctx->ac, cond, 7000);
+ }
+
LLVMValueRef src_data = get_src(ctx, instr->src[0]);
int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8;
unsigned writemask = nir_intrinsic_write_mask(instr);
bool writeonly_memory = access & ACCESS_NON_READABLE;
unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory);
- LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
- get_src(ctx, instr->src[1]), true);
+ struct waterfall_context wctx;
+ LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]);
+
+ LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true);
LLVMValueRef base_data = src_data;
base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components);
LLVMValueRef base_offset = get_src(ctx, instr->src[2]);
count = 1;
num_bytes = 2;
}
+
+ /* Due to alignment issues, split stores of 8-bit/16-bit
+ * vectors.
+ */
+ if (ctx->ac.chip_class == GFX6 && count > 1 && elem_size_bytes < 4) {
+ writemask |= ((1u << (count - 1)) - 1u) << (start + 1);
+ count = 1;
+ num_bytes = elem_size_bytes;
+ }
+
data = extract_vector_range(&ctx->ac, base_data, start, count);
offset = LLVMBuildAdd(ctx->ac.builder, base_offset,
cache_policy);
}
}
+
+ exit_waterfall(ctx, &wctx, NULL);
+
+ if (ctx->ac.postponed_kill)
+ ac_build_endif(&ctx->ac, 7000);
}
static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx,
}
static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx,
- const nir_intrinsic_instr *instr)
+ nir_intrinsic_instr *instr)
{
+ if (ctx->ac.postponed_kill) {
+ LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
+ ctx->ac.postponed_kill, "");
+ ac_build_ifcc(&ctx->ac, cond, 7001);
+ }
+
LLVMTypeRef return_type = LLVMTypeOf(get_src(ctx, instr->src[2]));
const char *op;
char name[64], type[8];
LLVMValueRef params[6], descriptor;
+ LLVMValueRef result;
int arg_count = 0;
+ struct waterfall_context wctx;
+ LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]);
+
switch (instr->intrinsic) {
case nir_intrinsic_ssbo_atomic_add:
op = "add";
}
descriptor = ctx->abi->load_ssbo(ctx->abi,
- get_src(ctx, instr->src[0]),
+ rsrc_base,
true);
if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap &&
return_type == ctx->ac.i64) {
- return emit_ssbo_comp_swap_64(ctx, descriptor,
- get_src(ctx, instr->src[1]),
- get_src(ctx, instr->src[2]),
- get_src(ctx, instr->src[3]));
- }
- if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) {
- params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0);
- }
- params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
- params[arg_count++] = descriptor;
+ result = emit_ssbo_comp_swap_64(ctx, descriptor,
+ get_src(ctx, instr->src[1]),
+ get_src(ctx, instr->src[2]),
+ get_src(ctx, instr->src[3]));
+ } else {
+ if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) {
+ params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0);
+ }
+ params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
+ params[arg_count++] = descriptor;
- if (LLVM_VERSION_MAJOR >= 9) {
- /* XXX: The new raw/struct atomic intrinsics are buggy with
- * LLVM 8, see r358579.
- */
- params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
- params[arg_count++] = ctx->ac.i32_0; /* soffset */
- params[arg_count++] = ctx->ac.i32_0; /* slc */
+ if (LLVM_VERSION_MAJOR >= 9) {
+ /* XXX: The new raw/struct atomic intrinsics are buggy with
+ * LLVM 8, see r358579.
+ */
+ params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
+ params[arg_count++] = ctx->ac.i32_0; /* soffset */
+ params[arg_count++] = ctx->ac.i32_0; /* slc */
+
+ ac_build_type_name_for_intr(return_type, type, sizeof(type));
+ snprintf(name, sizeof(name),
+ "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type);
+ } else {
+ params[arg_count++] = ctx->ac.i32_0; /* vindex */
+ params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
+ params[arg_count++] = ctx->ac.i1false; /* slc */
- ac_build_type_name_for_intr(return_type, type, sizeof(type));
- snprintf(name, sizeof(name),
- "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type);
- } else {
- params[arg_count++] = ctx->ac.i32_0; /* vindex */
- params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
- params[arg_count++] = ctx->ac.i1false; /* slc */
+ assert(return_type == ctx->ac.i32);
+ snprintf(name, sizeof(name),
+ "llvm.amdgcn.buffer.atomic.%s", op);
+ }
- assert(return_type == ctx->ac.i32);
- snprintf(name, sizeof(name),
- "llvm.amdgcn.buffer.atomic.%s", op);
+ result = ac_build_intrinsic(&ctx->ac, name, return_type, params,
+ arg_count, 0);
}
- return ac_build_intrinsic(&ctx->ac, name, return_type, params,
- arg_count, 0);
+ result = exit_waterfall(ctx, &wctx, result);
+ if (ctx->ac.postponed_kill)
+ ac_build_endif(&ctx->ac, 7001);
+ return result;
}
static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
- const nir_intrinsic_instr *instr)
+ nir_intrinsic_instr *instr)
{
+ struct waterfall_context wctx;
+ LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]);
+
int elem_size_bytes = instr->dest.ssa.bit_size / 8;
int num_components = instr->num_components;
enum gl_access_qualifier access = nir_intrinsic_access(instr);
unsigned cache_policy = get_cache_policy(ctx, access, false, false);
LLVMValueRef offset = get_src(ctx, instr->src[1]);
- LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
- get_src(ctx, instr->src[0]), false);
+ LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, false);
LLVMValueRef vindex = ctx->ac.i32_0;
LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa);
i += num_elems;
}
- return ac_build_gather_values(&ctx->ac, results, num_components);
+ LLVMValueRef ret = ac_build_gather_values(&ctx->ac, results, num_components);
+ return exit_waterfall(ctx, &wctx, ret);
+}
+
+/* Waterfall-loop entry for a UBO access: loops over the buffer index in
+ * src[0] only when the intrinsic is marked ACCESS_NON_UNIFORM. */
+static LLVMValueRef enter_waterfall_ubo(struct ac_nir_context *ctx,
+ struct waterfall_context *wctx,
+ const nir_intrinsic_instr *instr)
+{
+ return enter_waterfall(ctx, wctx, get_src(ctx, instr->src[0]),
+ nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
}
static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx,
- const nir_intrinsic_instr *instr)
+ nir_intrinsic_instr *instr)
{
+ struct waterfall_context wctx;
+ LLVMValueRef rsrc_base = enter_waterfall_ubo(ctx, &wctx, instr);
+
LLVMValueRef ret;
- LLVMValueRef rsrc = get_src(ctx, instr->src[0]);
+ LLVMValueRef rsrc = rsrc_base;
LLVMValueRef offset = get_src(ctx, instr->src[1]);
int num_components = instr->num_components;
ret = ac_trim_vector(&ctx->ac, ret, num_components);
}
- return LLVMBuildBitCast(ctx->ac.builder, ret,
+ ret = LLVMBuildBitCast(ctx->ac.builder, ret,
get_def_type(ctx, &instr->dest.ssa), "");
+
+ return exit_waterfall(ctx, &wctx, ret);
}
static void
break;
case nir_var_mem_global: {
LLVMValueRef address = get_src(ctx, instr->src[0]);
+ LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa);
unsigned explicit_stride = glsl_get_explicit_stride(deref->type);
unsigned natural_stride = type_scalar_size_bytes(deref->type);
unsigned stride = explicit_stride ? explicit_stride : natural_stride;
+ int elem_size_bytes = ac_get_elem_bits(&ctx->ac, result_type) / 8;
+ bool split_loads = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4;
- LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa);
- if (stride != natural_stride) {
- LLVMTypeRef ptr_type = LLVMPointerType(LLVMGetElementType(result_type),
- LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
+ if (stride != natural_stride || split_loads) {
+ if (LLVMGetTypeKind(result_type) == LLVMVectorTypeKind)
+ result_type = LLVMGetElementType(result_type);
+
+ LLVMTypeRef ptr_type = LLVMPointerType(result_type,
+ LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , "");
for (unsigned i = 0; i < instr->dest.ssa.num_components; ++i) {
LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * stride / natural_stride, 0);
values[i] = LLVMBuildLoad(ctx->ac.builder,
ac_build_gep_ptr(&ctx->ac, address, offset), "");
+
+ if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
+ LLVMSetOrdering(values[i], LLVMAtomicOrderingMonotonic);
}
return ac_build_gather_values(&ctx->ac, values, instr->dest.ssa.num_components);
} else {
LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , "");
LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, "");
+
+ if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
+ LLVMSetOrdering(val, LLVMAtomicOrderingMonotonic);
return val;
}
}
visit_store_var(struct ac_nir_context *ctx,
nir_intrinsic_instr *instr)
{
+ if (ctx->ac.postponed_kill) {
+ LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
+ ctx->ac.postponed_kill, "");
+ ac_build_ifcc(&ctx->ac, cond, 7002);
+ }
+
nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
nir_variable *var = nir_deref_instr_get_variable(deref);
ctx->abi->store_tcs_outputs(ctx->abi, var,
vertex_index, indir_index,
const_index, src, writemask);
- return;
+ break;
}
for (unsigned chan = 0; chan < 8; chan++) {
unsigned explicit_stride = glsl_get_explicit_stride(deref->type);
unsigned natural_stride = type_scalar_size_bytes(deref->type);
unsigned stride = explicit_stride ? explicit_stride : natural_stride;
+ int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(val)) / 8;
+ bool split_stores = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4;
LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val),
LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , "");
if (writemask == (1u << ac_get_llvm_num_components(val)) - 1 &&
- stride == natural_stride) {
- LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val),
- LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
+ stride == natural_stride && !split_stores) {
+ LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val),
+ LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , "");
val = LLVMBuildBitCast(ctx->ac.builder, val,
LLVMGetElementType(LLVMTypeOf(address)), "");
- LLVMBuildStore(ctx->ac.builder, val, address);
+ LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, val, address);
+
+ if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
+ LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic);
} else {
- LLVMTypeRef ptr_type = LLVMPointerType(LLVMGetElementType(LLVMTypeOf(val)),
- LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
+ LLVMTypeRef val_type = LLVMTypeOf(val);
+ if (LLVMGetTypeKind(LLVMTypeOf(val)) == LLVMVectorTypeKind)
+ val_type = LLVMGetElementType(val_type);
+
+ LLVMTypeRef ptr_type = LLVMPointerType(val_type,
+ LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , "");
for (unsigned chan = 0; chan < 4; chan++) {
if (!(writemask & (1 << chan)))
chan);
src = LLVMBuildBitCast(ctx->ac.builder, src,
LLVMGetElementType(LLVMTypeOf(ptr)), "");
- LLVMBuildStore(ctx->ac.builder, src, ptr);
+ LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, src, ptr);
+
+ if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
+ LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic);
}
}
break;
abort();
break;
}
+
+ if (ctx->ac.postponed_kill)
+ ac_build_endif(&ctx->ac, 7002);
}
static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx,
const nir_intrinsic_instr *instr,
+ LLVMValueRef dynamic_index,
enum ac_descriptor_type desc_type,
bool write)
{
instr->src[0].ssa->parent_instr->type == nir_instr_type_deref ?
nir_instr_as_deref(instr->src[0].ssa->parent_instr) : NULL;
- return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, true, write);
+ return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, dynamic_index, true, write);
}
static void get_image_coords(struct ac_nir_context *ctx,
const nir_intrinsic_instr *instr,
+ LLVMValueRef dynamic_desc_index,
struct ac_image_args *args,
enum glsl_sampler_dim dim,
bool is_array)
fmask_load_address[2],
sample_index,
get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
- AC_DESC_FMASK, &instr->instr, true, false));
+ AC_DESC_FMASK, &instr->instr, dynamic_desc_index, true, false));
}
if (count == 1 && !gfx9_1d) {
if (instr->src[1].ssa->num_components)
static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx,
const nir_intrinsic_instr *instr,
+ LLVMValueRef dynamic_index,
bool write, bool atomic)
{
- LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_BUFFER, write);
+ LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, write);
if (ctx->ac.chip_class == GFX9 && LLVM_VERSION_MAJOR < 9 && atomic) {
LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), "");
LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), "");
return rsrc;
}
+/* Waterfall-loop entry for an image access: derives the descriptor index
+ * from the image deref (when src[0] is a deref; NULL deref for bindless)
+ * and loops over it only when the access is marked ACCESS_NON_UNIFORM. */
+static LLVMValueRef enter_waterfall_image(struct ac_nir_context *ctx,
+ struct waterfall_context *wctx,
+ const nir_intrinsic_instr *instr)
+{
+ nir_deref_instr *deref_instr = NULL;
+
+ if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref)
+ deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
+
+ LLVMValueRef value = get_sampler_desc_index(ctx, deref_instr, &instr->instr, true);
+ return enter_waterfall(ctx, wctx, value, nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
+}
+
static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
const nir_intrinsic_instr *instr,
bool bindless)
LLVMValueRef res;
enum glsl_sampler_dim dim;
- enum gl_access_qualifier access;
+ enum gl_access_qualifier access = nir_intrinsic_access(instr);
bool is_array;
if (bindless) {
dim = nir_intrinsic_image_dim(instr);
- access = nir_intrinsic_access(instr);
is_array = nir_intrinsic_image_array(instr);
} else {
const nir_deref_instr *image_deref = get_image_deref(instr);
const struct glsl_type *type = image_deref->type;
const nir_variable *var = nir_deref_instr_get_variable(image_deref);
dim = glsl_get_sampler_dim(type);
- access = var->data.access;
+ access |= var->data.access;
is_array = glsl_sampler_type_is_array(type);
}
+ struct waterfall_context wctx;
+ LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+
struct ac_image_args args = {};
args.cache_policy = get_cache_policy(ctx, access, false, false);
unsigned num_channels = util_last_bit(mask);
LLVMValueRef rsrc, vindex;
- rsrc = get_image_buffer_descriptor(ctx, instr, false, false);
+ rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, false, false);
vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
ctx->ac.i32_0, "");
+ assert(instr->dest.is_ssa);
bool can_speculate = access & ACCESS_CAN_REORDER;
res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex,
ctx->ac.i32_0, num_channels,
args.cache_policy,
- can_speculate);
+ can_speculate,
+ instr->dest.ssa.bit_size == 16);
res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels);
res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components);
res = ac_to_integer(&ctx->ac, res);
} else {
- args.opcode = ac_image_load;
- args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
- get_image_coords(ctx, instr, &args, dim, is_array);
+ bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
+
+ args.opcode = level_zero ? ac_image_load : ac_image_load_mip;
+ args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
+ get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
+ if (!level_zero)
+ args.lod = get_src(ctx, instr->src[3]);
args.dmask = 15;
args.attributes = AC_FUNC_ATTR_READONLY;
+ assert(instr->dest.is_ssa);
+ args.d16 = instr->dest.ssa.bit_size == 16;
+
res = ac_build_image_opcode(&ctx->ac, &args);
}
- return res;
+ return exit_waterfall(ctx, &wctx, res);
}
static void visit_image_store(struct ac_nir_context *ctx,
- nir_intrinsic_instr *instr,
+ const nir_intrinsic_instr *instr,
bool bindless)
{
-
+ if (ctx->ac.postponed_kill) {
+ LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
+ ctx->ac.postponed_kill, "");
+ ac_build_ifcc(&ctx->ac, cond, 7003);
+ }
enum glsl_sampler_dim dim;
- enum gl_access_qualifier access;
+ enum gl_access_qualifier access = nir_intrinsic_access(instr);
bool is_array;
+
if (bindless) {
dim = nir_intrinsic_image_dim(instr);
- access = nir_intrinsic_access(instr);
is_array = nir_intrinsic_image_array(instr);
} else {
const nir_deref_instr *image_deref = get_image_deref(instr);
const struct glsl_type *type = image_deref->type;
const nir_variable *var = nir_deref_instr_get_variable(image_deref);
dim = glsl_get_sampler_dim(type);
- access = var->data.access;
+ access |= var->data.access;
is_array = glsl_sampler_type_is_array(type);
}
+ struct waterfall_context wctx;
+ LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+
bool writeonly_memory = access & ACCESS_NON_READABLE;
struct ac_image_args args = {};
args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory);
if (dim == GLSL_SAMPLER_DIM_BUF) {
- LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false);
+ LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, false);
LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3]));
unsigned src_channels = ac_get_llvm_num_components(src);
LLVMValueRef vindex;
ctx->ac.i32_0, "");
ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex,
- ctx->ac.i32_0, src_channels,
- args.cache_policy);
+ ctx->ac.i32_0, args.cache_policy);
} else {
- args.opcode = ac_image_store;
+ bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
+
+ args.opcode = level_zero ? ac_image_store : ac_image_store_mip;
args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3]));
- args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true);
- get_image_coords(ctx, instr, &args, dim, is_array);
+ args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true);
+ get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
+ if (!level_zero)
+ args.lod = get_src(ctx, instr->src[4]);
args.dmask = 15;
+ args.d16 = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.data[0])) == 16;
ac_build_image_opcode(&ctx->ac, &args);
}
+ exit_waterfall(ctx, &wctx, NULL);
+ if (ctx->ac.postponed_kill)
+ ac_build_endif(&ctx->ac, 7003);
}
static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
- const nir_intrinsic_instr *instr,
- bool bindless)
+ const nir_intrinsic_instr *instr,
+ bool bindless)
{
+ if (ctx->ac.postponed_kill) {
+ LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
+ ctx->ac.postponed_kill, "");
+ ac_build_ifcc(&ctx->ac, cond, 7004);
+ }
+
LLVMValueRef params[7];
int param_count = 0;
is_array = glsl_sampler_type_is_array(type);
}
+ struct waterfall_context wctx;
+ LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+
switch (instr->intrinsic) {
case nir_intrinsic_bindless_image_atomic_add:
case nir_intrinsic_image_deref_atomic_add:
case nir_intrinsic_image_deref_atomic_inc_wrap: {
atomic_name = "inc";
atomic_subop = ac_atomic_inc_wrap;
- /* ATOMIC_INC instruction does:
- * value = (value + 1) % (data + 1)
- * but we want:
- * value = (value + 1) % data
- * So replace 'data' by 'data - 1'.
- */
- ctx->ssa_defs[instr->src[3].ssa->index] =
- LLVMBuildSub(ctx->ac.builder,
- ctx->ssa_defs[instr->src[3].ssa->index],
- ctx->ac.i32_1, "");
break;
}
case nir_intrinsic_bindless_image_atomic_dec_wrap:
params[param_count++] = get_src(ctx, instr->src[4]);
params[param_count++] = get_src(ctx, instr->src[3]);
+ LLVMValueRef result;
if (dim == GLSL_SAMPLER_DIM_BUF) {
- params[param_count++] = get_image_buffer_descriptor(ctx, instr, true, true);
+ params[param_count++] = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, true);
params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
ctx->ac.i32_0, ""); /* vindex */
params[param_count++] = ctx->ac.i32_0; /* voffset */
}
assert(length < sizeof(intrinsic_name));
- return ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32,
- params, param_count, 0);
+ result = ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32,
+ params, param_count, 0);
} else {
struct ac_image_args args = {};
args.opcode = cmpswap ? ac_image_atomic_cmpswap : ac_image_atomic;
args.data[0] = params[0];
if (cmpswap)
args.data[1] = params[1];
- args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true);
- get_image_coords(ctx, instr, &args, dim, is_array);
+ args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true);
+ get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
- return ac_build_image_opcode(&ctx->ac, &args);
+ result = ac_build_image_opcode(&ctx->ac, &args);
}
+
+ result = exit_waterfall(ctx, &wctx, result);
+ if (ctx->ac.postponed_kill)
+ ac_build_endif(&ctx->ac, 7004);
+ return result;
}
static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx,
- const nir_intrinsic_instr *instr)
+ nir_intrinsic_instr *instr)
{
- LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
+ /* Handle a possibly divergent image index: enter_waterfall_image()
+ * splits execution into per-uniform-index iterations before the
+ * descriptor is loaded. */
+ struct waterfall_context wctx;
+ LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+ LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
- return ac_build_image_get_sample_count(&ctx->ac, rsrc);
+ LLVMValueRef ret = ac_build_image_get_sample_count(&ctx->ac, rsrc);
+
+ /* Close the waterfall loop and merge the per-iteration results. */
+ return exit_waterfall(ctx, &wctx, ret);
}
static LLVMValueRef visit_image_size(struct ac_nir_context *ctx,
is_array = glsl_sampler_type_is_array(type);
}
- if (dim == GLSL_SAMPLER_DIM_BUF)
- return get_buffer_size(ctx, get_image_descriptor(ctx, instr, AC_DESC_BUFFER, false), true);
+ struct waterfall_context wctx;
+ LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
- struct ac_image_args args = { 0 };
+ if (dim == GLSL_SAMPLER_DIM_BUF) {
+ res = get_buffer_size(ctx, get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, false), true);
+ } else {
- args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
- args.dmask = 0xf;
- args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
- args.opcode = ac_image_get_resinfo;
- args.lod = ctx->ac.i32_0;
- args.attributes = AC_FUNC_ATTR_READNONE;
+ struct ac_image_args args = { 0 };
- res = ac_build_image_opcode(&ctx->ac, &args);
+ args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
+ args.dmask = 0xf;
+ args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
+ args.opcode = ac_image_get_resinfo;
+ args.lod = ctx->ac.i32_0;
+ args.attributes = AC_FUNC_ATTR_READNONE;
- LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
+ res = ac_build_image_opcode(&ctx->ac, &args);
- if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) {
- LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
- LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
- z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
- res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, "");
- }
- if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
- LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
- res = LLVMBuildInsertElement(ctx->ac.builder, res, layers,
- ctx->ac.i32_1, "");
+ LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
+ if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) {
+ LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
+ LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
+ z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
+ res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, "");
+ }
+
+ if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
+ LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
+ res = LLVMBuildInsertElement(ctx->ac.builder, res, layers,
+ ctx->ac.i32_1, "");
+ }
}
- return res;
+ return exit_waterfall(ctx, &wctx, res);
}
static void emit_membar(struct ac_llvm_context *ac,
case nir_intrinsic_group_memory_barrier:
wait_flags = AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE;
break;
- case nir_intrinsic_memory_barrier_atomic_counter:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
wait_flags = AC_WAIT_VLOAD | AC_WAIT_VSTORE;
cond = ctx->ac.i1false;
}
- ctx->abi->emit_kill(ctx->abi, cond);
+ ac_build_kill_if_false(&ctx->ac, cond);
+}
+
+/* Lower nir_intrinsic_demote / nir_intrinsic_demote_if.
+ *
+ * Unlike a plain discard, a demoted lane must keep running as a helper
+ * invocation so derivatives in its quad stay valid: the kill below is
+ * wrapped in a WQM vote, and the demote is additionally recorded in
+ * ctx->ac.postponed_kill (true = still live) so that side effects
+ * elsewhere (stores, atomics) can be suppressed for demoted lanes.
+ */
+static void emit_demote(struct ac_nir_context *ctx,
+ const nir_intrinsic_instr *instr)
+{
+ LLVMValueRef cond;
+
+ if (instr->intrinsic == nir_intrinsic_demote_if) {
+ /* cond is the "keep this lane" condition: true when the NIR
+ * demote condition evaluates to false. */
+ cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
+ get_src(ctx, instr->src[0]),
+ ctx->ac.i32_0, "");
+ } else {
+ assert(instr->intrinsic == nir_intrinsic_demote);
+ cond = ctx->ac.i1false;
+ }
+
+ /* Kill immediately while maintaining WQM. */
+ ac_build_kill_if_false(&ctx->ac, ac_build_wqm_vote(&ctx->ac, cond));
+
+ /* AND the demote into the postponed-kill mask. NOTE(review): this
+ * dereferences ctx->ac.postponed_kill unconditionally — it is
+ * allocated when nir->info.fs.uses_demote is set, which should hold
+ * whenever a demote intrinsic appears; confirm for all callers. */
+ LLVMValueRef mask = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
+ mask = LLVMBuildAnd(ctx->ac.builder, mask, cond, "");
+ LLVMBuildStore(ctx->ac.builder, mask, ctx->ac.postponed_kill);
+}
static LLVMValueRef
const nir_intrinsic_instr *instr,
LLVMValueRef ptr, int src_idx)
{
+ if (ctx->ac.postponed_kill) {
+ LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
+ ctx->ac.postponed_kill, "");
+ ac_build_ifcc(&ctx->ac, cond, 7005);
+ }
+
LLVMValueRef result;
LLVMValueRef src = get_src(ctx, instr->src[src_idx]);
const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup";
+ if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref) {
+ nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
+ if (deref->mode == nir_var_mem_global) {
+ /* use "singlethread" sync scope to implement relaxed ordering */
+ sync_scope = LLVM_VERSION_MAJOR >= 9 ? "singlethread-one-as" : "singlethread";
+
+ LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(src), LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)));
+ ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ptr_type , "");
+ }
+ }
+
if (instr->intrinsic == nir_intrinsic_shared_atomic_comp_swap ||
instr->intrinsic == nir_intrinsic_deref_atomic_comp_swap) {
LLVMValueRef src1 = get_src(ctx, instr->src[src_idx + 1]);
case nir_intrinsic_deref_atomic_exchange:
op = LLVMAtomicRMWBinOpXchg;
break;
+#if LLVM_VERSION_MAJOR >= 10
+ case nir_intrinsic_shared_atomic_fadd:
+ case nir_intrinsic_deref_atomic_fadd:
+ op = LLVMAtomicRMWBinOpFAdd;
+ break;
+#endif
default:
return NULL;
}
- result = ac_build_atomic_rmw(&ctx->ac, op, ptr, ac_to_integer(&ctx->ac, src), sync_scope);
+ LLVMValueRef val;
+
+ if (instr->intrinsic == nir_intrinsic_shared_atomic_fadd ||
+ instr->intrinsic == nir_intrinsic_deref_atomic_fadd) {
+ val = ac_to_float(&ctx->ac, src);
+ } else {
+ val = ac_to_integer(&ctx->ac, src);
+ }
+
+ result = ac_build_atomic_rmw(&ctx->ac, op, ptr, val, sync_scope);
}
+
+ if (ctx->ac.postponed_kill)
+ ac_build_endif(&ctx->ac, 7005);
return result;
}
return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, "");
}
+/* Implements nir_intrinsic_load_barycentric_model: returns the
+ * three-component pull-model interpolation argument
+ * (ctx->args->pull_model) bitcast to v3i32 for SSA bookkeeping. */
+static LLVMValueRef barycentric_model(struct ac_nir_context *ctx)
+{
+ return LLVMBuildBitCast(ctx->ac.builder,
+ ac_get_arg(&ctx->ac, ctx->args->pull_model),
+ ctx->ac.v3i32, "");
+}
+
static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx,
LLVMValueRef interp_param,
unsigned index, unsigned comp_start,
unsigned bitsize)
{
LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false);
+ LLVMValueRef interp_param_f;
- interp_param = LLVMBuildBitCast(ctx->ac.builder,
+ interp_param_f = LLVMBuildBitCast(ctx->ac.builder,
interp_param, ctx->ac.v2f32, "");
LLVMValueRef i = LLVMBuildExtractElement(
- ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
+ ctx->ac.builder, interp_param_f, ctx->ac.i32_0, "");
LLVMValueRef j = LLVMBuildExtractElement(
- ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
+ ctx->ac.builder, interp_param_f, ctx->ac.i32_1, "");
+
+ /* Workaround for issue 2647: kill threads with infinite interpolation coeffs */
+ if (ctx->verified_interp &&
+ !_mesa_hash_table_search(ctx->verified_interp, interp_param)) {
+ LLVMValueRef args[2];
+ args[0] = i;
+ args[1] = LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, false);
+ LLVMValueRef cond = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
+ args, 2, AC_FUNC_ATTR_READNONE);
+ ac_build_kill_if_false(&ctx->ac, LLVMBuildNot(ctx->ac.builder, cond, ""));
+ _mesa_hash_table_insert(ctx->verified_interp, interp_param, interp_param);
+ }
LLVMValueRef values[4];
assert(bitsize == 16 || bitsize == 32);
return ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, num_components));
}
-static LLVMValueRef load_flat_input(struct ac_nir_context *ctx,
- unsigned index, unsigned comp_start,
- unsigned num_components,
- unsigned bit_size)
+static LLVMValueRef load_input(struct ac_nir_context *ctx,
+ nir_intrinsic_instr *instr)
{
- LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false);
+ unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1;
+
+ /* We only lower inputs for fragment shaders ATM */
+ ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[offset_idx]);
+ assert(offset);
+ assert(offset[0].i32 == 0);
+
+ unsigned component = nir_intrinsic_component(instr);
+ unsigned index = nir_intrinsic_base(instr);
+ unsigned vertex_id = 2; /* P0 */
+
+ if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
+ nir_const_value *src0 = nir_src_as_const_value(instr->src[0]);
+ switch (src0[0].i32) {
+ case 0:
+ vertex_id = 2;
+ break;
+ case 1:
+ vertex_id = 0;
+ break;
+ case 2:
+ vertex_id = 1;
+ break;
+ default:
+ unreachable("Invalid vertex index");
+ }
+ }
+
+ LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false);
LLVMValueRef values[8];
/* Each component of a 64-bit value takes up two GL-level channels. */
+ unsigned num_components = instr->dest.ssa.num_components;
+ unsigned bit_size = instr->dest.ssa.bit_size;
unsigned channels =
bit_size == 64 ? num_components * 2 : num_components;
for (unsigned chan = 0; chan < channels; chan++) {
- if (comp_start + chan > 4)
+ if (component + chan > 4)
attr_number = LLVMConstInt(ctx->ac.i32, index + 1, false);
- LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (comp_start + chan) % 4, false);
+ LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (component + chan) % 4, false);
values[chan] = ac_build_fs_interp_mov(&ctx->ac,
- LLVMConstInt(ctx->ac.i32, 2, false),
+ LLVMConstInt(ctx->ac.i32, vertex_id, false),
llvm_chan,
attr_number,
ac_get_arg(&ctx->ac, ctx->args->prim_mask));
case nir_intrinsic_load_helper_invocation:
result = ac_build_load_helper_invocation(&ctx->ac);
break;
+ case nir_intrinsic_is_helper_invocation:
+ result = ac_build_is_helper_invocation(&ctx->ac);
+ break;
case nir_intrinsic_load_color0:
result = ctx->abi->color0;
break;
result = visit_image_size(ctx, instr, false);
break;
case nir_intrinsic_shader_clock:
- result = ac_build_shader_clock(&ctx->ac);
+ result = ac_build_shader_clock(&ctx->ac,
+ nir_intrinsic_memory_scope(instr));
break;
case nir_intrinsic_discard:
case nir_intrinsic_discard_if:
emit_discard(ctx, instr);
break;
+ case nir_intrinsic_demote:
+ case nir_intrinsic_demote_if:
+ emit_demote(ctx, instr);
+ break;
case nir_intrinsic_memory_barrier:
case nir_intrinsic_group_memory_barrier:
- case nir_intrinsic_memory_barrier_atomic_counter:
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier_shared:
emit_membar(&ctx->ac, instr);
break;
- case nir_intrinsic_barrier:
+ case nir_intrinsic_scoped_barrier: {
+ assert(!(nir_intrinsic_memory_semantics(instr) &
+ (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
+
+ nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
+
+ unsigned wait_flags = 0;
+ if (modes & (nir_var_mem_global | nir_var_mem_ssbo))
+ wait_flags |= AC_WAIT_VLOAD | AC_WAIT_VSTORE;
+ if (modes & nir_var_mem_shared)
+ wait_flags |= AC_WAIT_LGKM;
+
+ if (wait_flags)
+ ac_build_waitcnt(&ctx->ac, wait_flags);
+
+ if (nir_intrinsic_execution_scope(instr) == NIR_SCOPE_WORKGROUP)
+ ac_emit_barrier(&ctx->ac, ctx->stage);
+ break;
+ }
+ case nir_intrinsic_memory_barrier_tcs_patch:
+ break;
+ case nir_intrinsic_control_barrier:
ac_emit_barrier(&ctx->ac, ctx->stage);
break;
case nir_intrinsic_shared_atomic_add:
case nir_intrinsic_shared_atomic_or:
case nir_intrinsic_shared_atomic_xor:
case nir_intrinsic_shared_atomic_exchange:
- case nir_intrinsic_shared_atomic_comp_swap: {
+ case nir_intrinsic_shared_atomic_comp_swap:
+ case nir_intrinsic_shared_atomic_fadd: {
LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0],
instr->src[1].ssa->bit_size);
result = visit_var_atomic(ctx, instr, ptr, 1);
case nir_intrinsic_deref_atomic_or:
case nir_intrinsic_deref_atomic_xor:
case nir_intrinsic_deref_atomic_exchange:
- case nir_intrinsic_deref_atomic_comp_swap: {
+ case nir_intrinsic_deref_atomic_comp_swap:
+ case nir_intrinsic_deref_atomic_fadd: {
LLVMValueRef ptr = get_src(ctx, instr->src[0]);
result = visit_var_atomic(ctx, instr, ptr, 1);
break;
case nir_intrinsic_load_barycentric_sample:
result = barycentric_sample(ctx, nir_intrinsic_interp_mode(instr));
break;
+ case nir_intrinsic_load_barycentric_model:
+ result = barycentric_model(ctx);
+ break;
case nir_intrinsic_load_barycentric_at_offset: {
LLVMValueRef offset = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0]));
result = barycentric_offset(ctx, nir_intrinsic_interp_mode(instr), offset);
instr->dest.ssa.bit_size);
break;
}
- case nir_intrinsic_load_input: {
- /* We only lower inputs for fragment shaders ATM */
- ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[0]);
- assert(offset);
- assert(offset[0].i32 == 0);
-
- unsigned index = nir_intrinsic_base(instr);
- unsigned component = nir_intrinsic_component(instr);
- result = load_flat_input(ctx, index, component,
- instr->dest.ssa.num_components,
- instr->dest.ssa.bit_size);
+ case nir_intrinsic_load_input:
+ case nir_intrinsic_load_input_vertex:
+ result = load_input(ctx, instr);
break;
- }
case nir_intrinsic_emit_vertex:
ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->abi->outputs);
break;
+ case nir_intrinsic_emit_vertex_with_counter: {
+ unsigned stream = nir_intrinsic_stream_id(instr);
+ LLVMValueRef next_vertex = get_src(ctx, instr->src[0]);
+ ctx->abi->emit_vertex_with_counter(ctx->abi, stream,
+ next_vertex,
+ ctx->abi->outputs);
+ break;
+ }
case nir_intrinsic_end_primitive:
+ case nir_intrinsic_end_primitive_with_counter:
ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr));
break;
case nir_intrinsic_load_tess_coord:
break;
}
case nir_intrinsic_shuffle:
- result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]),
- get_src(ctx, instr->src[1]));
+ if (ctx->ac.chip_class == GFX8 ||
+ ctx->ac.chip_class == GFX9 ||
+ (ctx->ac.chip_class >= GFX10 && ctx->ac.wave_size == 32)) {
+ result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]),
+ get_src(ctx, instr->src[1]));
+ } else {
+ LLVMValueRef src = get_src(ctx, instr->src[0]);
+ LLVMValueRef index = get_src(ctx, instr->src[1]);
+ LLVMTypeRef type = LLVMTypeOf(src);
+ struct waterfall_context wctx;
+ LLVMValueRef index_val;
+
+ index_val = enter_waterfall(ctx, &wctx, index, true);
+
+ src = LLVMBuildZExt(ctx->ac.builder, src,
+ ctx->ac.i32, "");
+
+ result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.readlane",
+ ctx->ac.i32,
+ (LLVMValueRef []) { src, index_val }, 2,
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+
+ result = LLVMBuildTrunc(ctx->ac.builder, result, type, "");
+
+ result = exit_waterfall(ctx, &wctx, result);
+ }
break;
case nir_intrinsic_reduce:
result = ac_build_reduce(&ctx->ac,
break;
}
case nir_intrinsic_load_constant: {
+ unsigned base = nir_intrinsic_base(instr);
+ unsigned range = nir_intrinsic_range(instr);
+
LLVMValueRef offset = get_src(ctx, instr->src[0]);
- LLVMValueRef base = LLVMConstInt(ctx->ac.i32,
- nir_intrinsic_base(instr),
- false);
- offset = LLVMBuildAdd(ctx->ac.builder, offset, base, "");
+ offset = LLVMBuildAdd(ctx->ac.builder, offset,
+ LLVMConstInt(ctx->ac.i32, base, false), "");
+
+ /* Clamp the offset to avoid out-of-bound access because global
+ * instructions can't handle them.
+ */
+ LLVMValueRef size = LLVMConstInt(ctx->ac.i32, base + range, false);
+ LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+ offset, size, "");
+ offset = LLVMBuildSelect(ctx->ac.builder, cond, offset, size, "");
+
LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->constant_data,
offset);
LLVMTypeRef comp_type =
return LLVMBuildBitCast(ctx->ac.builder, ret, ctx->ac.i32, "");
}
-static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
- nir_deref_instr *deref_instr,
- enum ac_descriptor_type desc_type,
- const nir_instr *instr,
- bool image, bool write)
+/* Decomposed addressing info for a sampler/image descriptor, produced by
+ * get_sampler_desc_internal() and forwarded to
+ * ctx->abi->load_sampler_desc(). */
+struct sampler_desc_address {
+ unsigned descriptor_set;
+ unsigned base_index; /* binding in vulkan */
+ unsigned constant_index;
+ LLVMValueRef dynamic_index; /* possibly divergent runtime index, or NULL */
+ bool image;
+ bool bindless;
+};
+
+static struct sampler_desc_address
+get_sampler_desc_internal(struct ac_nir_context *ctx,
+ nir_deref_instr *deref_instr,
+ const nir_instr *instr,
+ bool image)
{
LLVMValueRef index = NULL;
unsigned constant_index = 0;
} else
base_index = deref_instr->var->data.binding;
}
+ return (struct sampler_desc_address) {
+ .descriptor_set = descriptor_set,
+ .base_index = base_index,
+ .constant_index = constant_index,
+ .dynamic_index = index,
+ .image = image,
+ .bindless = bindless,
+ };
+}
+/* Extract any possibly divergent index into a separate value that can be fed
+ * into get_sampler_desc with the same arguments. Callers run this value
+ * through a waterfall loop (enter_waterfall) when the index is
+ * non-uniform, then pass the uniformized result back to
+ * get_sampler_desc(). */
+static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx,
+ nir_deref_instr *deref_instr,
+ const nir_instr *instr,
+ bool image)
+{
+ struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
+ return addr.dynamic_index;
+}
+
+/* Load a descriptor of the given type. 'index' is the dynamic index
+ * previously obtained from get_sampler_desc_index() with the same
+ * deref/instr arguments (possibly uniformized by a waterfall loop in
+ * between); the remaining addressing info is recomputed here via
+ * get_sampler_desc_internal(). */
+static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
+ nir_deref_instr *deref_instr,
+ enum ac_descriptor_type desc_type,
+ const nir_instr *instr,
+ LLVMValueRef index,
+ bool image, bool write)
+{
+ struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
 return ctx->abi->load_sampler_desc(ctx->abi,
- descriptor_set,
- base_index,
- constant_index, index,
- desc_type, image, write, bindless);
+ addr.descriptor_set,
+ addr.base_index,
+ addr.constant_index, index,
+ desc_type, addr.image, write, addr.bindless);
}
/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
static void tex_fetch_ptrs(struct ac_nir_context *ctx,
nir_tex_instr *instr,
+ struct waterfall_context *wctx,
LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
LLVMValueRef *fmask_ptr)
{
}
}
+ LLVMValueRef texture_dynamic_index = get_sampler_desc_index(ctx, texture_deref_instr,
+ &instr->instr, false);
if (!sampler_deref_instr)
sampler_deref_instr = texture_deref_instr;
+ LLVMValueRef sampler_dynamic_index = get_sampler_desc_index(ctx, sampler_deref_instr,
+ &instr->instr, false);
+ if (instr->texture_non_uniform)
+ texture_dynamic_index = enter_waterfall(ctx, wctx + 0, texture_dynamic_index, true);
+
+ if (instr->sampler_non_uniform)
+ sampler_dynamic_index = enter_waterfall(ctx, wctx + 1, sampler_dynamic_index, true);
+
enum ac_descriptor_type main_descriptor = instr->sampler_dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE;
if (plane >= 0) {
main_descriptor = AC_DESC_PLANE_0 + plane;
}
- *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr, false, false);
+ if (instr->op == nir_texop_fragment_mask_fetch) {
+ /* The fragment mask is fetched from the compressed
+ * multisampled surface.
+ */
+ main_descriptor = AC_DESC_FMASK;
+ }
+
+ *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr,
+ texture_dynamic_index, false, false);
if (samp_ptr) {
- *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr, false, false);
+ *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr,
+ sampler_dynamic_index, false, false);
if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT)
*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
}
if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
instr->op == nir_texop_samples_identical))
- *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, &instr->instr, false, false);
+ *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK,
+ &instr->instr, texture_dynamic_index, false, false);
}
static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx,
LLVMValueRef fmask_ptr = NULL, sample_index = NULL;
LLVMValueRef ddx = NULL, ddy = NULL;
unsigned offset_src = 0;
+ struct waterfall_context wctx[2] = {{{0}}};
- tex_fetch_ptrs(ctx, instr, &args.resource, &args.sampler, &fmask_ptr);
+ tex_fetch_ptrs(ctx, instr, wctx, &args.resource, &args.sampler, &fmask_ptr);
for (unsigned i = 0; i < instr->num_srcs; i++) {
switch (instr->src[i].src_type) {
offset_src = i;
break;
case nir_tex_src_bias:
- if (instr->op == nir_texop_txb)
- args.bias = get_src(ctx, instr->src[i].src);
+ args.bias = get_src(ctx, instr->src[i].src);
break;
case nir_tex_src_lod: {
if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0)
case nir_tex_src_ddy:
ddy = get_src(ctx, instr->src[i].src);
break;
+ case nir_tex_src_min_lod:
+ args.min_lod = get_src(ctx, instr->src[i].src);
+ break;
case nir_tex_src_texture_offset:
case nir_tex_src_sampler_offset:
case nir_tex_src_plane:
if (instr->op == nir_texop_texture_samples) {
LLVMValueRef res, samples, is_msaa;
+ LLVMValueRef default_sample;
+
res = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, "");
samples = LLVMBuildExtractElement(ctx->ac.builder, res,
LLVMConstInt(ctx->ac.i32, 3, false), "");
LLVMConstInt(ctx->ac.i32, 0xf, false), "");
samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1,
samples, "");
+
+ if (ctx->abi->robust_buffer_access) {
+ LLVMValueRef dword1, is_null_descriptor;
+
+ /* Extract the second dword of the descriptor, if it's
+ * all zero, then it's a null descriptor.
+ */
+ dword1 = LLVMBuildExtractElement(ctx->ac.builder, res,
+ LLVMConstInt(ctx->ac.i32, 1, false), "");
+ is_null_descriptor =
+ LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, dword1,
+ LLVMConstInt(ctx->ac.i32, 0, false), "");
+ default_sample =
+ LLVMBuildSelect(ctx->ac.builder, is_null_descriptor,
+ ctx->ac.i32_0, ctx->ac.i32_1, "");
+ } else {
+ default_sample = ctx->ac.i32_1;
+ }
+
samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples,
- ctx->ac.i32_1, "");
+ default_sample, "");
result = samples;
goto write_result;
}
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
instr->is_array &&
- instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
+ instr->op != nir_texop_txf &&
+ instr->op != nir_texop_txf_ms &&
+ instr->op != nir_texop_fragment_fetch &&
+ instr->op != nir_texop_fragment_mask_fetch) {
args.coords[2] = apply_round_slice(&ctx->ac, args.coords[2]);
}
}
/* Pack sample index */
- if (instr->op == nir_texop_txf_ms && sample_index)
+ if (sample_index && (instr->op == nir_texop_txf_ms ||
+ instr->op == nir_texop_fragment_fetch))
args.coords[instr->coord_components] = sample_index;
if (instr->op == nir_texop_samples_identical) {
if ((instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ||
instr->sampler_dim == GLSL_SAMPLER_DIM_MS) &&
- instr->op != nir_texop_txs) {
+ instr->op != nir_texop_txs &&
+ instr->op != nir_texop_fragment_fetch &&
+ instr->op != nir_texop_fragment_mask_fetch) {
unsigned sample_chan = instr->is_array ? 3 : 2;
args.coords[sample_chan] = adjust_sample_index_using_fmask(
&ctx->ac, args.coords[0], args.coords[1],
args.dim = ac_get_sampler_dim(ctx->ac.chip_class, instr->sampler_dim, instr->is_array);
args.unorm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
}
+
+ /* Adjust the number of coordinates because we only need (x,y) for 2D
+ * multisampled images and (x,y,layer) for 2D multisampled layered
+ * images or for multisampled input attachments.
+ */
+ if (instr->op == nir_texop_fragment_mask_fetch) {
+ if (args.dim == ac_image_2dmsaa) {
+ args.dim = ac_image_2d;
+ } else {
+ assert(args.dim == ac_image_2darraymsaa);
+ args.dim = ac_image_2darray;
+ }
+ }
+
+ assert(instr->dest.is_ssa);
+ args.d16 = instr->dest.ssa.bit_size == 16;
+
result = build_tex_intrinsic(ctx, instr, &args);
if (instr->op == nir_texop_query_levels)
if (result) {
assert(instr->dest.is_ssa);
result = ac_to_integer(&ctx->ac, result);
+
+ for (int i = ARRAY_SIZE(wctx); --i >= 0;) {
+ result = exit_waterfall(ctx, wctx + i, result);
+ }
+
ctx->ssa_defs[instr->dest.ssa.index] = result;
}
}
-
static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr)
{
LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa);
}
+/* Return true if 'def' feeds a store_deref, either directly or through a
+ * nir_op_vec4 (checked recursively), i.e. the value may reach a shader
+ * output. Such undefs must be kept as real undefs instead of being
+ * folded to zero — see visit_ssa_undef(). */
+static bool is_def_used_in_an_export(const nir_ssa_def* def) {
+ nir_foreach_use(use_src, def) {
+ if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
+ nir_intrinsic_instr *instr = nir_instr_as_intrinsic(use_src->parent_instr);
+ if (instr->intrinsic == nir_intrinsic_store_deref)
+ return true;
+ } else if (use_src->parent_instr->type == nir_instr_type_alu) {
+ nir_alu_instr *instr = nir_instr_as_alu(use_src->parent_instr);
+ if (instr->op == nir_op_vec4 &&
+ is_def_used_in_an_export(&instr->dest.dest.ssa)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
static void visit_ssa_undef(struct ac_nir_context *ctx,
 const nir_ssa_undef_instr *instr)
{
 unsigned num_components = instr->def.num_components;
 LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);
- LLVMValueRef undef;
- if (num_components == 1)
- undef = LLVMGetUndef(type);
- else {
- undef = LLVMGetUndef(LLVMVectorType(type, num_components));
+ /* Keep a real undef when the ABI does not request zeroing, or when the
+ * value may reach a store (see is_def_used_in_an_export). */
+ if (!ctx->abi->convert_undef_to_zero || is_def_used_in_an_export(&instr->def)) {
+ LLVMValueRef undef;
+
+ if (num_components == 1)
+ undef = LLVMGetUndef(type);
+ else {
+ undef = LLVMGetUndef(LLVMVectorType(type, num_components));
+ }
+ ctx->ssa_defs[instr->def.index] = undef;
+ } else {
+ LLVMValueRef zero = LLVMConstInt(type, 0, false);
+ if (num_components > 1) {
+ /* Replicate the single zero (value_stride 0) into a vector of
+ * the def's actual width. The previous hard-coded count of 4
+ * produced a wrongly-sized vector for 2/3/8-component defs,
+ * mismatching LLVMVectorType(type, num_components) above. */
+ zero = ac_build_gather_values_extended(
+ &ctx->ac, &zero, num_components, 0, false, false);
+ }
+ ctx->ssa_defs[instr->def.index] = zero;
 }
- ctx->ssa_defs[instr->def.index] = undef;
}
static void visit_jump(struct ac_llvm_context *ctx,
{
int i, j;
ctx->num_locals = 0;
- nir_foreach_variable(variable, &func->impl->locals) {
+ nir_foreach_function_temp_variable(variable, func->impl) {
unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
variable->data.driver_location = ctx->num_locals * 4;
variable->data.location_frac = 0;
ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
- nir_foreach_variable(variable, &nir->outputs)
+ nir_foreach_shader_out_variable(variable, nir)
ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable,
ctx.stage);
ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
_mesa_key_pointer_equal);
+ if (ctx.abi->kill_ps_if_inf_interp)
+ ctx.verified_interp = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+ _mesa_key_pointer_equal);
+
func = (struct nir_function *)exec_list_get_head(&nir->functions);
nir_index_ssa_defs(func->impl);
if (gl_shader_stage_is_compute(nir->info.stage))
setup_shared(&ctx, nir);
+ if (nir->info.stage == MESA_SHADER_FRAGMENT && nir->info.fs.uses_demote) {
+ ctx.ac.postponed_kill = ac_build_alloca_undef(&ctx.ac, ac->i1, "");
+ /* true = don't kill. */
+ LLVMBuildStore(ctx.ac.builder, ctx.ac.i1true, ctx.ac.postponed_kill);
+ }
+
visit_cf_list(&ctx, &func->impl->body);
phi_post_pass(&ctx);
+ if (ctx.ac.postponed_kill)
+ ac_build_kill_if_false(&ctx.ac, LLVMBuildLoad(ctx.ac.builder,
+ ctx.ac.postponed_kill, ""));
+
if (!gl_shader_stage_is_compute(nir->info.stage))
ctx.abi->emit_outputs(ctx.abi, AC_LLVM_MAX_OUTPUTS,
ctx.abi->outputs);
ralloc_free(ctx.defs);
ralloc_free(ctx.phis);
ralloc_free(ctx.vars);
+ if (ctx.abi->kill_ps_if_inf_interp)
+ ralloc_free(ctx.verified_interp);
}
bool
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
- if (intrin->intrinsic == nir_intrinsic_barrier) {
+ if (intrin->intrinsic == nir_intrinsic_control_barrier) {
/* If we find a barrier in nested control flow put this in the
* too hard basket. In GLSL this is not possible but it is in