ac/llvm: fix amdgcn.rcp for v2f16
[mesa.git] / src / amd / llvm / ac_nir_to_llvm.c
index 03717191e243fd8b897d42e08f7c7c5639255d0f..337ca6605fc824ca4db697040b001686dba34ec8 100644 (file)
@@ -51,6 +51,7 @@ struct ac_nir_context {
        struct hash_table *defs;
        struct hash_table *phis;
        struct hash_table *vars;
+        struct hash_table *verified_interp;
 
        LLVMValueRef main_function;
        LLVMBasicBlockRef continue_block;
@@ -169,6 +170,17 @@ static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx,
                                  LLVMIntPredicate pred, LLVMValueRef src0,
                                  LLVMValueRef src1)
 {
+       LLVMTypeRef src0_type = LLVMTypeOf(src0);
+       LLVMTypeRef src1_type = LLVMTypeOf(src1);
+
+       if (LLVMGetTypeKind(src0_type) == LLVMPointerTypeKind &&
+           LLVMGetTypeKind(src1_type) != LLVMPointerTypeKind) {
+               src1 = LLVMBuildIntToPtr(ctx->builder, src1, src0_type, "");
+       } else if (LLVMGetTypeKind(src1_type) == LLVMPointerTypeKind &&
+                  LLVMGetTypeKind(src0_type) != LLVMPointerTypeKind) {
+               src0 = LLVMBuildIntToPtr(ctx->builder, src0, src1_type, "");
+       }
+
        LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, "");
        return LLVMBuildSelect(ctx->builder, result,
                               LLVMConstInt(ctx->i32, 0xFFFFFFFF, false),
@@ -193,30 +205,59 @@ static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx,
                                         LLVMTypeRef result_type,
                                         LLVMValueRef src0)
 {
-       char name[64];
+       char name[64], type[64];
        LLVMValueRef params[] = {
                ac_to_float(ctx, src0),
        };
 
-       ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
-                                                ac_get_elem_bits(ctx, result_type));
+       ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
+       ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
        assert(length < sizeof(name));
        return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE);
 }
 
+/* Emit a one-operand float intrinsic, one call per vector element.
+ *
+ * A non-vector result_type is handed straight to emit_intrin_1f_param().
+ * For a vector result_type, the intrinsic is called once per element of
+ * src0 (with the vector's element type as the call's result type) and
+ * the per-element results are inserted into a vector of result_type.
+ */
+static LLVMValueRef emit_intrin_1f_param_scalar(struct ac_llvm_context *ctx,
+                                               const char *intrin,
+                                               LLVMTypeRef result_type,
+                                               LLVMValueRef src0)
+{
+       if (LLVMGetTypeKind(result_type) == LLVMVectorTypeKind) {
+               LLVMTypeRef scalar_type = LLVMGetElementType(result_type);
+               LLVMValueRef vec = LLVMGetUndef(result_type);
+
+               /* Vectors are not supported by the intrinsic, so go lane by lane. */
+               for (unsigned lane = 0; lane < LLVMGetVectorSize(result_type); lane++) {
+                       LLVMValueRef elem = ac_llvm_extract_elem(ctx, src0, lane);
+                       LLVMValueRef value = emit_intrin_1f_param(ctx, intrin, scalar_type, elem);
+                       vec = LLVMBuildInsertElement(ctx->builder, vec, value,
+                                                    LLVMConstInt(ctx->i32, lane, 0), "");
+               }
+               return vec;
+       }
+
+       return emit_intrin_1f_param(ctx, intrin, result_type, src0);
+}
+
 static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx,
                                       const char *intrin,
                                       LLVMTypeRef result_type,
                                       LLVMValueRef src0, LLVMValueRef src1)
 {
-       char name[64];
+       char name[64], type[64];
        LLVMValueRef params[] = {
                ac_to_float(ctx, src0),
                ac_to_float(ctx, src1),
        };
 
-       ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
-                                                ac_get_elem_bits(ctx, result_type));
+       ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
+       ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
        assert(length < sizeof(name));
        return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE);
 }
@@ -226,15 +267,15 @@ static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx,
                                         LLVMTypeRef result_type,
                                         LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2)
 {
-       char name[64];
+       char name[64], type[64];
        LLVMValueRef params[] = {
                ac_to_float(ctx, src0),
                ac_to_float(ctx, src1),
                ac_to_float(ctx, src2),
        };
 
-       ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
-                                                ac_get_elem_bits(ctx, result_type));
+       ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
+       ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
        assert(length < sizeof(name));
        return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE);
 }
@@ -589,10 +630,6 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
        unsigned num_components = instr->dest.dest.ssa.num_components;
        unsigned src_components;
        LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa);
-       bool saved_inexact = false;
-
-       if (instr->exact)
-               saved_inexact = ac_disable_inexact_math(ctx->ac.builder);
 
        assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src));
        switch (instr->op) {
@@ -692,8 +729,18 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], "");
                break;
        case nir_op_frcp:
-               result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rcp",
-                                             ac_to_float_type(&ctx->ac, def_type), src[0]);
+               /* For doubles, we need precise division to pass GLCTS. */
+               if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL &&
+                   ac_get_type_size(def_type) == 8) {
+                       result = LLVMBuildFDiv(ctx->ac.builder, ctx->ac.f64_1,
+                                              ac_to_float(&ctx->ac, src[0]), "");
+               } else {
+                       result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.rcp",
+                                                            ac_to_float_type(&ctx->ac, def_type), src[0]);
+               }
+               if (ctx->abi->clamp_div_by_zero)
+                       result = ac_build_fmin(&ctx->ac, result,
+                                              LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX));
                break;
        case nir_op_iand:
                result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], "");
@@ -752,7 +799,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
        case nir_op_feq32:
                result = emit_float_cmp(&ctx->ac, LLVMRealOEQ, src[0], src[1]);
                break;
-       case nir_op_fne32:
+       case nir_op_fneu32:
                result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]);
                break;
        case nir_op_flt32:
@@ -840,6 +887,9 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
        case nir_op_frsq:
                result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rsq",
                                              ac_to_float_type(&ctx->ac, def_type), src[0]);
+               if (ctx->abi->clamp_div_by_zero)
+                       result = ac_build_fmin(&ctx->ac, result,
+                                              LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX));
                break;
        case nir_op_frexp_exp:
                src[0] = ac_to_float(&ctx->ac, src[0]);
@@ -881,7 +931,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
        case nir_op_ffma:
                /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
                result = emit_intrin_3f_param(&ctx->ac, ctx->ac.chip_class >= GFX10 ? "llvm.fma" : "llvm.fmuladd",
-                                             ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]);
+                                             ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]);
                break;
        case nir_op_ldexp:
                src[0] = ac_to_float(&ctx->ac, src[0]);
@@ -942,15 +992,45 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
                break;
        case nir_op_f2f16_rtz:
+       case nir_op_f2f16:
+       case nir_op_f2fmp:
                src[0] = ac_to_float(&ctx->ac, src[0]);
-               if (LLVMTypeOf(src[0]) == ctx->ac.f64)
-                       src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
-               LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 };
-               result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
-               result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
+
+               /* For OpenGL, we want fast packing with v_cvt_pkrtz_f16, but if we use it,
+                * all f32->f16 conversions have to round towards zero, because both scalar
+                * and vec2 down-conversions have to round equally.
+                */
+               if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL ||
+                   instr->op == nir_op_f2f16_rtz) {
+                       src[0] = ac_to_float(&ctx->ac, src[0]);
+
+                       if (LLVMTypeOf(src[0]) == ctx->ac.f64)
+                               src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
+
+                       /* Fast path conversion. This only works if NIR is vectorized
+                        * to vec2 16.
+                        */
+                       if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) {
+                               LLVMValueRef args[] = {
+                                       ac_llvm_extract_elem(&ctx->ac, src[0], 0),
+                                       ac_llvm_extract_elem(&ctx->ac, src[0], 1),
+                               };
+                               result = ac_build_cvt_pkrtz_f16(&ctx->ac, args);
+                               break;
+                       }
+
+                       assert(ac_get_llvm_num_components(src[0]) == 1);
+                       LLVMValueRef param[2] = { src[0], LLVMGetUndef(ctx->ac.f32) };
+                       result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
+                       result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
+               } else {
+                       if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
+                               result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
+                       else
+                               result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
+               }
                break;
        case nir_op_f2f16_rtne:
-       case nir_op_f2f16:
        case nir_op_f2f32:
        case nir_op_f2f64:
                src[0] = ac_to_float(&ctx->ac, src[0]);
@@ -961,6 +1041,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                break;
        case nir_op_u2u8:
        case nir_op_u2u16:
+       case nir_op_u2ump:
        case nir_op_u2u32:
        case nir_op_u2u64:
                if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
@@ -970,6 +1051,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                break;
        case nir_op_i2i8:
        case nir_op_i2i16:
+       case nir_op_i2imp:
        case nir_op_i2i32:
        case nir_op_i2i64:
                if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
@@ -1123,57 +1205,6 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                break;
        }
 
-       case nir_op_fmin3:
-               result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum",
-                                               ac_to_float_type(&ctx->ac, def_type), src[0], src[1]);
-               result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum",
-                                               ac_to_float_type(&ctx->ac, def_type), result, src[2]);
-               break;
-       case nir_op_umin3:
-               result = ac_build_umin(&ctx->ac, src[0], src[1]);
-               result = ac_build_umin(&ctx->ac, result, src[2]);
-               break;
-       case nir_op_imin3:
-               result = ac_build_imin(&ctx->ac, src[0], src[1]);
-               result = ac_build_imin(&ctx->ac, result, src[2]);
-               break;
-       case nir_op_fmax3:
-               result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum",
-                                               ac_to_float_type(&ctx->ac, def_type), src[0], src[1]);
-               result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum",
-                                               ac_to_float_type(&ctx->ac, def_type), result, src[2]);
-               break;
-       case nir_op_umax3:
-               result = ac_build_umax(&ctx->ac, src[0], src[1]);
-               result = ac_build_umax(&ctx->ac, result, src[2]);
-               break;
-       case nir_op_imax3:
-               result = ac_build_imax(&ctx->ac, src[0], src[1]);
-               result = ac_build_imax(&ctx->ac, result, src[2]);
-               break;
-       case nir_op_fmed3: {
-               src[0] = ac_to_float(&ctx->ac, src[0]);
-               src[1] = ac_to_float(&ctx->ac, src[1]);
-               src[2] = ac_to_float(&ctx->ac, src[2]);
-               result = ac_build_fmed3(&ctx->ac, src[0], src[1], src[2],
-                                       instr->dest.dest.ssa.bit_size);
-               break;
-       }
-       case nir_op_imed3: {
-               LLVMValueRef tmp1 = ac_build_imin(&ctx->ac, src[0], src[1]);
-               LLVMValueRef tmp2 = ac_build_imax(&ctx->ac, src[0], src[1]);
-               tmp2 = ac_build_imin(&ctx->ac, tmp2, src[2]);
-               result = ac_build_imax(&ctx->ac, tmp1, tmp2);
-               break;
-       }
-       case nir_op_umed3: {
-               LLVMValueRef tmp1 = ac_build_umin(&ctx->ac, src[0], src[1]);
-               LLVMValueRef tmp2 = ac_build_umax(&ctx->ac, src[0], src[1]);
-               tmp2 = ac_build_umin(&ctx->ac, tmp2, src[2]);
-               result = ac_build_umax(&ctx->ac, tmp1, tmp2);
-               break;
-       }
-
        default:
                fprintf(stderr, "Unknown NIR alu instr: ");
                nir_print_instr(&instr->instr, stderr);
@@ -1186,9 +1217,6 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                result = ac_to_integer_or_pointer(&ctx->ac, result);
                ctx->ssa_defs[instr->dest.dest.ssa.index] = result;
        }
-
-       if (instr->exact)
-               ac_restore_inexact_math(ctx->ac.builder, saved_inexact);
 }
 
 static void visit_load_const(struct ac_nir_context *ctx,
@@ -1435,12 +1463,14 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
        if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
                unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
 
+               assert(instr->dest.is_ssa);
                return ac_build_buffer_load_format(&ctx->ac,
                                                   args->resource,
                                                   args->coords[0],
                                                   ctx->ac.i32_0,
                                                   util_last_bit(mask),
-                                                  0, true);
+                                                  0, true,
+                                                  instr->dest.ssa.bit_size == 16);
        }
 
        args->opcode = ac_image_sample;
@@ -1469,7 +1499,8 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
                break;
        case nir_texop_tg4:
                args->opcode = ac_image_gather4;
-               args->level_zero = true;
+                if (!args->lod && !args->bias)
+                       args->level_zero = true;
                break;
        case nir_texop_lod:
                args->opcode = ac_image_get_lod;
@@ -1572,13 +1603,13 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx,
 
        if (instr->dest.ssa.bit_size == 8) {
                unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1;
-               LLVMTypeRef vec_type = LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 4 * load_dwords);
+               LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i8, 4 * load_dwords);
                ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
                LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
 
                LLVMValueRef params[3];
                if (load_dwords > 1) {
-                       LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i32, 2), "");
+                       LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.v2i32, "");
                        params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 1, false), "");
                        params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 0, false), "");
                } else {
@@ -1591,11 +1622,11 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx,
 
                res = LLVMBuildTrunc(ctx->ac.builder, res, LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), "");
                if (instr->dest.ssa.num_components > 1)
-                       res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), instr->dest.ssa.num_components), "");
+                       res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i8, instr->dest.ssa.num_components), "");
                return res;
        } else if (instr->dest.ssa.bit_size == 16) {
                unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1;
-               LLVMTypeRef vec_type = LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords);
+               LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i16, 2 * load_dwords);
                ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
                LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
                res = LLVMBuildBitCast(ctx->ac.builder, res, vec_type, "");
@@ -1679,7 +1710,7 @@ static unsigned get_cache_policy(struct ac_nir_context *ctx,
        }
 
        if (access & ACCESS_STREAM_CACHE_POLICY)
-               cache_policy |= ac_slc;
+               cache_policy |= ac_slc | ac_glc;
 
        return cache_policy;
 }
@@ -2260,6 +2291,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
 
        switch (mode) {
        case nir_var_shader_in:
+               /* TODO: remove this after RADV switches to lowered IO */
                if (ctx->stage == MESA_SHADER_TESS_CTRL ||
                    ctx->stage == MESA_SHADER_TESS_EVAL) {
                        return load_tess_varyings(ctx, instr, true);
@@ -2315,6 +2347,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
                }
                break;
        case nir_var_shader_out:
+               /* TODO: remove this after RADV switches to lowered IO */
                if (ctx->stage == MESA_SHADER_TESS_CTRL) {
                        return load_tess_varyings(ctx, instr, false);
                }
@@ -2364,6 +2397,9 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
                                LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * stride / natural_stride, 0);
                                values[i] = LLVMBuildLoad(ctx->ac.builder,
                                                          ac_build_gep_ptr(&ctx->ac, address, offset), "");
+
+                               if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
+                                       LLVMSetOrdering(values[i], LLVMAtomicOrderingMonotonic);
                        }
                        return ac_build_gather_values(&ctx->ac, values, instr->dest.ssa.num_components);
                } else {
@@ -2371,6 +2407,9 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
                                                                LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
                        address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , "");
                        LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, "");
+
+                       if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
+                               LLVMSetOrdering(val, LLVMAtomicOrderingMonotonic);
                        return val;
                }
        }
@@ -2429,7 +2468,7 @@ visit_store_var(struct ac_nir_context *ctx,
 
        switch (deref->mode) {
        case nir_var_shader_out:
-
+               /* TODO: remove this after RADV switches to lowered IO */
                if (ctx->stage == MESA_SHADER_TESS_CTRL) {
                        LLVMValueRef vertex_index = NULL;
                        LLVMValueRef indir_index = NULL;
@@ -2444,7 +2483,9 @@ visit_store_var(struct ac_nir_context *ctx,
 
                        ctx->abi->store_tcs_outputs(ctx->abi, var,
                                                    vertex_index, indir_index,
-                                                   const_index, src, writemask);
+                                                   const_index, src, writemask,
+                                                   var->data.location_frac,
+                                                   var->data.driver_location);
                        break;
                }
 
@@ -2526,7 +2567,10 @@ visit_store_var(struct ac_nir_context *ctx,
 
                        val = LLVMBuildBitCast(ctx->ac.builder, val,
                                               LLVMGetElementType(LLVMTypeOf(address)), "");
-                       LLVMBuildStore(ctx->ac.builder, val, address);
+                       LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, val, address);
+
+                       if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
+                               LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic);
                } else {
                        LLVMTypeRef val_type = LLVMTypeOf(val);
                        if (LLVMGetTypeKind(LLVMTypeOf(val)) == LLVMVectorTypeKind)
@@ -2546,7 +2590,10 @@ visit_store_var(struct ac_nir_context *ctx,
                                                                        chan);
                                src = LLVMBuildBitCast(ctx->ac.builder, src,
                                                       LLVMGetElementType(LLVMTypeOf(ptr)), "");
-                               LLVMBuildStore(ctx->ac.builder, src, ptr);
+                               LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, src, ptr);
+
+                               if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
+                                       LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic);
                        }
                }
                break;
@@ -2560,6 +2607,71 @@ visit_store_var(struct ac_nir_context *ctx,
                ac_build_endif(&ctx->ac, 7002);
 }
 
+/* Handle the store_output intrinsic: TCS outputs go through the ABI's
+ * store_tcs_outputs callback, other stages store each written channel to
+ * ctx->abi->outputs[]. When ctx->ac.postponed_kill is set, the stores sit
+ * inside an ac_build_ifcc()/ac_build_endif() pair closed on every store path.
+ */
+static void
+visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
+{
+       if (ctx->ac.postponed_kill) {
+               LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
+               ac_build_ifcc(&ctx->ac, cond, 7002);
+       }
+
+       unsigned base = nir_intrinsic_base(instr);
+       unsigned writemask = nir_intrinsic_write_mask(instr);
+       unsigned component = nir_intrinsic_component(instr);
+       LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0]));
+       nir_src offset = *nir_get_io_offset_src(instr);
+       LLVMValueRef indir_index = NULL;
+
+       if (nir_src_is_const(offset))
+               assert(nir_src_as_uint(offset) == 0);
+       else
+               indir_index = get_src(ctx, offset);
+
+       switch (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src))) {
+       case 32:
+               break;
+       case 64:
+               writemask = widen_mask(writemask, 2);
+               src = LLVMBuildBitCast(ctx->ac.builder, src,
+                                      LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2),
+                                      "");
+               break;
+       default:
+               unreachable("unhandled store_output bit size");
+               return;
+       }
+
+       writemask <<= component;
+
+       if (ctx->stage == MESA_SHADER_TESS_CTRL) {
+               nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+               LLVMValueRef vertex_index = vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL;
+
+               ctx->abi->store_tcs_outputs(ctx->abi, NULL,
+                                           vertex_index, indir_index, 0,
+                                           src, writemask, component, base * 4);
+       } else {
+               /* No indirect indexing is allowed outside of TCS. */
+               assert(!indir_index);
+
+               for (unsigned chan = 0; chan < 8; chan++) {
+                       if (!(writemask & (1 << chan)))
+                               continue;
+                       LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
+                       LLVMBuildStore(ctx->ac.builder, value, ctx->abi->outputs[base * 4 + chan]);
+               }
+       }
+
+       /* Close the conditional opened above; no store path may skip this. */
+       if (ctx->ac.postponed_kill)
+               ac_build_endif(&ctx->ac, 7002);
+}
+
 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
 {
        switch (dim) {
@@ -2749,18 +2861,17 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
        LLVMValueRef res;
 
        enum glsl_sampler_dim dim;
-       enum gl_access_qualifier access;
+       enum gl_access_qualifier access = nir_intrinsic_access(instr);
        bool is_array;
        if (bindless) {
                dim = nir_intrinsic_image_dim(instr);
-               access = nir_intrinsic_access(instr);
                is_array = nir_intrinsic_image_array(instr);
        } else {
                const nir_deref_instr *image_deref = get_image_deref(instr);
                const struct glsl_type *type = image_deref->type;
                const nir_variable *var = nir_deref_instr_get_variable(image_deref);
                dim = glsl_get_sampler_dim(type);
-               access = var->data.access;
+               access |= var->data.access;
                is_array = glsl_sampler_type_is_array(type);
        }
 
@@ -2780,11 +2891,13 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
                                                 ctx->ac.i32_0, "");
 
+               assert(instr->dest.is_ssa);
                bool can_speculate = access & ACCESS_CAN_REORDER;
                res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex,
                                                  ctx->ac.i32_0, num_channels,
                                                  args.cache_policy,
-                                                 can_speculate);
+                                                 can_speculate,
+                                                 instr->dest.ssa.bit_size == 16);
                res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels);
 
                res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components);
@@ -2801,6 +2914,9 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                args.dmask = 15;
                args.attributes = AC_FUNC_ATTR_READONLY;
 
+               assert(instr->dest.is_ssa);
+               args.d16 = instr->dest.ssa.bit_size == 16;
+
                res = ac_build_image_opcode(&ctx->ac, &args);
        }
        return exit_waterfall(ctx, &wctx, res);
@@ -2817,19 +2933,18 @@ static void visit_image_store(struct ac_nir_context *ctx,
         }
 
        enum glsl_sampler_dim dim;
-       enum gl_access_qualifier access;
+       enum gl_access_qualifier access = nir_intrinsic_access(instr);
        bool is_array;
 
        if (bindless) {
                dim = nir_intrinsic_image_dim(instr);
-               access = nir_intrinsic_access(instr);
                is_array = nir_intrinsic_image_array(instr);
        } else {
                const nir_deref_instr *image_deref = get_image_deref(instr);
                const struct glsl_type *type = image_deref->type;
                const nir_variable *var = nir_deref_instr_get_variable(image_deref);
                dim = glsl_get_sampler_dim(type);
-               access = var->data.access;
+               access |= var->data.access;
                is_array = glsl_sampler_type_is_array(type);
        }
 
@@ -2855,8 +2970,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
                                                 ctx->ac.i32_0, "");
 
                ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex,
-                                            ctx->ac.i32_0, src_channels,
-                                            args.cache_policy);
+                                            ctx->ac.i32_0, args.cache_policy);
        } else {
                bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
 
@@ -2868,6 +2982,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
                if (!level_zero)
                        args.lod = get_src(ctx, instr->src[4]);
                args.dmask = 15;
+               args.d16 = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.data[0])) == 16;
 
                ac_build_image_opcode(&ctx->ac, &args);
        }
@@ -2973,16 +3088,6 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
        case nir_intrinsic_image_deref_atomic_inc_wrap: {
                atomic_name = "inc";
                atomic_subop = ac_atomic_inc_wrap;
-               /* ATOMIC_INC instruction does:
-                *      value = (value + 1) % (data + 1)
-                * but we want:
-                *      value = (value + 1) % data
-                * So replace 'data' by 'data - 1'.
-                */
-               ctx->ssa_defs[instr->src[3].ssa->index] =
-                       LLVMBuildSub(ctx->ac.builder,
-                                    ctx->ssa_defs[instr->src[3].ssa->index],
-                                    ctx->ac.i32_1, "");
                break;
        }
        case nir_intrinsic_bindless_image_atomic_dec_wrap:
@@ -3085,6 +3190,7 @@ static LLVMValueRef visit_image_size(struct ac_nir_context *ctx,
                args.dmask = 0xf;
                args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
                args.opcode = ac_image_get_resinfo;
+               assert(nir_src_as_uint(instr->src[1]) == 0);
                args.lod = ctx->ac.i32_0;
                args.attributes = AC_FUNC_ATTR_READNONE;
 
@@ -3355,11 +3461,26 @@ static LLVMValueRef visit_var_atomic(struct ac_nir_context *ctx,
                case nir_intrinsic_deref_atomic_exchange:
                        op = LLVMAtomicRMWBinOpXchg;
                        break;
+#if LLVM_VERSION_MAJOR >= 10
+               case nir_intrinsic_shared_atomic_fadd:
+               case nir_intrinsic_deref_atomic_fadd:
+                       op = LLVMAtomicRMWBinOpFAdd;
+                       break;
+#endif
                default:
                        return NULL;
                }
 
-               result = ac_build_atomic_rmw(&ctx->ac, op, ptr, ac_to_integer(&ctx->ac, src), sync_scope);
+               LLVMValueRef val;
+
+               if (instr->intrinsic == nir_intrinsic_shared_atomic_fadd ||
+                   instr->intrinsic == nir_intrinsic_deref_atomic_fadd) {
+                       val = ac_to_float(&ctx->ac, src);
+               } else {
+                       val = ac_to_integer(&ctx->ac, src);
+               }
+
+               result = ac_build_atomic_rmw(&ctx->ac, op, ptr, val, sync_scope);
        }
 
        if (ctx->ac.postponed_kill)
@@ -3511,13 +3632,26 @@ static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx,
                                            unsigned bitsize)
 {
        LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false);
+        LLVMValueRef interp_param_f;
 
-       interp_param = LLVMBuildBitCast(ctx->ac.builder,
+       interp_param_f = LLVMBuildBitCast(ctx->ac.builder,
                                interp_param, ctx->ac.v2f32, "");
        LLVMValueRef i = LLVMBuildExtractElement(
-               ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
+               ctx->ac.builder, interp_param_f, ctx->ac.i32_0, "");
        LLVMValueRef j = LLVMBuildExtractElement(
-               ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
+               ctx->ac.builder, interp_param_f, ctx->ac.i32_1, "");
+
+       /* Workaround for issue 2647: kill threads with NaN or infinite interpolation coeffs (only i is tested) */
+       if (ctx->verified_interp &&
+            !_mesa_hash_table_search(ctx->verified_interp, interp_param)) {
+               LLVMValueRef args[2];
+               args[0] = i;
+               args[1] = LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, false);
+               LLVMValueRef cond = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
+                                                       args, 2, AC_FUNC_ATTR_READNONE);
+               ac_build_kill_if_false(&ctx->ac, LLVMBuildNot(ctx->ac.builder, cond, ""));
+                _mesa_hash_table_insert(ctx->verified_interp, interp_param, interp_param);
+       }
 
        LLVMValueRef values[4];
        assert(bitsize == 16 || bitsize == 32);
@@ -3535,18 +3669,82 @@ static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx,
        return ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, num_components));
 }
 
-static LLVMValueRef load_input(struct ac_nir_context *ctx,
-                              nir_intrinsic_instr *instr)
+static LLVMValueRef visit_load(struct ac_nir_context *ctx,
+                              nir_intrinsic_instr *instr, bool is_output)
 {
-       unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1;
+       LLVMValueRef values[8];
+       LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa);
+       LLVMTypeRef component_type;
+       unsigned base = nir_intrinsic_base(instr);
+       unsigned component = nir_intrinsic_component(instr);
+       unsigned count = instr->dest.ssa.num_components *
+                        (instr->dest.ssa.bit_size == 64 ? 2 : 1);
+       nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+       LLVMValueRef vertex_index =
+               vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL;
+       nir_src offset = *nir_get_io_offset_src(instr);
+       LLVMValueRef indir_index = NULL;
+
+       if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind)
+               component_type = LLVMGetElementType(dest_type);
+       else
+               component_type = dest_type;
 
-       /* We only lower inputs for fragment shaders ATM */
-       ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[offset_idx]);
-       assert(offset);
-       assert(offset[0].i32 == 0);
+       if (nir_src_is_const(offset))
+               assert(nir_src_as_uint(offset) == 0);
+       else
+               indir_index = get_src(ctx, offset);
+
+       if (ctx->stage == MESA_SHADER_TESS_CTRL ||
+           (ctx->stage == MESA_SHADER_TESS_EVAL && !is_output)) {
+               LLVMValueRef result =
+                       ctx->abi->load_tess_varyings(ctx->abi, component_type,
+                                                    vertex_index, indir_index,
+                                                    0, 0, base * 4,
+                                                    component,
+                                                    instr->num_components,
+                                                    false, false, !is_output);
+               if (instr->dest.ssa.bit_size == 16) {
+                       result = ac_to_integer(&ctx->ac, result);
+                       result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, "");
+               }
+               return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, "");
+       }
 
-       unsigned component = nir_intrinsic_component(instr);
-       unsigned index = nir_intrinsic_base(instr);
+       /* No indirect indexing is allowed after this point. */
+       assert(!indir_index);
+
+       if (ctx->stage == MESA_SHADER_GEOMETRY) {
+               LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size);
+               assert(nir_src_is_const(*vertex_index_src));
+
+               return ctx->abi->load_inputs(ctx->abi, 0, base * 4, component,
+                                            instr->num_components,
+                                            nir_src_as_uint(*vertex_index_src),
+                                            0, type);
+       }
+
+       if (ctx->stage == MESA_SHADER_FRAGMENT && is_output &&
+           nir_intrinsic_io_semantics(instr).fb_fetch_output)
+               return ctx->abi->emit_fbfetch(ctx->abi);
+
+       /* Other non-fragment cases have inputs and outputs in temporaries. */
+       if (ctx->stage != MESA_SHADER_FRAGMENT) {
+               for (unsigned chan = component; chan < count + component; chan++) {
+                       if (is_output) {
+                               values[chan] = LLVMBuildLoad(ctx->ac.builder,
+                                                            ctx->abi->outputs[base * 4 + chan], "");
+                       } else {
+                               values[chan] = ctx->abi->inputs[base * 4 + chan];
+                               if (!values[chan])
+                                       values[chan] = LLVMGetUndef(ctx->ac.i32);
+                       }
+               }
+               LLVMValueRef result = ac_build_varying_gather_values(&ctx->ac, values, count, component);
+               return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, "");
+       }
+
+       /* Fragment shader inputs. */
        unsigned vertex_id = 2; /* P0 */
 
        if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
@@ -3567,18 +3765,11 @@ static LLVMValueRef load_input(struct ac_nir_context *ctx,
                }
        }
 
-       LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false);
-       LLVMValueRef values[8];
-
-       /* Each component of a 64-bit value takes up two GL-level channels. */
-       unsigned num_components = instr->dest.ssa.num_components;
-       unsigned bit_size = instr->dest.ssa.bit_size;
-       unsigned channels =
-               bit_size == 64 ? num_components * 2 : num_components;
+       LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, base, false);
 
-       for (unsigned chan = 0; chan < channels; chan++) {
+       for (unsigned chan = 0; chan < count; chan++) {
                if (component + chan > 4)
-                       attr_number = LLVMConstInt(ctx->ac.i32, index + 1, false);
+                       attr_number = LLVMConstInt(ctx->ac.i32, base + 1, false);
                LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (component + chan) % 4, false);
                values[chan] = ac_build_fs_interp_mov(&ctx->ac,
                                                      LLVMConstInt(ctx->ac.i32, vertex_id, false),
@@ -3587,16 +3778,12 @@ static LLVMValueRef load_input(struct ac_nir_context *ctx,
                                                      ac_get_arg(&ctx->ac, ctx->args->prim_mask));
                values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i32, "");
                values[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, values[chan],
-                                                      bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32, "");
+                                                      instr->dest.ssa.bit_size == 16 ? ctx->ac.i16
+                                                                                     : ctx->ac.i32, "");
        }
 
-       LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, channels);
-       if (bit_size == 64) {
-               LLVMTypeRef type = num_components == 1 ? ctx->ac.i64 :
-                       LLVMVectorType(ctx->ac.i64, num_components);
-               result = LLVMBuildBitCast(ctx->ac.builder, result, type, "");
-       }
-       return result;
+       LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, count);
+       return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, "");
 }
 
 static void visit_intrinsic(struct ac_nir_context *ctx,
@@ -3793,6 +3980,19 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
        case nir_intrinsic_store_deref:
                visit_store_var(ctx, instr);
                break;
+       case nir_intrinsic_load_input:
+       case nir_intrinsic_load_input_vertex:
+       case nir_intrinsic_load_per_vertex_input:
+               result = visit_load(ctx, instr, false);
+               break;
+       case nir_intrinsic_load_output:
+       case nir_intrinsic_load_per_vertex_output:
+               result = visit_load(ctx, instr, true);
+               break;
+       case nir_intrinsic_store_output:
+       case nir_intrinsic_store_per_vertex_output:
+               visit_store_output(ctx, instr);
+               break;
        case nir_intrinsic_load_shared:
                result = visit_load_shared(ctx, instr);
                break;
@@ -3850,7 +4050,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
                result = visit_image_size(ctx, instr, false);
                break;
        case nir_intrinsic_shader_clock:
-               result = ac_build_shader_clock(&ctx->ac);
+               result = ac_build_shader_clock(&ctx->ac,
+                                              nir_intrinsic_memory_scope(instr));
                break;
        case nir_intrinsic_discard:
        case nir_intrinsic_discard_if:
@@ -3867,6 +4068,25 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
        case nir_intrinsic_memory_barrier_shared:
                emit_membar(&ctx->ac, instr);
                break;
+       case nir_intrinsic_scoped_barrier: {
+               assert(!(nir_intrinsic_memory_semantics(instr) &
+                        (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
+
+               nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
+
+               unsigned wait_flags = 0;
+               if (modes & (nir_var_mem_global | nir_var_mem_ssbo))
+                       wait_flags |= AC_WAIT_VLOAD | AC_WAIT_VSTORE;
+               if (modes & nir_var_mem_shared)
+                       wait_flags |= AC_WAIT_LGKM;
+
+               if (wait_flags)
+                       ac_build_waitcnt(&ctx->ac, wait_flags);
+
+               if (nir_intrinsic_execution_scope(instr) == NIR_SCOPE_WORKGROUP)
+                       ac_emit_barrier(&ctx->ac, ctx->stage);
+               break;
+       }
        case nir_intrinsic_memory_barrier_tcs_patch:
                break;
        case nir_intrinsic_control_barrier:
@@ -3881,7 +4101,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
        case nir_intrinsic_shared_atomic_or:
        case nir_intrinsic_shared_atomic_xor:
        case nir_intrinsic_shared_atomic_exchange:
-       case nir_intrinsic_shared_atomic_comp_swap: {
+       case nir_intrinsic_shared_atomic_comp_swap:
+       case nir_intrinsic_shared_atomic_fadd: {
                LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0],
                                                  instr->src[1].ssa->bit_size);
                result = visit_var_atomic(ctx, instr, ptr, 1);
@@ -3896,7 +4117,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
        case nir_intrinsic_deref_atomic_or:
        case nir_intrinsic_deref_atomic_xor:
        case nir_intrinsic_deref_atomic_exchange:
-       case nir_intrinsic_deref_atomic_comp_swap: {
+       case nir_intrinsic_deref_atomic_comp_swap:
+       case nir_intrinsic_deref_atomic_fadd: {
                LLVMValueRef ptr = get_src(ctx, instr->src[0]);
                result = visit_var_atomic(ctx, instr, ptr, 1);
                break;
@@ -3938,10 +4160,6 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
                                                 instr->dest.ssa.bit_size);
                break;
        }
-       case nir_intrinsic_load_input:
-       case nir_intrinsic_load_input_vertex:
-               result = load_input(ctx, instr);
-               break;
        case nir_intrinsic_emit_vertex:
                ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->abi->outputs);
                break;
@@ -3988,7 +4206,7 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
        case nir_intrinsic_shuffle:
                if (ctx->ac.chip_class == GFX8 ||
                    ctx->ac.chip_class == GFX9 ||
-                   (ctx->ac.chip_class == GFX10 && ctx->ac.wave_size == 32)) {
+                   (ctx->ac.chip_class >= GFX10 && ctx->ac.wave_size == 32)) {
                        result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]),
                                                  get_src(ctx, instr->src[1]));
                } else {
@@ -4433,8 +4651,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                        offset_src = i;
                        break;
                case nir_tex_src_bias:
-                       if (instr->op == nir_texop_txb)
-                               args.bias = get_src(ctx, instr->src[i].src);
+                       args.bias = get_src(ctx, instr->src[i].src);
                        break;
                case nir_tex_src_lod: {
                        if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0)
@@ -4454,6 +4671,9 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                case nir_tex_src_ddy:
                        ddy = get_src(ctx, instr->src[i].src);
                        break;
+               case nir_tex_src_min_lod:
+                       args.min_lod = get_src(ctx, instr->src[i].src);
+                       break;
                case nir_tex_src_texture_offset:
                case nir_tex_src_sampler_offset:
                case nir_tex_src_plane:
@@ -4469,6 +4689,8 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 
        if (instr->op == nir_texop_texture_samples) {
                LLVMValueRef res, samples, is_msaa;
+               LLVMValueRef default_sample;
+
                res = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, "");
                samples = LLVMBuildExtractElement(ctx->ac.builder, res,
                                                  LLVMConstInt(ctx->ac.i32, 3, false), "");
@@ -4485,8 +4707,27 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                                       LLVMConstInt(ctx->ac.i32, 0xf, false), "");
                samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1,
                                       samples, "");
+
+               if (ctx->abi->robust_buffer_access) {
+                       LLVMValueRef dword1, is_null_descriptor;
+
+                       /* Extract the second dword of the descriptor, if it's
+                        * all zero, then it's a null descriptor.
+                        */
+                       dword1 = LLVMBuildExtractElement(ctx->ac.builder, res,
+                                                        LLVMConstInt(ctx->ac.i32, 1, false), "");
+                       is_null_descriptor =
+                               LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, dword1,
+                                             LLVMConstInt(ctx->ac.i32, 0, false), "");
+                       default_sample =
+                               LLVMBuildSelect(ctx->ac.builder, is_null_descriptor,
+                                               ctx->ac.i32_0, ctx->ac.i32_1, "");
+               } else {
+                       default_sample = ctx->ac.i32_1;
+               }
+
                samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples,
-                                         ctx->ac.i32_1, "");
+                                         default_sample, "");
                result = samples;
                goto write_result;
        }
@@ -4694,6 +4935,9 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                }
        }
 
+       assert(instr->dest.is_ssa);
+       args.d16 = instr->dest.ssa.bit_size == 16;
+
        result = build_tex_intrinsic(ctx, instr, &args);
 
        if (instr->op == nir_texop_query_levels)
@@ -4764,19 +5008,46 @@ static void phi_post_pass(struct ac_nir_context *ctx)
 }
 
 
+static bool is_def_used_in_an_export(const nir_ssa_def* def) { /* true if def reaches a store_deref, directly or through vec4 ALU instrs */
+       nir_foreach_use(use_src, def) { /* inspect each use of def in turn */
+               if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
+                       nir_intrinsic_instr *instr = nir_instr_as_intrinsic(use_src->parent_instr);
+                       if (instr->intrinsic == nir_intrinsic_store_deref) /* any store_deref counts; the stored-to variable is not inspected */
+                               return true;
+               } else if (use_src->parent_instr->type == nir_instr_type_alu) {
+                       nir_alu_instr *instr = nir_instr_as_alu(use_src->parent_instr);
+                       if (instr->op == nir_op_vec4 && /* only vec4 is followed; other ALU ops end the search on this use */
+                           is_def_used_in_an_export(&instr->dest.dest.ssa)) { /* recurse on the vec4 result */
+                               return true;
+                       }
+               }
+       }
+       return false; /* no use led to a store_deref */
+}
+
 static void visit_ssa_undef(struct ac_nir_context *ctx,
                            const nir_ssa_undef_instr *instr)
 {
        unsigned num_components = instr->def.num_components;
        LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);
-       LLVMValueRef undef;
 
-       if (num_components == 1)
-               undef = LLVMGetUndef(type);
-       else {
-               undef = LLVMGetUndef(LLVMVectorType(type, num_components));
+       if (!ctx->abi->convert_undef_to_zero || is_def_used_in_an_export(&instr->def)) {
+               LLVMValueRef undef;
+
+               if (num_components == 1)
+                       undef = LLVMGetUndef(type);
+               else {
+                       undef = LLVMGetUndef(LLVMVectorType(type, num_components));
+               }
+               ctx->ssa_defs[instr->def.index] = undef;
+       } else {
+               LLVMValueRef zero = LLVMConstInt(type, 0, false);
+               if (num_components > 1) {
+                       zero = ac_build_gather_values_extended(
+                               &ctx->ac, &zero, 4, 0, false, false);
+               }
+               ctx->ssa_defs[instr->def.index] = zero;
        }
-       ctx->ssa_defs[instr->def.index] = undef;
 }
 
 static void visit_jump(struct ac_llvm_context *ctx,
@@ -4918,7 +5189,7 @@ static void visit_deref(struct ac_nir_context *ctx,
                break;
        case nir_deref_type_ptr_as_array:
                if (instr->mode == nir_var_mem_global) {
-                       unsigned stride = nir_deref_instr_ptr_as_array_stride(instr);
+                       unsigned stride = nir_deref_instr_array_stride(instr);
 
                        LLVMValueRef index = get_src(ctx, instr->arr.index);
                        if (LLVMTypeOf(index) != ctx->ac.i64)
@@ -5121,7 +5392,7 @@ setup_locals(struct ac_nir_context *ctx,
 {
        int i, j;
        ctx->num_locals = 0;
-       nir_foreach_variable(variable, &func->impl->locals) {
+       nir_foreach_function_temp_variable(variable, func->impl) {
                unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
                variable->data.driver_location = ctx->num_locals * 4;
                variable->data.location_frac = 0;
@@ -5221,9 +5492,13 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
 
        ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
 
-       nir_foreach_variable(variable, &nir->outputs)
-               ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable,
-                                            ctx.stage);
+       /* TODO: remove this after RADV switches to lowered IO */
+       if (!nir->info.io_lowered) {
+               nir_foreach_shader_out_variable(variable, nir) {
+                       ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable,
+                                                    ctx.stage);
+               }
+       }
 
        ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                           _mesa_key_pointer_equal);
@@ -5232,6 +5507,10 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
        ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                           _mesa_key_pointer_equal);
 
+        if (ctx.abi->kill_ps_if_inf_interp)
+                ctx.verified_interp = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+                                                              _mesa_key_pointer_equal);
+
        func = (struct nir_function *)exec_list_get_head(&nir->functions);
 
        nir_index_ssa_defs(func->impl);
@@ -5266,6 +5545,8 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
        ralloc_free(ctx.defs);
        ralloc_free(ctx.phis);
        ralloc_free(ctx.vars);
+        if (ctx.abi->kill_ps_if_inf_interp)
+                ralloc_free(ctx.verified_interp);
 }
 
 bool
@@ -5312,33 +5593,26 @@ ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class chip_class)
         */
        indirect_mask |= nir_var_function_temp;
 
-       progress |= nir_lower_indirect_derefs(nir, indirect_mask);
+       progress |= nir_lower_indirect_derefs(nir, indirect_mask, UINT32_MAX);
        return progress;
 }
 
 static unsigned
 get_inst_tessfactor_writemask(nir_intrinsic_instr *intrin)
 {
-       if (intrin->intrinsic != nir_intrinsic_store_deref)
+       if (intrin->intrinsic != nir_intrinsic_store_output) /* only store_output intrinsics contribute to the tess factor mask */
                return 0;
 
-       nir_variable *var =
-               nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[0]));
-
-       if (var->data.mode != nir_var_shader_out)
-               return 0;
+       unsigned writemask = nir_intrinsic_write_mask(intrin) <<
+                            nir_intrinsic_component(intrin); /* shift the intrinsic's write mask by its component index */
+       unsigned location = nir_intrinsic_io_semantics(intrin).location; /* slot is read from the IO semantics */
 
-       unsigned writemask = 0;
-       const int location = var->data.location;
-       unsigned first_component = var->data.location_frac;
-       unsigned num_comps = intrin->dest.ssa.num_components;
+       if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
+               return writemask << 4; /* outer levels are reported 4 bits above the inner ones */
+       else if (location == VARYING_SLOT_TESS_LEVEL_INNER)
+               return writemask; /* inner levels are reported unshifted */
 
-       if (location == VARYING_SLOT_TESS_LEVEL_INNER)
-               writemask = ((1 << (num_comps + 1)) - 1) << first_component;
-       else if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
-               writemask = (((1 << (num_comps + 1)) - 1) << first_component) << 4;
-
-       return writemask;
+       return 0; /* store to a slot that is not a tess level */
 }
 
 static void