ac/nir: set the second v_cvt_pkrtz argument to undef if it's unused
[mesa.git] / src / amd / llvm / ac_nir_to_llvm.c
index fec68c469137bfce31eac300f0e51a76595eda36..cfdba201d4d421f4ba905e22897479e6100a56b0 100644 (file)
@@ -51,6 +51,7 @@ struct ac_nir_context {
        struct hash_table *defs;
        struct hash_table *phis;
        struct hash_table *vars;
+        struct hash_table *verified_interp;
 
        LLVMValueRef main_function;
        LLVMBasicBlockRef continue_block;
@@ -193,13 +194,13 @@ static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx,
                                         LLVMTypeRef result_type,
                                         LLVMValueRef src0)
 {
-       char name[64];
+       char name[64], type[64];
        LLVMValueRef params[] = {
                ac_to_float(ctx, src0),
        };
 
-       ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
-                                                ac_get_elem_bits(ctx, result_type));
+       ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
+       ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
        assert(length < sizeof(name));
        return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE);
 }
@@ -209,14 +210,14 @@ static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx,
                                       LLVMTypeRef result_type,
                                       LLVMValueRef src0, LLVMValueRef src1)
 {
-       char name[64];
+       char name[64], type[64];
        LLVMValueRef params[] = {
                ac_to_float(ctx, src0),
                ac_to_float(ctx, src1),
        };
 
-       ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
-                                                ac_get_elem_bits(ctx, result_type));
+       ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
+       ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
        assert(length < sizeof(name));
        return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE);
 }
@@ -226,15 +227,15 @@ static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx,
                                         LLVMTypeRef result_type,
                                         LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2)
 {
-       char name[64];
+       char name[64], type[64];
        LLVMValueRef params[] = {
                ac_to_float(ctx, src0),
                ac_to_float(ctx, src1),
                ac_to_float(ctx, src2),
        };
 
-       ASSERTED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
-                                                ac_get_elem_bits(ctx, result_type));
+       ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
+       ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
        assert(length < sizeof(name));
        return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE);
 }
@@ -589,6 +590,10 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
        unsigned num_components = instr->dest.dest.ssa.num_components;
        unsigned src_components;
        LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa);
+       bool saved_inexact = false;
+
+       if (instr->exact)
+               saved_inexact = ac_disable_inexact_math(ctx->ac.builder);
 
        assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src));
        switch (instr->op) {
@@ -688,8 +693,8 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], "");
                break;
        case nir_op_frcp:
-               src[0] = ac_to_float(&ctx->ac, src[0]);
-               result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(src[0]), 1.0), src[0]);
+               result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rcp",
+                                             ac_to_float_type(&ctx->ac, def_type), src[0]);
                break;
        case nir_op_iand:
                result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], "");
@@ -834,9 +839,8 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                                              ac_to_float_type(&ctx->ac, def_type), src[0]);
                break;
        case nir_op_frsq:
-               result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
-                                             ac_to_float_type(&ctx->ac, def_type), src[0]);
-               result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(result), 1.0), result);
+               result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rsq",
+                                             ac_to_float_type(&ctx->ac, def_type), src[0]);
                break;
        case nir_op_frexp_exp:
                src[0] = ac_to_float(&ctx->ac, src[0]);
@@ -939,15 +943,45 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
                break;
        case nir_op_f2f16_rtz:
+       case nir_op_f2f16:
+       case nir_op_f2fmp:
                src[0] = ac_to_float(&ctx->ac, src[0]);
-               if (LLVMTypeOf(src[0]) == ctx->ac.f64)
-                       src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
-               LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 };
-               result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
-               result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
+
+               /* For OpenGL, we want fast packing with v_cvt_pkrtz_f16, but if we use it,
+                * all f32->f16 conversions have to round towards zero, because both scalar
+                * and vec2 down-conversions have to round equally.
+                */
+               if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL ||
+                   instr->op == nir_op_f2f16_rtz) {
+                       src[0] = ac_to_float(&ctx->ac, src[0]);
+
+                       if (LLVMTypeOf(src[0]) == ctx->ac.f64)
+                               src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
+
+                       /* Fast path conversion. This only works if NIR is vectorized
+                        * to vec2 16.
+                        */
+                       if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) {
+                               LLVMValueRef args[] = {
+                                       ac_llvm_extract_elem(&ctx->ac, src[0], 0),
+                                       ac_llvm_extract_elem(&ctx->ac, src[0], 1),
+                               };
+                               result = ac_build_cvt_pkrtz_f16(&ctx->ac, args);
+                               break;
+                       }
+
+                       assert(ac_get_llvm_num_components(src[0]) == 1);
+                       LLVMValueRef param[2] = { src[0], LLVMGetUndef(ctx->ac.f32) };
+                       result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
+                       result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
+               } else {
+                       if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
+                               result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
+                       else
+                               result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
+               }
                break;
        case nir_op_f2f16_rtne:
-       case nir_op_f2f16:
        case nir_op_f2f32:
        case nir_op_f2f64:
                src[0] = ac_to_float(&ctx->ac, src[0]);
@@ -958,6 +992,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                break;
        case nir_op_u2u8:
        case nir_op_u2u16:
+       case nir_op_u2ump:
        case nir_op_u2u32:
        case nir_op_u2u64:
                if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
@@ -967,6 +1002,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                break;
        case nir_op_i2i8:
        case nir_op_i2i16:
+       case nir_op_i2imp:
        case nir_op_i2i32:
        case nir_op_i2i64:
                if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
@@ -1183,6 +1219,9 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                result = ac_to_integer_or_pointer(&ctx->ac, result);
                ctx->ssa_defs[instr->dest.dest.ssa.index] = result;
        }
+
+       if (instr->exact)
+               ac_restore_inexact_math(ctx->ac.builder, saved_inexact);
 }
 
 static void visit_load_const(struct ac_nir_context *ctx,
@@ -1429,12 +1468,14 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
        if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
                unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
 
+               assert(instr->dest.is_ssa);
                return ac_build_buffer_load_format(&ctx->ac,
                                                   args->resource,
                                                   args->coords[0],
                                                   ctx->ac.i32_0,
                                                   util_last_bit(mask),
-                                                  0, true);
+                                                  0, true,
+                                                  instr->dest.ssa.bit_size == 16);
        }
 
        args->opcode = ac_image_sample;
@@ -1463,7 +1504,8 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
                break;
        case nir_texop_tg4:
                args->opcode = ac_image_gather4;
-               args->level_zero = true;
+                if (!args->lod && !args->bias)
+                       args->level_zero = true;
                break;
        case nir_texop_lod:
                args->opcode = ac_image_get_lod;
@@ -1566,13 +1608,13 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx,
 
        if (instr->dest.ssa.bit_size == 8) {
                unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1;
-               LLVMTypeRef vec_type = LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 4 * load_dwords);
+               LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i8, 4 * load_dwords);
                ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
                LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
 
                LLVMValueRef params[3];
                if (load_dwords > 1) {
-                       LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i32, 2), "");
+                       LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.v2i32, "");
                        params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 1, false), "");
                        params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 0, false), "");
                } else {
@@ -1585,11 +1627,11 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx,
 
                res = LLVMBuildTrunc(ctx->ac.builder, res, LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), "");
                if (instr->dest.ssa.num_components > 1)
-                       res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), instr->dest.ssa.num_components), "");
+                       res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i8, instr->dest.ssa.num_components), "");
                return res;
        } else if (instr->dest.ssa.bit_size == 16) {
                unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1;
-               LLVMTypeRef vec_type = LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords);
+               LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i16, 2 * load_dwords);
                ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
                LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
                res = LLVMBuildBitCast(ctx->ac.builder, res, vec_type, "");
@@ -1673,7 +1715,7 @@ static unsigned get_cache_policy(struct ac_nir_context *ctx,
        }
 
        if (access & ACCESS_STREAM_CACHE_POLICY)
-               cache_policy |= ac_slc;
+               cache_policy |= ac_slc | ac_glc;
 
        return cache_policy;
 }
@@ -1741,6 +1783,16 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
                        count = 1;
                        num_bytes = 2;
                }
+
+               /* Due to alignment issues, split stores of 8-bit/16-bit
+                * vectors.
+                */
+               if (ctx->ac.chip_class == GFX6 && count > 1 && elem_size_bytes < 4) {
+                       writemask |= ((1u << (count - 1)) - 1u) << (start + 1);
+                       count = 1;
+                       num_bytes = elem_size_bytes;
+               }
+
                data = extract_vector_range(&ctx->ac, base_data, start, count);
 
                offset = LLVMBuildAdd(ctx->ac.builder, base_offset,
@@ -2329,14 +2381,19 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
                break;
        case nir_var_mem_global:  {
                LLVMValueRef address = get_src(ctx, instr->src[0]);
+               LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa);
                unsigned explicit_stride = glsl_get_explicit_stride(deref->type);
                unsigned natural_stride = type_scalar_size_bytes(deref->type);
                unsigned stride = explicit_stride ? explicit_stride : natural_stride;
+               int elem_size_bytes = ac_get_elem_bits(&ctx->ac, result_type) / 8;
+               bool split_loads = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4;
 
-               LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa);
-               if (stride != natural_stride) {
-                       LLVMTypeRef ptr_type =  LLVMPointerType(LLVMGetElementType(result_type),
-                                                               LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
+               if (stride != natural_stride || split_loads) {
+                       if (LLVMGetTypeKind(result_type) == LLVMVectorTypeKind)
+                               result_type = LLVMGetElementType(result_type);
+
+                       LLVMTypeRef ptr_type = LLVMPointerType(result_type,
+                                                              LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
                        address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , "");
 
                        for (unsigned i = 0; i < instr->dest.ssa.num_components; ++i) {
@@ -2490,23 +2547,29 @@ visit_store_var(struct ac_nir_context *ctx,
                unsigned explicit_stride = glsl_get_explicit_stride(deref->type);
                unsigned natural_stride = type_scalar_size_bytes(deref->type);
                unsigned stride = explicit_stride ? explicit_stride : natural_stride;
+               int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(val)) / 8;
+               bool split_stores = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4;
 
                LLVMTypeRef ptr_type =  LLVMPointerType(LLVMTypeOf(val),
                                                        LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
                address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , "");
 
                if (writemask == (1u << ac_get_llvm_num_components(val)) - 1 &&
-                   stride == natural_stride) {
-                       LLVMTypeRef ptr_type =  LLVMPointerType(LLVMTypeOf(val),
-                                                               LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
+                   stride == natural_stride && !split_stores) {
+                       LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val),
+                                                              LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
                        address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , "");
 
                        val = LLVMBuildBitCast(ctx->ac.builder, val,
                                               LLVMGetElementType(LLVMTypeOf(address)), "");
                        LLVMBuildStore(ctx->ac.builder, val, address);
                } else {
-                       LLVMTypeRef ptr_type =  LLVMPointerType(LLVMGetElementType(LLVMTypeOf(val)),
-                                                               LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
+                       LLVMTypeRef val_type = LLVMTypeOf(val);
+                       if (LLVMGetTypeKind(LLVMTypeOf(val)) == LLVMVectorTypeKind)
+                               val_type = LLVMGetElementType(val_type);
+
+                       LLVMTypeRef ptr_type = LLVMPointerType(val_type,
+                                                              LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
                        address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , "");
                        for (unsigned chan = 0; chan < 4; chan++) {
                                if (!(writemask & (1 << chan)))
@@ -2706,7 +2769,11 @@ static LLVMValueRef enter_waterfall_image(struct ac_nir_context *ctx,
                                          struct waterfall_context *wctx,
                                          const nir_intrinsic_instr *instr)
 {
-       nir_deref_instr *deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
+       nir_deref_instr *deref_instr = NULL;
+
+       if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref)
+               deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
+
        LLVMValueRef value = get_sampler_desc_index(ctx, deref_instr, &instr->instr, true);
        return enter_waterfall(ctx, wctx, value, nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
 }
@@ -2749,11 +2816,13 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
                                                 ctx->ac.i32_0, "");
 
+               assert(instr->dest.is_ssa);
                bool can_speculate = access & ACCESS_CAN_REORDER;
                res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex,
                                                  ctx->ac.i32_0, num_channels,
                                                  args.cache_policy,
-                                                 can_speculate);
+                                                 can_speculate,
+                                                 instr->dest.ssa.bit_size == 16);
                res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels);
 
                res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components);
@@ -2770,6 +2839,9 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                args.dmask = 15;
                args.attributes = AC_FUNC_ATTR_READONLY;
 
+               assert(instr->dest.is_ssa);
+               args.d16 = instr->dest.ssa.bit_size == 16;
+
                res = ac_build_image_opcode(&ctx->ac, &args);
        }
        return exit_waterfall(ctx, &wctx, res);
@@ -2824,8 +2896,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
                                                 ctx->ac.i32_0, "");
 
                ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex,
-                                            ctx->ac.i32_0, src_channels,
-                                            args.cache_policy);
+                                            ctx->ac.i32_0, args.cache_policy);
        } else {
                bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
 
@@ -2837,6 +2908,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
                if (!level_zero)
                        args.lod = get_src(ctx, instr->src[4]);
                args.dmask = 15;
+               args.d16 = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.data[0])) == 16;
 
                ac_build_image_opcode(&ctx->ac, &args);
        }
@@ -3480,13 +3552,26 @@ static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx,
                                            unsigned bitsize)
 {
        LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false);
+        LLVMValueRef interp_param_f;
 
-       interp_param = LLVMBuildBitCast(ctx->ac.builder,
+       interp_param_f = LLVMBuildBitCast(ctx->ac.builder,
                                interp_param, ctx->ac.v2f32, "");
        LLVMValueRef i = LLVMBuildExtractElement(
-               ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
+               ctx->ac.builder, interp_param_f, ctx->ac.i32_0, "");
        LLVMValueRef j = LLVMBuildExtractElement(
-               ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
+               ctx->ac.builder, interp_param_f, ctx->ac.i32_1, "");
+
+       /* Workaround for issue 2647: kill threads with infinite interpolation coeffs */
+       if (ctx->verified_interp &&
+            !_mesa_hash_table_search(ctx->verified_interp, interp_param)) {
+               LLVMValueRef args[2];
+               args[0] = i;
+               args[1] = LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, false);
+               LLVMValueRef cond = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
+                                                       args, 2, AC_FUNC_ATTR_READNONE);
+               ac_build_kill_if_false(&ctx->ac, LLVMBuildNot(ctx->ac.builder, cond, ""));
+                _mesa_hash_table_insert(ctx->verified_interp, interp_param, interp_param);
+       }
 
        LLVMValueRef values[4];
        assert(bitsize == 16 || bitsize == 32);
@@ -3819,7 +3904,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
                result = visit_image_size(ctx, instr, false);
                break;
        case nir_intrinsic_shader_clock:
-               result = ac_build_shader_clock(&ctx->ac);
+               result = ac_build_shader_clock(&ctx->ac,
+                                              nir_intrinsic_memory_scope(instr));
                break;
        case nir_intrinsic_discard:
        case nir_intrinsic_discard_if:
@@ -3914,7 +4000,16 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
        case nir_intrinsic_emit_vertex:
                ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->abi->outputs);
                break;
+       case nir_intrinsic_emit_vertex_with_counter: {
+               unsigned stream = nir_intrinsic_stream_id(instr);
+               LLVMValueRef next_vertex = get_src(ctx, instr->src[0]);
+               ctx->abi->emit_vertex_with_counter(ctx->abi, stream,
+                                                  next_vertex,
+                                                  ctx->abi->outputs);
+               break;
+       }
        case nir_intrinsic_end_primitive:
+       case nir_intrinsic_end_primitive_with_counter:
                ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr));
                break;
        case nir_intrinsic_load_tess_coord:
@@ -3946,8 +4041,33 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
                break;
        }
        case nir_intrinsic_shuffle:
-               result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]),
-                               get_src(ctx, instr->src[1]));
+               if (ctx->ac.chip_class == GFX8 ||
+                   ctx->ac.chip_class == GFX9 ||
+                   (ctx->ac.chip_class == GFX10 && ctx->ac.wave_size == 32)) {
+                       result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]),
+                                                 get_src(ctx, instr->src[1]));
+               } else {
+                       LLVMValueRef src = get_src(ctx, instr->src[0]);
+                       LLVMValueRef index = get_src(ctx, instr->src[1]);
+                       LLVMTypeRef type = LLVMTypeOf(src);
+                       struct waterfall_context wctx;
+                       LLVMValueRef index_val;
+
+                       index_val = enter_waterfall(ctx, &wctx, index, true);
+
+                       src = LLVMBuildZExt(ctx->ac.builder, src,
+                                           ctx->ac.i32, "");
+
+                       result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.readlane",
+                                                   ctx->ac.i32,
+                                                   (LLVMValueRef []) { src, index_val }, 2,
+                                                   AC_FUNC_ATTR_READNONE |
+                                                   AC_FUNC_ATTR_CONVERGENT);
+
+                       result = LLVMBuildTrunc(ctx->ac.builder, result, type, "");
+
+                       result = exit_waterfall(ctx, &wctx, result);
+               }
                break;
        case nir_intrinsic_reduce:
                result = ac_build_reduce(&ctx->ac,
@@ -4282,20 +4402,18 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx,
                }
        }
 
+       LLVMValueRef texture_dynamic_index = get_sampler_desc_index(ctx, texture_deref_instr,
+                                                                   &instr->instr, false);
        if (!sampler_deref_instr)
                sampler_deref_instr = texture_deref_instr;
 
-       LLVMValueRef texture_dynamic_index = NULL, sampler_dynamic_index = NULL;
-       if (texture_deref_instr) {
-               texture_dynamic_index = get_sampler_desc_index(ctx, texture_deref_instr, &instr->instr, false);
-               texture_dynamic_index = enter_waterfall(ctx, wctx + 0, texture_dynamic_index, instr->texture_non_uniform);
-       }
+        LLVMValueRef sampler_dynamic_index = get_sampler_desc_index(ctx, sampler_deref_instr,
+                                                                   &instr->instr, false);
+       if (instr->texture_non_uniform)
+               texture_dynamic_index = enter_waterfall(ctx, wctx + 0, texture_dynamic_index, true);
 
-       if (sampler_deref_instr && sampler_deref_instr != texture_deref_instr) {
-               sampler_dynamic_index = get_sampler_desc_index(ctx, sampler_deref_instr, &instr->instr, false);
-               sampler_dynamic_index = enter_waterfall(ctx, wctx + 1, sampler_dynamic_index, instr->sampler_non_uniform);
-       } else
-               sampler_dynamic_index = texture_dynamic_index;
+       if (instr->sampler_non_uniform)
+               sampler_dynamic_index = enter_waterfall(ctx, wctx + 1, sampler_dynamic_index, true);
 
        enum ac_descriptor_type main_descriptor = instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE;
 
@@ -4370,8 +4488,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                        offset_src = i;
                        break;
                case nir_tex_src_bias:
-                       if (instr->op == nir_texop_txb)
-                               args.bias = get_src(ctx, instr->src[i].src);
+                       args.bias = get_src(ctx, instr->src[i].src);
                        break;
                case nir_tex_src_lod: {
                        if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0)
@@ -4391,6 +4508,9 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                case nir_tex_src_ddy:
                        ddy = get_src(ctx, instr->src[i].src);
                        break;
+               case nir_tex_src_min_lod:
+                       args.min_lod = get_src(ctx, instr->src[i].src);
+                       break;
                case nir_tex_src_texture_offset:
                case nir_tex_src_sampler_offset:
                case nir_tex_src_plane:
@@ -4406,6 +4526,8 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
 
        if (instr->op == nir_texop_texture_samples) {
                LLVMValueRef res, samples, is_msaa;
+               LLVMValueRef default_sample;
+
                res = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, "");
                samples = LLVMBuildExtractElement(ctx->ac.builder, res,
                                                  LLVMConstInt(ctx->ac.i32, 3, false), "");
@@ -4422,8 +4544,27 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                                       LLVMConstInt(ctx->ac.i32, 0xf, false), "");
                samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1,
                                       samples, "");
+
+               if (ctx->abi->robust_buffer_access) {
+                       LLVMValueRef dword1, is_null_descriptor;
+
+                       /* Extract the second dword of the descriptor, if it's
+                        * all zero, then it's a null descriptor.
+                        */
+                       dword1 = LLVMBuildExtractElement(ctx->ac.builder, res,
+                                                        LLVMConstInt(ctx->ac.i32, 1, false), "");
+                       is_null_descriptor =
+                               LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, dword1,
+                                             LLVMConstInt(ctx->ac.i32, 0, false), "");
+                       default_sample =
+                               LLVMBuildSelect(ctx->ac.builder, is_null_descriptor,
+                                               ctx->ac.i32_0, ctx->ac.i32_1, "");
+               } else {
+                       default_sample = ctx->ac.i32_1;
+               }
+
                samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples,
-                                         ctx->ac.i32_1, "");
+                                         default_sample, "");
                result = samples;
                goto write_result;
        }
@@ -4631,6 +4772,9 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
                }
        }
 
+       assert(instr->dest.is_ssa);
+       args.d16 = instr->dest.ssa.bit_size == 16;
+
        result = build_tex_intrinsic(ctx, instr, &args);
 
        if (instr->op == nir_texop_query_levels)
@@ -4701,19 +4845,60 @@ static void phi_post_pass(struct ac_nir_context *ctx)
 }
 
 
+/* Return true if the SSA value is consumed by a store_deref intrinsic,
+ * either directly or through a chain of vec4 ALU instructions.
+ *
+ * Only intrinsic and ALU users are examined; any other kind of use is
+ * ignored.
+ */
+static bool is_def_used_in_an_export(const nir_ssa_def *def)
+{
+       nir_foreach_use(use_src, def) {
+               if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
+                       nir_intrinsic_instr *instr = nir_instr_as_intrinsic(use_src->parent_instr);
+                       if (instr->intrinsic == nir_intrinsic_store_deref)
+                               return true;
+               } else if (use_src->parent_instr->type == nir_instr_type_alu) {
+                       nir_alu_instr *instr = nir_instr_as_alu(use_src->parent_instr);
+                       if (instr->op == nir_op_vec4 &&
+                           is_def_used_in_an_export(&instr->dest.dest.ssa)) {
+                               return true;
+                       }
+               }
+       }
+       return false;
+}
+
+/* Materialize a NIR ssa_undef as an integer-typed LLVM value: either a real
+ * undef, or an all-zero constant when the ABI requests undef-to-zero.
+ */
 static void visit_ssa_undef(struct ac_nir_context *ctx,
                            const nir_ssa_undef_instr *instr)
 {
        unsigned num_components = instr->def.num_components;
        LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);
-       LLVMValueRef undef;
 
-       if (num_components == 1)
-               undef = LLVMGetUndef(type);
-       else {
-               undef = LLVMGetUndef(LLVMVectorType(type, num_components));
+       /* Values that reach an export (a store_deref, per
+        * is_def_used_in_an_export) keep a real undef even when the ABI
+        * asks for zeros.
+        */
+       if (!ctx->abi->convert_undef_to_zero || is_def_used_in_an_export(&instr->def)) {
+               LLVMValueRef undef;
+
+               if (num_components == 1)
+                       undef = LLVMGetUndef(type);
+               else {
+                       undef = LLVMGetUndef(LLVMVectorType(type, num_components));
+               }
+               ctx->ssa_defs[instr->def.index] = undef;
+       } else {
+               LLVMValueRef zero = LLVMConstInt(type, 0, false);
+               if (num_components > 1) {
+                       /* One zero per component of the def, not a fixed vec4. */
+                       zero = ac_build_gather_values_extended(
+                               &ctx->ac, &zero, num_components, 0, false, false);
+               }
+               ctx->ssa_defs[instr->def.index] = zero;
        }
-       ctx->ssa_defs[instr->def.index] = undef;
 }
 
 static void visit_jump(struct ac_llvm_context *ctx,
@@ -5169,6 +5340,10 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
        ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                           _mesa_key_pointer_equal);
 
+        if (ctx.abi->kill_ps_if_inf_interp)
+                ctx.verified_interp = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+                                                              _mesa_key_pointer_equal);
+
        func = (struct nir_function *)exec_list_get_head(&nir->functions);
 
        nir_index_ssa_defs(func->impl);
@@ -5203,6 +5378,8 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
        ralloc_free(ctx.defs);
        ralloc_free(ctx.phis);
        ralloc_free(ctx.vars);
+        if (ctx.abi->kill_ps_if_inf_interp)
+                ralloc_free(ctx.verified_interp);
 }
 
 bool