ac/nir: use llvm.amdgcn.rsq for nir_op_frsq
[mesa.git] / src / amd / llvm / ac_nir_to_llvm.c
index 0ecf56c8e309f5b14052653d78da2022503e826e..d1c333ac73d252004c6a54a003cf9ce3394f77e5 100644 (file)
@@ -60,10 +60,16 @@ struct ac_nir_context {
        LLVMValueRef *locals;
 };
 
+static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx,
+                                          nir_deref_instr *deref_instr,
+                                          const nir_instr *instr,
+                                          bool image);
+
 static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
                                     nir_deref_instr *deref_instr,
                                     enum ac_descriptor_type desc_type,
                                     const nir_instr *instr,
+                                    LLVMValueRef index,
                                     bool image, bool write);
 
 static void
@@ -490,6 +496,93 @@ static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx,
        return result;
 }
 
+struct waterfall_context {
+       LLVMBasicBlockRef phi_bb[2];
+       bool use_waterfall;
+};
+
+/* To deal with divergent descriptors we can create a loop that handles all
+ * lanes with the same descriptor on a given iteration (henceforth a
+ * waterfall loop).
+ *
+ * These helper create the begin and end of the loop leaving the caller
+ * to implement the body.
+ * 
+ * params:
+ *  - ctx is the usal nir context
+ *  - wctx is a temporary struct containing some loop info. Can be left uninitialized.
+ *  - value is the possibly divergent value for which we built the loop
+ *  - divergent is whether value is actually divergent. If false we just pass
+ *     things through.
+ */
+static LLVMValueRef enter_waterfall(struct ac_nir_context *ctx,
+                                   struct waterfall_context *wctx,
+                                   LLVMValueRef value, bool divergent)
+{
+       /* If the app claims the value is divergent but it is constant we can
+        * end up with a dynamic index of NULL. */
+       if (!value)
+               divergent = false;
+
+       wctx->use_waterfall = divergent;
+       if (!divergent)
+               return value;
+
+       ac_build_bgnloop(&ctx->ac, 6000);
+
+       LLVMValueRef scalar_value = ac_build_readlane(&ctx->ac, value, NULL);
+
+       LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, value,
+                                           scalar_value, "uniform_active");
+
+       wctx->phi_bb[0] = LLVMGetInsertBlock(ctx->ac.builder);
+       ac_build_ifcc(&ctx->ac, active, 6001);
+
+       return scalar_value;
+}
+
+static LLVMValueRef exit_waterfall(struct ac_nir_context *ctx,
+                                  struct waterfall_context *wctx,
+                                  LLVMValueRef value)
+{
+       LLVMValueRef ret = NULL;
+       LLVMValueRef phi_src[2];
+       LLVMValueRef cc_phi_src[2] = {
+               LLVMConstInt(ctx->ac.i32, 0, false),
+               LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
+       };
+
+       if (!wctx->use_waterfall)
+               return value;
+
+       wctx->phi_bb[1] = LLVMGetInsertBlock(ctx->ac.builder);
+
+       ac_build_endif(&ctx->ac, 6001);
+
+       if (value) {
+               phi_src[0] = LLVMGetUndef(LLVMTypeOf(value));
+               phi_src[1] = value;
+
+               ret = ac_build_phi(&ctx->ac, LLVMTypeOf(value), 2, phi_src, wctx->phi_bb);
+       }
+
+       /*
+        * By using the optimization barrier on the exit decision, we decouple
+        * the operations from the break, and hence avoid LLVM hoisting the
+        * opteration into the break block.
+        */
+       LLVMValueRef cc = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, cc_phi_src, wctx->phi_bb);
+       ac_build_optimization_barrier(&ctx->ac, &cc);
+
+       LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, cc, ctx->ac.i32_0, "uniform_active2");
+       ac_build_ifcc(&ctx->ac, active, 6002);
+       ac_build_break(&ctx->ac);
+       ac_build_endif(&ctx->ac, 6002);
+
+       ac_build_endloop(&ctx->ac, 6000);
+       return ret;
+}
+
 static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
 {
        LLVMValueRef src[4], result = NULL;
@@ -595,8 +688,8 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], "");
                break;
        case nir_op_frcp:
-               src[0] = ac_to_float(&ctx->ac, src[0]);
-               result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(src[0]), 1.0), src[0]);
+               result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rcp",
+                                             ac_to_float_type(&ctx->ac, def_type), src[0]);
                break;
        case nir_op_iand:
                result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], "");
@@ -741,9 +834,8 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
                                              ac_to_float_type(&ctx->ac, def_type), src[0]);
                break;
        case nir_op_frsq:
-               result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
-                                             ac_to_float_type(&ctx->ac, def_type), src[0]);
-               result = ac_build_fdiv(&ctx->ac, LLVMConstReal(LLVMTypeOf(result), 1.0), result);
+               result = emit_intrin_1f_param(&ctx->ac, "llvm.amdgcn.rsq",
+                                             ac_to_float_type(&ctx->ac, def_type), src[0]);
                break;
        case nir_op_frexp_exp:
                src[0] = ac_to_float(&ctx->ac, src[0]);
@@ -1585,6 +1677,15 @@ static unsigned get_cache_policy(struct ac_nir_context *ctx,
        return cache_policy;
 }
 
+static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx,
+                                        struct waterfall_context *wctx,
+                                        const nir_intrinsic_instr *instr,
+                                        nir_src src)
+{
+       return enter_waterfall(ctx, wctx, get_src(ctx, src),
+                              nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
+}
+
 static void visit_store_ssbo(struct ac_nir_context *ctx,
                              nir_intrinsic_instr *instr)
 {
@@ -1601,8 +1702,10 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
        bool writeonly_memory = access & ACCESS_NON_READABLE;
        unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory);
 
-       LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
-                                       get_src(ctx, instr->src[1]), true);
+       struct waterfall_context wctx;
+       LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]);
+
+       LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true);
        LLVMValueRef base_data = src_data;
        base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components);
        LLVMValueRef base_offset = get_src(ctx, instr->src[2]);
@@ -1678,6 +1781,8 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
                }
        }
 
+       exit_waterfall(ctx, &wctx, NULL);
+
        if (ctx->ac.postponed_kill)
                ac_build_endif(&ctx->ac, 7000);
 }
@@ -1741,7 +1846,7 @@ static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx,
 }
 
 static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx,
-                                      const nir_intrinsic_instr *instr)
+                                      nir_intrinsic_instr *instr)
 {
        if (ctx->ac.postponed_kill) {
                LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
@@ -1756,6 +1861,9 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx,
        LLVMValueRef result;
        int arg_count = 0;
 
+       struct waterfall_context wctx;
+       LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]);
+
        switch (instr->intrinsic) {
        case nir_intrinsic_ssbo_atomic_add:
                op = "add";
@@ -1792,64 +1900,66 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx,
        }
 
        descriptor = ctx->abi->load_ssbo(ctx->abi,
-                                        get_src(ctx, instr->src[0]),
+                                        rsrc_base,
                                         true);
 
        if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap &&
            return_type == ctx->ac.i64) {
                result = emit_ssbo_comp_swap_64(ctx, descriptor,
-                                               get_src(ctx, instr->src[1]),
-                                               get_src(ctx, instr->src[2]),
-                                               get_src(ctx, instr->src[3]));
-               if (ctx->ac.postponed_kill)
-                       ac_build_endif(&ctx->ac, 7001);
-               return result;
-       }
-       if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) {
-               params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0);
-       }
-       params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
-       params[arg_count++] = descriptor;
+                                               get_src(ctx, instr->src[1]),
+                                               get_src(ctx, instr->src[2]),
+                                               get_src(ctx, instr->src[3]));
+       } else {
+               if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) {
+                       params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0);
+               }
+               params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
+               params[arg_count++] = descriptor;
 
-       if (LLVM_VERSION_MAJOR >= 9) {
-               /* XXX: The new raw/struct atomic intrinsics are buggy with
-                * LLVM 8, see r358579.
-                */
-               params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
-               params[arg_count++] = ctx->ac.i32_0; /* soffset */
-               params[arg_count++] = ctx->ac.i32_0; /* slc */
+               if (LLVM_VERSION_MAJOR >= 9) {
+                       /* XXX: The new raw/struct atomic intrinsics are buggy with
+                       * LLVM 8, see r358579.
+                       */
+                       params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
+                       params[arg_count++] = ctx->ac.i32_0; /* soffset */
+                       params[arg_count++] = ctx->ac.i32_0; /* slc */
+
+                       ac_build_type_name_for_intr(return_type, type, sizeof(type));
+                       snprintf(name, sizeof(name),
+                                "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type);
+               } else {
+                       params[arg_count++] = ctx->ac.i32_0; /* vindex */
+                       params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
+                       params[arg_count++] = ctx->ac.i1false; /* slc */
 
-               ac_build_type_name_for_intr(return_type, type, sizeof(type));
-               snprintf(name, sizeof(name),
-                        "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type);
-       } else {
-               params[arg_count++] = ctx->ac.i32_0; /* vindex */
-               params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
-               params[arg_count++] = ctx->ac.i1false; /* slc */
+                       assert(return_type == ctx->ac.i32);
+                       snprintf(name, sizeof(name),
+                                "llvm.amdgcn.buffer.atomic.%s", op);
+               }
 
-               assert(return_type == ctx->ac.i32);
-               snprintf(name, sizeof(name),
-                        "llvm.amdgcn.buffer.atomic.%s", op);
+               result = ac_build_intrinsic(&ctx->ac, name, return_type, params,
+                                           arg_count, 0);
        }
 
-       result = ac_build_intrinsic(&ctx->ac, name, return_type, params,
-                                   arg_count, 0);
-       if (ctx->ac.postponed_kill)
+       result = exit_waterfall(ctx, &wctx, result);
+        if (ctx->ac.postponed_kill)
                ac_build_endif(&ctx->ac, 7001);
        return result;
 }
 
 static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
-                                      const nir_intrinsic_instr *instr)
+                                      nir_intrinsic_instr *instr)
 {
+       struct waterfall_context wctx;
+       LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]);
+
        int elem_size_bytes = instr->dest.ssa.bit_size / 8;
        int num_components = instr->num_components;
        enum gl_access_qualifier access = nir_intrinsic_access(instr);
        unsigned cache_policy = get_cache_policy(ctx, access, false, false);
 
        LLVMValueRef offset = get_src(ctx, instr->src[1]);
-       LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
-                                               get_src(ctx, instr->src[0]), false);
+       LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, false);
        LLVMValueRef vindex = ctx->ac.i32_0;
 
        LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa);
@@ -1904,14 +2014,26 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
                i += num_elems;
        }
 
-       return ac_build_gather_values(&ctx->ac, results, num_components);
+       LLVMValueRef ret =  ac_build_gather_values(&ctx->ac, results, num_components);
+       return exit_waterfall(ctx, &wctx, ret);
+}
+
+static LLVMValueRef enter_waterfall_ubo(struct ac_nir_context *ctx,
+                                       struct waterfall_context *wctx,
+                                       const nir_intrinsic_instr *instr)
+{
+       return enter_waterfall(ctx, wctx, get_src(ctx, instr->src[0]),
+                              nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
 }
 
 static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx,
-                                          const nir_intrinsic_instr *instr)
+                                          nir_intrinsic_instr *instr)
 {
+       struct waterfall_context wctx;
+       LLVMValueRef rsrc_base = enter_waterfall_ubo(ctx, &wctx, instr);
+
        LLVMValueRef ret;
-       LLVMValueRef rsrc = get_src(ctx, instr->src[0]);
+       LLVMValueRef rsrc = rsrc_base;
        LLVMValueRef offset = get_src(ctx, instr->src[1]);
        int num_components = instr->num_components;
 
@@ -1953,8 +2075,10 @@ static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx,
                ret = ac_trim_vector(&ctx->ac, ret, num_components);
        }
 
-       return LLVMBuildBitCast(ctx->ac.builder, ret,
+       ret = LLVMBuildBitCast(ctx->ac.builder, ret,
                                get_def_type(ctx, &instr->dest.ssa), "");
+
+       return exit_waterfall(ctx, &wctx, ret);
 }
 
 static void
@@ -2455,6 +2579,7 @@ static nir_deref_instr *get_image_deref(const nir_intrinsic_instr *instr)
 
 static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx,
                                          const nir_intrinsic_instr *instr,
+                                         LLVMValueRef dynamic_index,
                                          enum ac_descriptor_type desc_type,
                                          bool write)
 {
@@ -2462,11 +2587,12 @@ static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx,
                instr->src[0].ssa->parent_instr->type == nir_instr_type_deref ?
                nir_instr_as_deref(instr->src[0].ssa->parent_instr) : NULL;
 
-       return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, true, write);
+       return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, dynamic_index, true, write);
 }
 
 static void get_image_coords(struct ac_nir_context *ctx,
                             const nir_intrinsic_instr *instr,
+                            LLVMValueRef dynamic_desc_index,
                             struct ac_image_args *args,
                             enum glsl_sampler_dim dim,
                             bool is_array)
@@ -2504,7 +2630,7 @@ static void get_image_coords(struct ac_nir_context *ctx,
                                                               fmask_load_address[2],
                                                               sample_index,
                                                               get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
-                                                                               AC_DESC_FMASK, &instr->instr, true, false));
+                                                                               AC_DESC_FMASK, &instr->instr, dynamic_desc_index, true, false));
        }
        if (count == 1 && !gfx9_1d) {
                if (instr->src[1].ssa->num_components)
@@ -2556,9 +2682,10 @@ static void get_image_coords(struct ac_nir_context *ctx,
 
 static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx,
                                                 const nir_intrinsic_instr *instr,
+                                               LLVMValueRef dynamic_index,
                                                bool write, bool atomic)
 {
-       LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_BUFFER, write);
+       LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, write);
        if (ctx->ac.chip_class == GFX9 && LLVM_VERSION_MAJOR < 9 && atomic) {
                LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), "");
                LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), "");
@@ -2574,6 +2701,19 @@ static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx,
        return rsrc;
 }
 
+static LLVMValueRef enter_waterfall_image(struct ac_nir_context *ctx,
+                                         struct waterfall_context *wctx,
+                                         const nir_intrinsic_instr *instr)
+{
+       nir_deref_instr *deref_instr = NULL;
+
+       if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref)
+               deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
+
+       LLVMValueRef value = get_sampler_desc_index(ctx, deref_instr, &instr->instr, true);
+       return enter_waterfall(ctx, wctx, value, nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
+}
+
 static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                                     const nir_intrinsic_instr *instr,
                                     bool bindless)
@@ -2596,6 +2736,9 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                is_array = glsl_sampler_type_is_array(type);
        }
 
+       struct waterfall_context wctx;
+       LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+
        struct ac_image_args args = {};
 
        args.cache_policy = get_cache_policy(ctx, access, false, false);
@@ -2605,7 +2748,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                unsigned num_channels = util_last_bit(mask);
                LLVMValueRef rsrc, vindex;
 
-               rsrc = get_image_buffer_descriptor(ctx, instr, false, false);
+               rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, false, false);
                vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
                                                 ctx->ac.i32_0, "");
 
@@ -2622,8 +2765,8 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
 
                args.opcode = level_zero ? ac_image_load : ac_image_load_mip;
-               args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
-               get_image_coords(ctx, instr, &args, dim, is_array);
+               args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
+               get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
                args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
                if (!level_zero)
                        args.lod = get_src(ctx, instr->src[3]);
@@ -2632,11 +2775,11 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
 
                res = ac_build_image_opcode(&ctx->ac, &args);
        }
-       return res;
+       return exit_waterfall(ctx, &wctx, res);
 }
 
 static void visit_image_store(struct ac_nir_context *ctx,
-                             nir_intrinsic_instr *instr,
+                             const nir_intrinsic_instr *instr,
                              bool bindless)
 {
        if (ctx->ac.postponed_kill) {
@@ -2648,6 +2791,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
        enum glsl_sampler_dim dim;
        enum gl_access_qualifier access;
        bool is_array;
+
        if (bindless) {
                dim = nir_intrinsic_image_dim(instr);
                access = nir_intrinsic_access(instr);
@@ -2661,13 +2805,16 @@ static void visit_image_store(struct ac_nir_context *ctx,
                is_array = glsl_sampler_type_is_array(type);
        }
 
+       struct waterfall_context wctx;
+       LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+
        bool writeonly_memory = access & ACCESS_NON_READABLE;
        struct ac_image_args args = {};
 
        args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory);
 
        if (dim == GLSL_SAMPLER_DIM_BUF) {
-               LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false);
+               LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, false);
                LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3]));
                unsigned src_channels = ac_get_llvm_num_components(src);
                LLVMValueRef vindex;
@@ -2687,8 +2834,8 @@ static void visit_image_store(struct ac_nir_context *ctx,
 
                args.opcode = level_zero ? ac_image_store : ac_image_store_mip;
                args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3]));
-               args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true);
-               get_image_coords(ctx, instr, &args, dim, is_array);
+               args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true);
+               get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
                args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
                if (!level_zero)
                        args.lod = get_src(ctx, instr->src[4]);
@@ -2697,13 +2844,14 @@ static void visit_image_store(struct ac_nir_context *ctx,
                ac_build_image_opcode(&ctx->ac, &args);
        }
 
+       exit_waterfall(ctx, &wctx, NULL);
        if (ctx->ac.postponed_kill)
                ac_build_endif(&ctx->ac, 7003);
 }
 
 static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
-                                       const nir_intrinsic_instr *instr,
-                                       bool bindless)
+                                    const nir_intrinsic_instr *instr,
+                                    bool bindless)
 {
        if (ctx->ac.postponed_kill) {
                LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
@@ -2739,6 +2887,9 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
                is_array = glsl_sampler_type_is_array(type);
        }
 
+       struct waterfall_context wctx;
+       LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+
        switch (instr->intrinsic) {
        case nir_intrinsic_bindless_image_atomic_add:
        case nir_intrinsic_image_deref_atomic_add:
@@ -2821,7 +2972,7 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
 
        LLVMValueRef result;
        if (dim == GLSL_SAMPLER_DIM_BUF) {
-               params[param_count++] = get_image_buffer_descriptor(ctx, instr, true, true);
+               params[param_count++] = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, true);
                params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
                                                                ctx->ac.i32_0, ""); /* vindex */
                params[param_count++] = ctx->ac.i32_0; /* voffset */
@@ -2851,24 +3002,29 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
                args.data[0] = params[0];
                if (cmpswap)
                        args.data[1] = params[1];
-               args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true);
-               get_image_coords(ctx, instr, &args, dim, is_array);
+               args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true);
+               get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
                args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
 
                result = ac_build_image_opcode(&ctx->ac, &args);
        }
 
+       result = exit_waterfall(ctx, &wctx, result);
        if (ctx->ac.postponed_kill)
                ac_build_endif(&ctx->ac, 7004);
        return result;
 }
 
 static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx,
-                                       const nir_intrinsic_instr *instr)
+                                       nir_intrinsic_instr *instr)
 {
-       LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
+       struct waterfall_context wctx;
+       LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+       LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
 
-       return ac_build_image_get_sample_count(&ctx->ac, rsrc);
+       LLVMValueRef ret = ac_build_image_get_sample_count(&ctx->ac, rsrc);
+
+       return exit_waterfall(ctx, &wctx, ret);
 }
 
 static LLVMValueRef visit_image_size(struct ac_nir_context *ctx,
@@ -2888,35 +3044,40 @@ static LLVMValueRef visit_image_size(struct ac_nir_context *ctx,
                is_array = glsl_sampler_type_is_array(type);
        }
 
-       if (dim == GLSL_SAMPLER_DIM_BUF)
-               return get_buffer_size(ctx, get_image_descriptor(ctx, instr, AC_DESC_BUFFER, false), true);
+       struct waterfall_context wctx;
+       LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
 
-       struct ac_image_args args = { 0 };
+       if (dim == GLSL_SAMPLER_DIM_BUF) {
+               res =  get_buffer_size(ctx, get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, false), true);
+       } else {
 
-       args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
-       args.dmask = 0xf;
-       args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
-       args.opcode = ac_image_get_resinfo;
-       args.lod = ctx->ac.i32_0;
-       args.attributes = AC_FUNC_ATTR_READNONE;
+               struct ac_image_args args = { 0 };
 
-       res = ac_build_image_opcode(&ctx->ac, &args);
+               args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
+               args.dmask = 0xf;
+               args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
+               args.opcode = ac_image_get_resinfo;
+               args.lod = ctx->ac.i32_0;
+               args.attributes = AC_FUNC_ATTR_READNONE;
 
-       LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
+               res = ac_build_image_opcode(&ctx->ac, &args);
 
-       if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) {
-               LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
-               LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
-               z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
-               res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, "");
-       }
-       if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
-               LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
-               res = LLVMBuildInsertElement(ctx->ac.builder, res, layers,
-                                               ctx->ac.i32_1, "");
+               LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
 
+               if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) {
+                       LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
+                       LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
+                       z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
+                       res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, "");
+               }
+
+               if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
+                       LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
+                       res = LLVMBuildInsertElement(ctx->ac.builder, res, layers,
+                                                    ctx->ac.i32_1, "");
+               }
        }
-       return res;
+       return exit_waterfall(ctx, &wctx, res);
 }
 
 static void emit_membar(struct ac_llvm_context *ac,
@@ -3788,8 +3949,33 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
                break;
        }
        case nir_intrinsic_shuffle:
-               result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]),
-                               get_src(ctx, instr->src[1]));
+               if (ctx->ac.chip_class == GFX8 ||
+                   ctx->ac.chip_class == GFX9 ||
+                   (ctx->ac.chip_class == GFX10 && ctx->ac.wave_size == 32)) {
+                       result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]),
+                                                 get_src(ctx, instr->src[1]));
+               } else {
+                       LLVMValueRef src = get_src(ctx, instr->src[0]);
+                       LLVMValueRef index = get_src(ctx, instr->src[1]);
+                       LLVMTypeRef type = LLVMTypeOf(src);
+                       struct waterfall_context wctx;
+                       LLVMValueRef index_val;
+
+                       index_val = enter_waterfall(ctx, &wctx, index, true);
+
+                       src = LLVMBuildZExt(ctx->ac.builder, src,
+                                           ctx->ac.i32, "");
+
+                       result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.readlane",
+                                                   ctx->ac.i32,
+                                                   (LLVMValueRef []) { src, index_val }, 2,
+                                                   AC_FUNC_ATTR_READNONE |
+                                                   AC_FUNC_ATTR_CONVERGENT);
+
+                       result = LLVMBuildTrunc(ctx->ac.builder, result, type, "");
+
+                       result = exit_waterfall(ctx, &wctx, result);
+               }
                break;
        case nir_intrinsic_reduce:
                result = ac_build_reduce(&ctx->ac,
@@ -3947,11 +4133,20 @@ static LLVMValueRef get_bindless_index_from_uniform(struct ac_nir_context *ctx,
        return LLVMBuildBitCast(ctx->ac.builder, ret, ctx->ac.i32, "");
 }
 
-static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
-                                    nir_deref_instr *deref_instr,
-                                    enum ac_descriptor_type desc_type,
-                                    const nir_instr *instr,
-                                    bool image, bool write)
+struct sampler_desc_address {
+       unsigned descriptor_set;
+       unsigned base_index; /* binding in vulkan */
+       unsigned constant_index;
+       LLVMValueRef dynamic_index;
+       bool image;
+       bool bindless;
+};
+
+static struct sampler_desc_address
+get_sampler_desc_internal(struct ac_nir_context *ctx,
+                         nir_deref_instr *deref_instr,
+                         const nir_instr *instr,
+                         bool image)
 {
        LLVMValueRef index = NULL;
        unsigned constant_index = 0;
@@ -4024,12 +4219,40 @@ static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
                } else
                        base_index = deref_instr->var->data.binding;
        }
+       return (struct sampler_desc_address) {
+               .descriptor_set = descriptor_set,
+               .base_index = base_index,
+               .constant_index = constant_index,
+               .dynamic_index = index,
+               .image = image,
+               .bindless = bindless,
+       };
+}
 
+/* Extract any possibly divergent index into a separate value that can be fed
+ * into get_sampler_desc with the same arguments. */
+static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx,
+                                          nir_deref_instr *deref_instr,
+                                          const nir_instr *instr,
+                                          bool image)
+{
+       struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
+       return addr.dynamic_index;
+}
+
+static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
+                                    nir_deref_instr *deref_instr,
+                                    enum ac_descriptor_type desc_type,
+                                    const nir_instr *instr,
+                                    LLVMValueRef index,
+                                    bool image, bool write)
+{
+       struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
        return ctx->abi->load_sampler_desc(ctx->abi,
-                                         descriptor_set,
-                                         base_index,
-                                         constant_index, index,
-                                         desc_type, image, write, bindless);
+                                         addr.descriptor_set,
+                                         addr.base_index,
+                                         addr.constant_index, index,
+                                         desc_type, addr.image, write, addr.bindless);
 }
 
 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
@@ -4063,6 +4286,7 @@ static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx,
 
 static void tex_fetch_ptrs(struct ac_nir_context *ctx,
                           nir_tex_instr *instr,
+                          struct waterfall_context *wctx,
                           LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
                           LLVMValueRef *fmask_ptr)
 {
@@ -4086,9 +4310,19 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx,
                }
        }
 
+       LLVMValueRef texture_dynamic_index = get_sampler_desc_index(ctx, texture_deref_instr,
+                                                                   &instr->instr, false);
        if (!sampler_deref_instr)
                sampler_deref_instr = texture_deref_instr;
 
+        LLVMValueRef sampler_dynamic_index = get_sampler_desc_index(ctx, sampler_deref_instr,
+                                                                   &instr->instr, false);
+       if (instr->texture_non_uniform)
+               texture_dynamic_index = enter_waterfall(ctx, wctx + 0, texture_dynamic_index, true);
+
+       if (instr->sampler_non_uniform)
+               sampler_dynamic_index = enter_waterfall(ctx, wctx + 1, sampler_dynamic_index, true);
+
        enum ac_descriptor_type main_descriptor = instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE;
 
        if (plane >= 0) {
@@ -4106,16 +4340,19 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx,
                main_descriptor = AC_DESC_FMASK;
        }
 
-       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr, false, false);
+       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr,
+                                   texture_dynamic_index, false, false);
 
        if (samp_ptr) {
-               *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr, false, false);
+               *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr,
+                                            sampler_dynamic_index, false, false);
                if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT)
                        *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
        }
        if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
                          instr->op == nir_texop_samples_identical))
-               *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, &instr->instr, false, false);
+               *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK,
+                                             &instr->instr, texture_dynamic_index, false, false);
 }
 
 static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx,
@@ -4134,8 +4371,9 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
        LLVMValueRef fmask_ptr = NULL, sample_index = NULL;
        LLVMValueRef ddx = NULL, ddy = NULL;
        unsigned offset_src = 0;
+       struct waterfall_context wctx[2] = {{{0}}};
 
-       tex_fetch_ptrs(ctx, instr, &args.resource, &args.sampler, &fmask_ptr);
+       tex_fetch_ptrs(ctx, instr, wctx, &args.resource, &args.sampler, &fmask_ptr);
 
        for (unsigned i = 0; i < instr->num_srcs; i++) {
                switch (instr->src[i].src_type) {
@@ -4450,11 +4688,15 @@ write_result:
        if (result) {
                assert(instr->dest.is_ssa);
                result = ac_to_integer(&ctx->ac, result);
+
+               for (int i = ARRAY_SIZE(wctx); --i >= 0;) {
+                       result =  exit_waterfall(ctx, wctx + i, result);
+               }
+
                ctx->ssa_defs[instr->dest.ssa.index] = result;
        }
 }
 
-
 static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr)
 {
        LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa);