amd/llvm: Fix divergent descriptor indexing. (v3)
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Mon, 2 Dec 2019 08:53:37 +0000 (09:53 +0100)
committer Marge Bot <eric+marge@anholt.net>
Thu, 12 Mar 2020 16:12:02 +0000 (16:12 +0000)
There are multiple LLVM passes that readily move the
intrinsic using the descriptor outside of the loop, defeating
the entire point of creating the loop.

Defeat the optimizer by splitting the break into a separate
if-statement and putting an optimization barrier on the bool
in between.

v2: Move from a callback based system to begin/end loop.
    This does not make it significantly less intrusive but
    is a bit nicer without all the extra struct and callback
    stubs.
v3: Deal with non-divergent values in divergent path.

Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2160
Fixes: 028ce527395 "radv: Add non-uniform indexing lowering."
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4109>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4109>

src/amd/llvm/ac_nir_to_llvm.c
src/amd/vulkan/radv_pipeline.c

index 0ecf56c8e309f5b14052653d78da2022503e826e..fec68c469137bfce31eac300f0e51a76595eda36 100644 (file)
@@ -60,10 +60,16 @@ struct ac_nir_context {
        LLVMValueRef *locals;
 };
 
+static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx,
+                                          nir_deref_instr *deref_instr,
+                                          const nir_instr *instr,
+                                          bool image);
+
 static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
                                     nir_deref_instr *deref_instr,
                                     enum ac_descriptor_type desc_type,
                                     const nir_instr *instr,
+                                    LLVMValueRef index,
                                     bool image, bool write);
 
 static void
@@ -490,6 +496,93 @@ static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx,
        return result;
 }
 
+struct waterfall_context {
+       LLVMBasicBlockRef phi_bb[2];
+       bool use_waterfall;
+};
+
+/* To deal with divergent descriptors we can create a loop that handles all
+ * lanes with the same descriptor on a given iteration (henceforth a
+ * waterfall loop).
+ *
+ * These helpers create the beginning and end of the loop, leaving the caller
+ * to implement the body.
+ * 
+ * params:
+ *  - ctx is the usual nir context
+ *  - wctx is a temporary struct containing some loop info. Can be left uninitialized.
+ *  - value is the possibly divergent value for which we build the loop
+ *  - divergent is whether value is actually divergent. If false we just pass
+ *     things through.
+ */
+static LLVMValueRef enter_waterfall(struct ac_nir_context *ctx,
+                                   struct waterfall_context *wctx,
+                                   LLVMValueRef value, bool divergent)
+{
+       /* If the app claims the value is divergent but it is constant we can
+        * end up with a dynamic index of NULL. */
+       if (!value)
+               divergent = false;
+
+       wctx->use_waterfall = divergent;
+       if (!divergent)
+               return value;
+
+       ac_build_bgnloop(&ctx->ac, 6000);
+
+       LLVMValueRef scalar_value = ac_build_readlane(&ctx->ac, value, NULL);
+
+       LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, value,
+                                           scalar_value, "uniform_active");
+
+       wctx->phi_bb[0] = LLVMGetInsertBlock(ctx->ac.builder);
+       ac_build_ifcc(&ctx->ac, active, 6001);
+
+       return scalar_value;
+}
+
+static LLVMValueRef exit_waterfall(struct ac_nir_context *ctx,
+                                  struct waterfall_context *wctx,
+                                  LLVMValueRef value)
+{
+       LLVMValueRef ret = NULL;
+       LLVMValueRef phi_src[2];
+       LLVMValueRef cc_phi_src[2] = {
+               LLVMConstInt(ctx->ac.i32, 0, false),
+               LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
+       };
+
+       if (!wctx->use_waterfall)
+               return value;
+
+       wctx->phi_bb[1] = LLVMGetInsertBlock(ctx->ac.builder);
+
+       ac_build_endif(&ctx->ac, 6001);
+
+       if (value) {
+               phi_src[0] = LLVMGetUndef(LLVMTypeOf(value));
+               phi_src[1] = value;
+
+               ret = ac_build_phi(&ctx->ac, LLVMTypeOf(value), 2, phi_src, wctx->phi_bb);
+       }
+
+       /*
+        * By using the optimization barrier on the exit decision, we decouple
+        * the operations from the break, and hence avoid LLVM hoisting the
+        * operation into the break block.
+        */
+       LLVMValueRef cc = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, cc_phi_src, wctx->phi_bb);
+       ac_build_optimization_barrier(&ctx->ac, &cc);
+
+       LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, cc, ctx->ac.i32_0, "uniform_active2");
+       ac_build_ifcc(&ctx->ac, active, 6002);
+       ac_build_break(&ctx->ac);
+       ac_build_endif(&ctx->ac, 6002);
+
+       ac_build_endloop(&ctx->ac, 6000);
+       return ret;
+}
+
 static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
 {
        LLVMValueRef src[4], result = NULL;
@@ -1585,6 +1678,15 @@ static unsigned get_cache_policy(struct ac_nir_context *ctx,
        return cache_policy;
 }
 
+static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx,
+                                        struct waterfall_context *wctx,
+                                        const nir_intrinsic_instr *instr,
+                                        nir_src src)
+{
+       return enter_waterfall(ctx, wctx, get_src(ctx, src),
+                              nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
+}
+
 static void visit_store_ssbo(struct ac_nir_context *ctx,
                              nir_intrinsic_instr *instr)
 {
@@ -1601,8 +1703,10 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
        bool writeonly_memory = access & ACCESS_NON_READABLE;
        unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory);
 
-       LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
-                                       get_src(ctx, instr->src[1]), true);
+       struct waterfall_context wctx;
+       LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]);
+
+       LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true);
        LLVMValueRef base_data = src_data;
        base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components);
        LLVMValueRef base_offset = get_src(ctx, instr->src[2]);
@@ -1678,6 +1782,8 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
                }
        }
 
+       exit_waterfall(ctx, &wctx, NULL);
+
        if (ctx->ac.postponed_kill)
                ac_build_endif(&ctx->ac, 7000);
 }
@@ -1741,7 +1847,7 @@ static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx,
 }
 
 static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx,
-                                      const nir_intrinsic_instr *instr)
+                                      nir_intrinsic_instr *instr)
 {
        if (ctx->ac.postponed_kill) {
                LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
@@ -1756,6 +1862,9 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx,
        LLVMValueRef result;
        int arg_count = 0;
 
+       struct waterfall_context wctx;
+       LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]);
+
        switch (instr->intrinsic) {
        case nir_intrinsic_ssbo_atomic_add:
                op = "add";
@@ -1792,64 +1901,66 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx,
        }
 
        descriptor = ctx->abi->load_ssbo(ctx->abi,
-                                        get_src(ctx, instr->src[0]),
+                                        rsrc_base,
                                         true);
 
        if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap &&
            return_type == ctx->ac.i64) {
                result = emit_ssbo_comp_swap_64(ctx, descriptor,
-                                               get_src(ctx, instr->src[1]),
-                                               get_src(ctx, instr->src[2]),
-                                               get_src(ctx, instr->src[3]));
-               if (ctx->ac.postponed_kill)
-                       ac_build_endif(&ctx->ac, 7001);
-               return result;
-       }
-       if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) {
-               params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0);
-       }
-       params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
-       params[arg_count++] = descriptor;
+                                               get_src(ctx, instr->src[1]),
+                                               get_src(ctx, instr->src[2]),
+                                               get_src(ctx, instr->src[3]));
+       } else {
+               if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) {
+                       params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0);
+               }
+               params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
+               params[arg_count++] = descriptor;
 
-       if (LLVM_VERSION_MAJOR >= 9) {
-               /* XXX: The new raw/struct atomic intrinsics are buggy with
-                * LLVM 8, see r358579.
-                */
-               params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
-               params[arg_count++] = ctx->ac.i32_0; /* soffset */
-               params[arg_count++] = ctx->ac.i32_0; /* slc */
+               if (LLVM_VERSION_MAJOR >= 9) {
+                       /* XXX: The new raw/struct atomic intrinsics are buggy with
+                       * LLVM 8, see r358579.
+                       */
+                       params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
+                       params[arg_count++] = ctx->ac.i32_0; /* soffset */
+                       params[arg_count++] = ctx->ac.i32_0; /* slc */
+
+                       ac_build_type_name_for_intr(return_type, type, sizeof(type));
+                       snprintf(name, sizeof(name),
+                                "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type);
+               } else {
+                       params[arg_count++] = ctx->ac.i32_0; /* vindex */
+                       params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
+                       params[arg_count++] = ctx->ac.i1false; /* slc */
 
-               ac_build_type_name_for_intr(return_type, type, sizeof(type));
-               snprintf(name, sizeof(name),
-                        "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type);
-       } else {
-               params[arg_count++] = ctx->ac.i32_0; /* vindex */
-               params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
-               params[arg_count++] = ctx->ac.i1false; /* slc */
+                       assert(return_type == ctx->ac.i32);
+                       snprintf(name, sizeof(name),
+                                "llvm.amdgcn.buffer.atomic.%s", op);
+               }
 
-               assert(return_type == ctx->ac.i32);
-               snprintf(name, sizeof(name),
-                        "llvm.amdgcn.buffer.atomic.%s", op);
+               result = ac_build_intrinsic(&ctx->ac, name, return_type, params,
+                                           arg_count, 0);
        }
 
-       result = ac_build_intrinsic(&ctx->ac, name, return_type, params,
-                                   arg_count, 0);
-       if (ctx->ac.postponed_kill)
+       result = exit_waterfall(ctx, &wctx, result);
+        if (ctx->ac.postponed_kill)
                ac_build_endif(&ctx->ac, 7001);
        return result;
 }
 
 static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
-                                      const nir_intrinsic_instr *instr)
+                                      nir_intrinsic_instr *instr)
 {
+       struct waterfall_context wctx;
+       LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]);
+
        int elem_size_bytes = instr->dest.ssa.bit_size / 8;
        int num_components = instr->num_components;
        enum gl_access_qualifier access = nir_intrinsic_access(instr);
        unsigned cache_policy = get_cache_policy(ctx, access, false, false);
 
        LLVMValueRef offset = get_src(ctx, instr->src[1]);
-       LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
-                                               get_src(ctx, instr->src[0]), false);
+       LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, false);
        LLVMValueRef vindex = ctx->ac.i32_0;
 
        LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa);
@@ -1904,14 +2015,26 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
                i += num_elems;
        }
 
-       return ac_build_gather_values(&ctx->ac, results, num_components);
+       LLVMValueRef ret =  ac_build_gather_values(&ctx->ac, results, num_components);
+       return exit_waterfall(ctx, &wctx, ret);
+}
+
+static LLVMValueRef enter_waterfall_ubo(struct ac_nir_context *ctx,
+                                       struct waterfall_context *wctx,
+                                       const nir_intrinsic_instr *instr)
+{
+       return enter_waterfall(ctx, wctx, get_src(ctx, instr->src[0]),
+                              nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
 }
 
 static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx,
-                                          const nir_intrinsic_instr *instr)
+                                          nir_intrinsic_instr *instr)
 {
+       struct waterfall_context wctx;
+       LLVMValueRef rsrc_base = enter_waterfall_ubo(ctx, &wctx, instr);
+
        LLVMValueRef ret;
-       LLVMValueRef rsrc = get_src(ctx, instr->src[0]);
+       LLVMValueRef rsrc = rsrc_base;
        LLVMValueRef offset = get_src(ctx, instr->src[1]);
        int num_components = instr->num_components;
 
@@ -1953,8 +2076,10 @@ static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx,
                ret = ac_trim_vector(&ctx->ac, ret, num_components);
        }
 
-       return LLVMBuildBitCast(ctx->ac.builder, ret,
+       ret = LLVMBuildBitCast(ctx->ac.builder, ret,
                                get_def_type(ctx, &instr->dest.ssa), "");
+
+       return exit_waterfall(ctx, &wctx, ret);
 }
 
 static void
@@ -2455,6 +2580,7 @@ static nir_deref_instr *get_image_deref(const nir_intrinsic_instr *instr)
 
 static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx,
                                          const nir_intrinsic_instr *instr,
+                                         LLVMValueRef dynamic_index,
                                          enum ac_descriptor_type desc_type,
                                          bool write)
 {
@@ -2462,11 +2588,12 @@ static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx,
                instr->src[0].ssa->parent_instr->type == nir_instr_type_deref ?
                nir_instr_as_deref(instr->src[0].ssa->parent_instr) : NULL;
 
-       return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, true, write);
+       return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, dynamic_index, true, write);
 }
 
 static void get_image_coords(struct ac_nir_context *ctx,
                             const nir_intrinsic_instr *instr,
+                            LLVMValueRef dynamic_desc_index,
                             struct ac_image_args *args,
                             enum glsl_sampler_dim dim,
                             bool is_array)
@@ -2504,7 +2631,7 @@ static void get_image_coords(struct ac_nir_context *ctx,
                                                               fmask_load_address[2],
                                                               sample_index,
                                                               get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
-                                                                               AC_DESC_FMASK, &instr->instr, true, false));
+                                                                               AC_DESC_FMASK, &instr->instr, dynamic_desc_index, true, false));
        }
        if (count == 1 && !gfx9_1d) {
                if (instr->src[1].ssa->num_components)
@@ -2556,9 +2683,10 @@ static void get_image_coords(struct ac_nir_context *ctx,
 
 static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx,
                                                 const nir_intrinsic_instr *instr,
+                                               LLVMValueRef dynamic_index,
                                                bool write, bool atomic)
 {
-       LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_BUFFER, write);
+       LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, write);
        if (ctx->ac.chip_class == GFX9 && LLVM_VERSION_MAJOR < 9 && atomic) {
                LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), "");
                LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), "");
@@ -2574,6 +2702,15 @@ static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx,
        return rsrc;
 }
 
+static LLVMValueRef enter_waterfall_image(struct ac_nir_context *ctx,
+                                         struct waterfall_context *wctx,
+                                         const nir_intrinsic_instr *instr)
+{
+       nir_deref_instr *deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
+       LLVMValueRef value = get_sampler_desc_index(ctx, deref_instr, &instr->instr, true);
+       return enter_waterfall(ctx, wctx, value, nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
+}
+
 static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                                     const nir_intrinsic_instr *instr,
                                     bool bindless)
@@ -2596,6 +2733,9 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                is_array = glsl_sampler_type_is_array(type);
        }
 
+       struct waterfall_context wctx;
+       LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+
        struct ac_image_args args = {};
 
        args.cache_policy = get_cache_policy(ctx, access, false, false);
@@ -2605,7 +2745,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                unsigned num_channels = util_last_bit(mask);
                LLVMValueRef rsrc, vindex;
 
-               rsrc = get_image_buffer_descriptor(ctx, instr, false, false);
+               rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, false, false);
                vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
                                                 ctx->ac.i32_0, "");
 
@@ -2622,8 +2762,8 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
                bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
 
                args.opcode = level_zero ? ac_image_load : ac_image_load_mip;
-               args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
-               get_image_coords(ctx, instr, &args, dim, is_array);
+               args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
+               get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
                args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
                if (!level_zero)
                        args.lod = get_src(ctx, instr->src[3]);
@@ -2632,11 +2772,11 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
 
                res = ac_build_image_opcode(&ctx->ac, &args);
        }
-       return res;
+       return exit_waterfall(ctx, &wctx, res);
 }
 
 static void visit_image_store(struct ac_nir_context *ctx,
-                             nir_intrinsic_instr *instr,
+                             const nir_intrinsic_instr *instr,
                              bool bindless)
 {
        if (ctx->ac.postponed_kill) {
@@ -2648,6 +2788,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
        enum glsl_sampler_dim dim;
        enum gl_access_qualifier access;
        bool is_array;
+
        if (bindless) {
                dim = nir_intrinsic_image_dim(instr);
                access = nir_intrinsic_access(instr);
@@ -2661,13 +2802,16 @@ static void visit_image_store(struct ac_nir_context *ctx,
                is_array = glsl_sampler_type_is_array(type);
        }
 
+       struct waterfall_context wctx;
+       LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+
        bool writeonly_memory = access & ACCESS_NON_READABLE;
        struct ac_image_args args = {};
 
        args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory);
 
        if (dim == GLSL_SAMPLER_DIM_BUF) {
-               LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false);
+               LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, false);
                LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3]));
                unsigned src_channels = ac_get_llvm_num_components(src);
                LLVMValueRef vindex;
@@ -2687,8 +2831,8 @@ static void visit_image_store(struct ac_nir_context *ctx,
 
                args.opcode = level_zero ? ac_image_store : ac_image_store_mip;
                args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3]));
-               args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true);
-               get_image_coords(ctx, instr, &args, dim, is_array);
+               args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true);
+               get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
                args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
                if (!level_zero)
                        args.lod = get_src(ctx, instr->src[4]);
@@ -2697,13 +2841,14 @@ static void visit_image_store(struct ac_nir_context *ctx,
                ac_build_image_opcode(&ctx->ac, &args);
        }
 
+       exit_waterfall(ctx, &wctx, NULL);
        if (ctx->ac.postponed_kill)
                ac_build_endif(&ctx->ac, 7003);
 }
 
 static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
-                                       const nir_intrinsic_instr *instr,
-                                       bool bindless)
+                                    const nir_intrinsic_instr *instr,
+                                    bool bindless)
 {
        if (ctx->ac.postponed_kill) {
                LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
@@ -2739,6 +2884,9 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
                is_array = glsl_sampler_type_is_array(type);
        }
 
+       struct waterfall_context wctx;
+       LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+
        switch (instr->intrinsic) {
        case nir_intrinsic_bindless_image_atomic_add:
        case nir_intrinsic_image_deref_atomic_add:
@@ -2821,7 +2969,7 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
 
        LLVMValueRef result;
        if (dim == GLSL_SAMPLER_DIM_BUF) {
-               params[param_count++] = get_image_buffer_descriptor(ctx, instr, true, true);
+               params[param_count++] = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, true);
                params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
                                                                ctx->ac.i32_0, ""); /* vindex */
                params[param_count++] = ctx->ac.i32_0; /* voffset */
@@ -2851,24 +2999,29 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
                args.data[0] = params[0];
                if (cmpswap)
                        args.data[1] = params[1];
-               args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true);
-               get_image_coords(ctx, instr, &args, dim, is_array);
+               args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true);
+               get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
                args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
 
                result = ac_build_image_opcode(&ctx->ac, &args);
        }
 
+       result = exit_waterfall(ctx, &wctx, result);
        if (ctx->ac.postponed_kill)
                ac_build_endif(&ctx->ac, 7004);
        return result;
 }
 
 static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx,
-                                       const nir_intrinsic_instr *instr)
+                                       nir_intrinsic_instr *instr)
 {
-       LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
+       struct waterfall_context wctx;
+       LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
+       LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
 
-       return ac_build_image_get_sample_count(&ctx->ac, rsrc);
+       LLVMValueRef ret = ac_build_image_get_sample_count(&ctx->ac, rsrc);
+
+       return exit_waterfall(ctx, &wctx, ret);
 }
 
 static LLVMValueRef visit_image_size(struct ac_nir_context *ctx,
@@ -2888,35 +3041,40 @@ static LLVMValueRef visit_image_size(struct ac_nir_context *ctx,
                is_array = glsl_sampler_type_is_array(type);
        }
 
-       if (dim == GLSL_SAMPLER_DIM_BUF)
-               return get_buffer_size(ctx, get_image_descriptor(ctx, instr, AC_DESC_BUFFER, false), true);
+       struct waterfall_context wctx;
+       LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
 
-       struct ac_image_args args = { 0 };
+       if (dim == GLSL_SAMPLER_DIM_BUF) {
+               res =  get_buffer_size(ctx, get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, false), true);
+       } else {
 
-       args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
-       args.dmask = 0xf;
-       args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
-       args.opcode = ac_image_get_resinfo;
-       args.lod = ctx->ac.i32_0;
-       args.attributes = AC_FUNC_ATTR_READNONE;
+               struct ac_image_args args = { 0 };
 
-       res = ac_build_image_opcode(&ctx->ac, &args);
+               args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
+               args.dmask = 0xf;
+               args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
+               args.opcode = ac_image_get_resinfo;
+               args.lod = ctx->ac.i32_0;
+               args.attributes = AC_FUNC_ATTR_READNONE;
 
-       LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
+               res = ac_build_image_opcode(&ctx->ac, &args);
 
-       if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) {
-               LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
-               LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
-               z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
-               res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, "");
-       }
-       if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
-               LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
-               res = LLVMBuildInsertElement(ctx->ac.builder, res, layers,
-                                               ctx->ac.i32_1, "");
+               LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
+
+               if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) {
+                       LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
+                       LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
+                       z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
+                       res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, "");
+               }
 
+               if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
+                       LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
+                       res = LLVMBuildInsertElement(ctx->ac.builder, res, layers,
+                                                    ctx->ac.i32_1, "");
+               }
        }
-       return res;
+       return exit_waterfall(ctx, &wctx, res);
 }
 
 static void emit_membar(struct ac_llvm_context *ac,
@@ -3947,11 +4105,20 @@ static LLVMValueRef get_bindless_index_from_uniform(struct ac_nir_context *ctx,
        return LLVMBuildBitCast(ctx->ac.builder, ret, ctx->ac.i32, "");
 }
 
-static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
-                                    nir_deref_instr *deref_instr,
-                                    enum ac_descriptor_type desc_type,
-                                    const nir_instr *instr,
-                                    bool image, bool write)
+struct sampler_desc_address {
+       unsigned descriptor_set;
+       unsigned base_index; /* binding in vulkan */
+       unsigned constant_index;
+       LLVMValueRef dynamic_index;
+       bool image;
+       bool bindless;
+};
+
+static struct sampler_desc_address
+get_sampler_desc_internal(struct ac_nir_context *ctx,
+                         nir_deref_instr *deref_instr,
+                         const nir_instr *instr,
+                         bool image)
 {
        LLVMValueRef index = NULL;
        unsigned constant_index = 0;
@@ -4024,12 +4191,40 @@ static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
                } else
                        base_index = deref_instr->var->data.binding;
        }
+       return (struct sampler_desc_address) {
+               .descriptor_set = descriptor_set,
+               .base_index = base_index,
+               .constant_index = constant_index,
+               .dynamic_index = index,
+               .image = image,
+               .bindless = bindless,
+       };
+}
 
+/* Extract any possibly divergent index into a separate value that can be fed
+ * into get_sampler_desc with the same arguments. */
+static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx,
+                                          nir_deref_instr *deref_instr,
+                                          const nir_instr *instr,
+                                          bool image)
+{
+       struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
+       return addr.dynamic_index;
+}
+
+static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
+                                    nir_deref_instr *deref_instr,
+                                    enum ac_descriptor_type desc_type,
+                                    const nir_instr *instr,
+                                    LLVMValueRef index,
+                                    bool image, bool write)
+{
+       struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
        return ctx->abi->load_sampler_desc(ctx->abi,
-                                         descriptor_set,
-                                         base_index,
-                                         constant_index, index,
-                                         desc_type, image, write, bindless);
+                                         addr.descriptor_set,
+                                         addr.base_index,
+                                         addr.constant_index, index,
+                                         desc_type, addr.image, write, addr.bindless);
 }
 
 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
@@ -4063,6 +4258,7 @@ static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx,
 
 static void tex_fetch_ptrs(struct ac_nir_context *ctx,
                           nir_tex_instr *instr,
+                          struct waterfall_context *wctx, /* array: [0] is used for the texture index, [1] for the sampler index */
                           LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
                           LLVMValueRef *fmask_ptr)
 {
@@ -4089,6 +4285,18 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx,
        if (!sampler_deref_instr)
                sampler_deref_instr = texture_deref_instr;
 
+       LLVMValueRef texture_dynamic_index = NULL, sampler_dynamic_index = NULL; /* texture index stays NULL when there is no texture deref */
+       if (texture_deref_instr) {
+               texture_dynamic_index = get_sampler_desc_index(ctx, texture_deref_instr, &instr->instr, false);
+               texture_dynamic_index = enter_waterfall(ctx, wctx + 0, texture_dynamic_index, instr->texture_non_uniform); /* from here on use the value returned by enter_waterfall */
+       }
+
+       if (sampler_deref_instr && sampler_deref_instr != texture_deref_instr) {
+               sampler_dynamic_index = get_sampler_desc_index(ctx, sampler_deref_instr, &instr->instr, false);
+               sampler_dynamic_index = enter_waterfall(ctx, wctx + 1, sampler_dynamic_index, instr->sampler_non_uniform); /* distinct sampler deref: handled separately through wctx[1] */
+       } else
+               sampler_dynamic_index = texture_dynamic_index; /* sampler shares the texture deref: reuse its index, wctx[1] is not entered */
+
        enum ac_descriptor_type main_descriptor = instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE;
 
        if (plane >= 0) {
@@ -4106,16 +4314,19 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx,
                main_descriptor = AC_DESC_FMASK;
        }
 
-       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr, false, false);
+       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr,
+                                   texture_dynamic_index, false, false); /* index obtained above is passed in explicitly */
 
        if (samp_ptr) {
-               *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr, false, false);
+               *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr,
+                                            sampler_dynamic_index, false, false);
                if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT)
                        *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
        }
        if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
                          instr->op == nir_texop_samples_identical))
-               *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, &instr->instr, false, false);
+               *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK,
+                                             &instr->instr, texture_dynamic_index, false, false); /* FMASK descriptor uses the texture's deref and index */
 }
 
 static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx,
@@ -4134,8 +4345,9 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
        LLVMValueRef fmask_ptr = NULL, sample_index = NULL;
        LLVMValueRef ddx = NULL, ddy = NULL;
        unsigned offset_src = 0;
+       struct waterfall_context wctx[2] = {{{0}}};
 
-       tex_fetch_ptrs(ctx, instr, &args.resource, &args.sampler, &fmask_ptr);
+       tex_fetch_ptrs(ctx, instr, wctx, &args.resource, &args.sampler, &fmask_ptr);
 
        for (unsigned i = 0; i < instr->num_srcs; i++) {
                switch (instr->src[i].src_type) {
@@ -4450,11 +4662,15 @@ write_result:
        if (result) {
                assert(instr->dest.is_ssa);
                result = ac_to_integer(&ctx->ac, result);
+
+               for (int i = ARRAY_SIZE(wctx); --i >= 0;) {
+                       result =  exit_waterfall(ctx, wctx + i, result);
+               }
+
                ctx->ssa_defs[instr->dest.ssa.index] = result;
        }
 }
 
-
 static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr)
 {
        LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa);
index 2505452211900f9d675fdcb8a73ff574aff33364..da83aa562bb600bfb5cfd94ac847c1acf063182a 100644 (file)
@@ -2824,13 +2824,13 @@ void radv_create_shaders(struct radv_pipeline *pipeline,
 
        for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
                if (nir[i]) {
-                       NIR_PASS_V(nir[i], nir_lower_non_uniform_access,
-                                          nir_lower_non_uniform_ubo_access |
-                                          nir_lower_non_uniform_ssbo_access |
-                                          nir_lower_non_uniform_texture_access |
-                                          nir_lower_non_uniform_image_access);
-
-                       if (!use_aco)
+                       if (use_aco) {
+                               NIR_PASS_V(nir[i], nir_lower_non_uniform_access,
+                                          nir_lower_non_uniform_ubo_access |
+                                          nir_lower_non_uniform_ssbo_access |
+                                          nir_lower_non_uniform_texture_access |
+                                          nir_lower_non_uniform_image_access);
+                       } else
                                NIR_PASS_V(nir[i], nir_lower_bool_to_int32);
                }
        }