X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcommon%2Fac_llvm_build.c;h=5640a23b8aa32a0db52e0a2eb360a97bc7f39ef9;hb=5060c51b6f4dfb0d5358bde6523285163d3faaad;hp=8996159bf221e543caf4536a606a81ea36e6a6cf;hpb=7e1faa79d31b44e3b3d7a4dc22d9b136300a761e;p=mesa.git diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 8996159bf22..5640a23b8aa 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -33,20 +33,26 @@ #include #include "ac_llvm_util.h" - +#include "ac_exp_param.h" #include "util/bitscan.h" #include "util/macros.h" +#include "util/u_atomic.h" #include "sid.h" +#include "shader_enums.h" + /* Initialize module-independent parts of the context. * * The caller is responsible for initializing ctx::module and ctx::builder. */ void -ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context) +ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context, + enum chip_class chip_class) { LLVMValueRef args[1]; + ctx->chip_class = chip_class; + ctx->context = context; ctx->module = NULL; ctx->builder = NULL; @@ -54,11 +60,26 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context) ctx->voidt = LLVMVoidTypeInContext(ctx->context); ctx->i1 = LLVMInt1TypeInContext(ctx->context); ctx->i8 = LLVMInt8TypeInContext(ctx->context); + ctx->i16 = LLVMIntTypeInContext(ctx->context, 16); ctx->i32 = LLVMIntTypeInContext(ctx->context, 32); + ctx->i64 = LLVMIntTypeInContext(ctx->context, 64); + ctx->f16 = LLVMHalfTypeInContext(ctx->context); ctx->f32 = LLVMFloatTypeInContext(ctx->context); + ctx->f64 = LLVMDoubleTypeInContext(ctx->context); + ctx->v2i32 = LLVMVectorType(ctx->i32, 2); + ctx->v3i32 = LLVMVectorType(ctx->i32, 3); ctx->v4i32 = LLVMVectorType(ctx->i32, 4); + ctx->v2f32 = LLVMVectorType(ctx->f32, 2); ctx->v4f32 = LLVMVectorType(ctx->f32, 4); - ctx->v16i8 = LLVMVectorType(ctx->i8, 16); + ctx->v8i32 = LLVMVectorType(ctx->i32, 8); + + ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false); + ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false); + ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0); + ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0); + + ctx->i1false = LLVMConstInt(ctx->i1, 0, false); + ctx->i1true = LLVMConstInt(ctx->i1, 1, false); ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5); @@ -77,6 +98,92 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context) ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0); } +unsigned +ac_get_type_size(LLVMTypeRef type) +{ + LLVMTypeKind kind = LLVMGetTypeKind(type); + + switch (kind) { + case LLVMIntegerTypeKind: + return LLVMGetIntTypeWidth(type) / 8; + case LLVMFloatTypeKind: + return 4; + case LLVMDoubleTypeKind: + case LLVMPointerTypeKind: + return 8; + case LLVMVectorTypeKind: + return LLVMGetVectorSize(type) * + ac_get_type_size(LLVMGetElementType(type)); + case LLVMArrayTypeKind: + return LLVMGetArrayLength(type) * + ac_get_type_size(LLVMGetElementType(type)); + default: + assert(0); + return 0; + } +} + +static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (t == ctx->f16 || t == ctx->i16) + return ctx->i16; + else if (t == ctx->f32 || t == ctx->i32) + return ctx->i32; + else if (t == ctx->f64 || t == ctx->i64) + return ctx->i64; + else + unreachable("Unhandled integer size"); +} + +LLVMTypeRef +ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { + LLVMTypeRef elem_type = LLVMGetElementType(t); + return 
LLVMVectorType(to_integer_type_scalar(ctx, elem_type), + LLVMGetVectorSize(t)); + } + return to_integer_type_scalar(ctx, t); +} + +LLVMValueRef +ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v) +{ + LLVMTypeRef type = LLVMTypeOf(v); + return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), ""); +} + +static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (t == ctx->i16 || t == ctx->f16) + return ctx->f16; + else if (t == ctx->i32 || t == ctx->f32) + return ctx->f32; + else if (t == ctx->i64 || t == ctx->f64) + return ctx->f64; + else + unreachable("Unhandled float size"); +} + +LLVMTypeRef +ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { + LLVMTypeRef elem_type = LLVMGetElementType(t); + return LLVMVectorType(to_float_type_scalar(ctx, elem_type), + LLVMGetVectorSize(t)); + } + return to_float_type_scalar(ctx, t); +} + +LLVMValueRef +ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v) +{ + LLVMTypeRef type = LLVMTypeOf(v); + return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), ""); +} + + LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, LLVMTypeRef return_type, LLVMValueRef *params, @@ -114,20 +221,6 @@ ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, return call; } -static LLVMValueRef bitcast_to_float(struct ac_llvm_context *ctx, - LLVMValueRef value) -{ - LLVMTypeRef type = LLVMTypeOf(value); - LLVMTypeRef new_type; - - if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) - new_type = LLVMVectorType(ctx->f32, LLVMGetVectorSize(type)); - else - new_type = ctx->f32; - - return LLVMBuildBitCast(ctx->builder, value, new_type, ""); -} - /** * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with * intrinsic names). @@ -165,18 +258,131 @@ void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize) } } +/** + * Helper function that builds an LLVM IR PHI node and immediately adds + * incoming edges. + */ +LLVMValueRef +ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, + unsigned count_incoming, LLVMValueRef *values, + LLVMBasicBlockRef *blocks) +{ + LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, ""); + LLVMAddIncoming(phi, values, blocks, count_incoming); + return phi; +} + +/* Prevent optimizations (at least of memory accesses) across the current + * point in the program by emitting empty inline assembly that is marked as + * having side effects. + * + * Optionally, a value can be passed through the inline assembly to prevent + * LLVM from hoisting calls to ReadNone functions. 
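+ *
+ * A minimal usage sketch (consume() here is just a placeholder for the
+ * code that uses the value; it is not a real helper):
+ *
+ *    LLVMValueRef v = ...;
+ *    ac_build_optimization_barrier(ctx, &v);
+ *    consume(v);  // memory accesses are not reordered across the barrier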
+ */ +void +ac_build_optimization_barrier(struct ac_llvm_context *ctx, + LLVMValueRef *pvgpr) +{ + static int counter = 0; + + LLVMBuilderRef builder = ctx->builder; + char code[16]; + + snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter)); + + if (!pvgpr) { + LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false); + LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false); + LLVMBuildCall(builder, inlineasm, NULL, 0, ""); + } else { + LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false); + LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false); + LLVMValueRef vgpr = *pvgpr; + LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr); + unsigned vgpr_size = ac_get_type_size(vgpr_type); + LLVMValueRef vgpr0; + + assert(vgpr_size % 4 == 0); + + vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), ""); + vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, ""); + vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, ""); + vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, ""); + vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, ""); + + *pvgpr = vgpr; + } +} + +LLVMValueRef +ac_build_ballot(struct ac_llvm_context *ctx, + LLVMValueRef value) +{ + LLVMValueRef args[3] = { + value, + ctx->i32_0, + LLVMConstInt(ctx->i32, LLVMIntNE, 0) + }; + + /* We currently have no other way to prevent LLVM from lifting the icmp + * calls to a dominating basic block. + */ + ac_build_optimization_barrier(ctx, &args[0]); + + if (LLVMTypeOf(args[0]) != ctx->i32) + args[0] = LLVMBuildBitCast(ctx->builder, args[0], ctx->i32, ""); + + return ac_build_intrinsic(ctx, + "llvm.amdgcn.icmp.i32", + ctx->i64, args, 3, + AC_FUNC_ATTR_NOUNWIND | + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); +} + +LLVMValueRef +ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); + LLVMValueRef vote_set = ac_build_ballot(ctx, value); + return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, ""); +} + +LLVMValueRef +ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + LLVMValueRef vote_set = ac_build_ballot(ctx, value); + return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, + LLVMConstInt(ctx->i64, 0, 0), ""); +} + +LLVMValueRef +ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value) +{ + LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); + LLVMValueRef vote_set = ac_build_ballot(ctx, value); + + LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + vote_set, active_set, ""); + LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + vote_set, + LLVMConstInt(ctx->i64, 0, 0), ""); + return LLVMBuildOr(ctx->builder, all, none, ""); +} + LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values, unsigned value_count, unsigned value_stride, - bool load) + bool load, + bool always_vector) { LLVMBuilderRef builder = ctx->builder; LLVMValueRef vec = NULL; unsigned i; - if (value_count == 1) { + if (value_count == 1 && !always_vector) { if (load) return LLVMBuildLoad(builder, values[0], ""); return values[0]; @@ -201,7 +407,7 @@ ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, unsigned value_count) { - return ac_build_gather_values_extended(ctx, values, value_count, 1, false); + return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false); } LLVMValueRef @@ -231,42 +437,16 @@ build_cube_intrinsic(struct 
ac_llvm_context *ctx, LLVMValueRef in[3], struct cube_selection_coords *out) { - LLVMBuilderRef builder = ctx->builder; - - if (HAVE_LLVM >= 0x0309) { - LLVMTypeRef f32 = ctx->f32; - - out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", - f32, in, 3, AC_FUNC_ATTR_READNONE); - out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", - f32, in, 3, AC_FUNC_ATTR_READNONE); - out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", - f32, in, 3, AC_FUNC_ATTR_READNONE); - out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", - f32, in, 3, AC_FUNC_ATTR_READNONE); - } else { - LLVMValueRef c[4] = { - in[0], - in[1], - in[2], - LLVMGetUndef(LLVMTypeOf(in[0])) - }; - LLVMValueRef vec = ac_build_gather_values(ctx, c, 4); - - LLVMValueRef tmp = - ac_build_intrinsic(ctx, "llvm.AMDGPU.cube", - LLVMTypeOf(vec), &vec, 1, - AC_FUNC_ATTR_READNONE); - - out->stc[1] = LLVMBuildExtractElement(builder, tmp, - LLVMConstInt(ctx->i32, 0, 0), ""); - out->stc[0] = LLVMBuildExtractElement(builder, tmp, - LLVMConstInt(ctx->i32, 1, 0), ""); - out->ma = LLVMBuildExtractElement(builder, tmp, - LLVMConstInt(ctx->i32, 2, 0), ""); - out->id = LLVMBuildExtractElement(builder, tmp, - LLVMConstInt(ctx->i32, 3, 0), ""); - } + LLVMTypeRef f32 = ctx->f32; + + out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", + f32, in, 3, AC_FUNC_ATTR_READNONE); + out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", + f32, in, 3, AC_FUNC_ATTR_READNONE); + out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", + f32, in, 3, AC_FUNC_ATTR_READNONE); + out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", + f32, in, 3, AC_FUNC_ATTR_READNONE); } /** @@ -278,12 +458,13 @@ build_cube_intrinsic(struct ac_llvm_context *ctx, * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards * the selcoords major axis. 
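 *
 * For reference, the selection below follows the cube map face table of the
 * GL spec, with rx/ry/rz the components of the incoming direction:
 *
 *    major axis    sc    tc    ma
 *    +rx          -rz   -ry    rx
 *    -rx          +rz   -ry    rx
 *    +ry          +rx   +rz    ry
 *    -ry          +rx   -rz    ry
 *    +rz          +rx   -ry    rz
 *    -rz          -rx   -ry    rz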
 */
-static void build_cube_select(LLVMBuilderRef builder,
+static void build_cube_select(struct ac_llvm_context *ctx,
 			      const struct cube_selection_coords *selcoords,
 			      const LLVMValueRef *coords,
 			      LLVMValueRef *out_st,
 			      LLVMValueRef *out_ma)
 {
+	LLVMBuilderRef builder = ctx->builder;
 	LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
 	LLVMValueRef is_ma_positive;
 	LLVMValueRef sgn_ma;
@@ -305,29 +486,29 @@ static void build_cube_select(LLVMBuilderRef builder,
 	is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
 
 	/* Select sc */
-	tmp = LLVMBuildSelect(builder, is_ma_z, coords[2], coords[0], "");
+	tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
 	sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
-		LLVMBuildSelect(builder, is_ma_x, sgn_ma,
+		LLVMBuildSelect(builder, is_ma_z, sgn_ma,
 			LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
 	out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
 
 	/* Select tc */
 	tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
-	sgn = LLVMBuildSelect(builder, is_ma_y, LLVMBuildFNeg(builder, sgn_ma, ""),
+	sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
 		LLVMConstReal(f32, -1.0), "");
 	out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
 
 	/* Select ma */
 	tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
 		LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
-	sgn = LLVMBuildSelect(builder, is_ma_positive,
-		LLVMConstReal(f32, 2.0), LLVMConstReal(f32, -2.0), "");
-	*out_ma = LLVMBuildFMul(builder, tmp, sgn, "");
+	tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
+				 ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
+	*out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
 }
 
 void ac_prepare_cube_coords(struct ac_llvm_context *ctx,
-		       bool is_deriv, bool is_array,
+		       bool is_deriv, bool is_array, bool is_lod,
 		       LLVMValueRef *coords_arg,
 		       LLVMValueRef *derivs_arg)
 {
@@ -337,6 +518,38 @@ ac_prepare_cube_coords(struct ac_llvm_context *ctx,
 	LLVMValueRef coords[3];
 	LLVMValueRef invma;
 
+	if (is_array && !is_lod) {
+		LLVMValueRef tmp = coords_arg[3];
+		tmp = ac_build_intrinsic(ctx, "llvm.rint.f32", ctx->f32, &tmp, 1, 0);
+
+		/* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
+		 *
+		 *    "For Array forms, the array layer used will be
+		 *
+		 *       max(0, min(d−1, floor(layer+0.5)))
+		 *
+		 *     where d is the depth of the texture array and layer
+		 *     comes from the component indicated in the tables below."
+		 *
+		 * This rounding works around an issue where the layer is
+		 * taken from a helper invocation which happens to fall on a
+		 * different layer due to extrapolation.
+		 *
+		 * VI and earlier attempt to implement this in hardware by
+		 * clamping the value of coords[2] = (8 * layer) + face.
+		 * Unfortunately, this means that we end up with the wrong
+		 * face when clamping occurs.
+		 *
+		 * Clamp the layer earlier to work around the issue.
+		 */
+		if (ctx->chip_class <= VI) {
+			LLVMValueRef ge0;
+			ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
+			tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
+		}
+
+		coords_arg[3] = tmp;
+	}
+
 	build_cube_intrinsic(ctx, coords_arg, &selcoords);
 
 	invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
@@ -378,7 +591,7 @@ ac_prepare_cube_coords(struct ac_llvm_context *ctx,
 	 * seems awfully quiet about how textureGrad for cube
 	 * maps should be handled.
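	 *
	 * For what it's worth, the code below amounts to the quotient rule:
	 * with s = sc/|ma|, the chain rule gives
	 *
	 *    d(s) = (d(sc) - s * d(|ma|)) / |ma|
	 *
	 * which is what the multiplications by invma implement.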
 	 */
-			build_cube_select(builder, &selcoords, &derivs_arg[axis * 3],
+			build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
 					  deriv_st, &deriv_ma);
 
 			deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
@@ -510,32 +723,40 @@ ac_build_indexed_store(struct ac_llvm_context *ctx,
  * \param base_ptr Where the array starts.
  * \param index The element index into the array.
  * \param uniform Whether the base_ptr and index can be assumed to be
- *                dynamically uniform
+ *                dynamically uniform (i.e. load to an SGPR)
+ * \param invariant Whether the load is invariant (no other opcodes affect it)
  */
-LLVMValueRef
-ac_build_indexed_load(struct ac_llvm_context *ctx,
-		      LLVMValueRef base_ptr, LLVMValueRef index,
-		      bool uniform)
+static LLVMValueRef
+ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
+		     LLVMValueRef index, bool uniform, bool invariant)
 {
-	LLVMValueRef pointer;
+	LLVMValueRef pointer, result;
 
 	pointer = ac_build_gep0(ctx, base_ptr, index);
 	if (uniform)
 		LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
-	return LLVMBuildLoad(ctx->builder, pointer, "");
+	result = LLVMBuildLoad(ctx->builder, pointer, "");
+	if (invariant)
+		LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
+	return result;
 }
 
-/**
- * Do a load from &base_ptr[index], but also add a flag that it's loading
- * a constant from a dynamically uniform index.
- */
-LLVMValueRef
-ac_build_indexed_load_const(struct ac_llvm_context *ctx,
-			    LLVMValueRef base_ptr, LLVMValueRef index)
+LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
+			   LLVMValueRef index)
 {
-	LLVMValueRef result = ac_build_indexed_load(ctx, base_ptr, index, true);
-	LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
-	return result;
+	return ac_build_load_custom(ctx, base_ptr, index, false, false);
+}
+
+LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
+				     LLVMValueRef base_ptr, LLVMValueRef index)
+{
+	return ac_build_load_custom(ctx, base_ptr, index, false, true);
+}
+
+LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
+				   LLVMValueRef base_ptr, LLVMValueRef index)
+{
+	return ac_build_load_custom(ctx, base_ptr, index, true, true);
 }
 
 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
@@ -553,10 +774,13 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
 			    bool glc,
 			    bool slc,
 			    bool writeonly_memory,
-			    bool has_add_tid)
+			    bool swizzle_enable_hint)
 {
-	/* TODO: Fix stores with ADD_TID and remove the "has_add_tid" flag. */
-	if (HAVE_LLVM >= 0x0309 && !has_add_tid) {
+	/* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
+	 * (voffset is swizzled, but soffset isn't swizzled).
+	 * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
+	 */
+	if (!swizzle_enable_hint) {
 		/* Split 3 channel stores, because LLVM doesn't support 3-channel
 		 * intrinsics.
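		 *
		 * The XYZ case therefore becomes an XY store of v[0..1] plus
		 * an X store of v[2]; the scalar half lands two dwords later,
		 * hence the inst_offset + 8 bytes on the second call.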
*/ if (num_channels == 3) { @@ -570,11 +794,11 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx, ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, soffset, inst_offset, glc, slc, - writeonly_memory, has_add_tid); + writeonly_memory, swizzle_enable_hint); ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8, glc, slc, - writeonly_memory, has_add_tid); + writeonly_memory, swizzle_enable_hint); return; } @@ -590,7 +814,7 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx, offset = LLVMBuildAdd(ctx->builder, offset, voffset, ""); LLVMValueRef args[] = { - bitcast_to_float(ctx, vdata), + ac_to_float(ctx, vdata), LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""), LLVMConstInt(ctx->i32, 0, 0), offset, @@ -657,114 +881,89 @@ ac_build_buffer_load(struct ac_llvm_context *ctx, unsigned inst_offset, unsigned glc, unsigned slc, - bool readonly_memory) + bool can_speculate, + bool allow_smem) { - unsigned func = CLAMP(num_channels, 1, 3) - 1; - - if (HAVE_LLVM >= 0x309) { - LLVMValueRef args[] = { - LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""), - vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0), - LLVMConstInt(ctx->i32, inst_offset, 0), - LLVMConstInt(ctx->i1, glc, 0), - LLVMConstInt(ctx->i1, slc, 0) - }; - - LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2), - ctx->v4f32}; - const char *type_names[] = {"f32", "v2f32", "v4f32"}; - char name[256]; - - if (voffset) { - args[2] = LLVMBuildAdd(ctx->builder, args[2], voffset, - ""); - } - - if (soffset) { - args[2] = LLVMBuildAdd(ctx->builder, args[2], soffset, - ""); + LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0); + if (voffset) + offset = LLVMBuildAdd(ctx->builder, offset, voffset, ""); + if (soffset) + offset = LLVMBuildAdd(ctx->builder, offset, soffset, ""); + + /* TODO: VI and later generations can use SMEM with GLC=1.*/ + if (allow_smem && !glc && !slc) { + assert(vindex == NULL); + + LLVMValueRef result[4]; + + for (int i = 0; i < num_channels; i++) { + if (i) { + offset = LLVMBuildAdd(ctx->builder, offset, + LLVMConstInt(ctx->i32, 4, 0), ""); + } + LLVMValueRef args[2] = {rsrc, offset}; + result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32", + ctx->f32, args, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_LEGACY); } + if (num_channels == 1) + return result[0]; - snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s", - type_names[func]); - - return ac_build_intrinsic(ctx, name, types[func], args, - ARRAY_SIZE(args), - /* READNONE means writes can't - * affect it, while READONLY means - * that writes can affect it. */ - readonly_memory && HAVE_LLVM >= 0x0400 ? - AC_FUNC_ATTR_READNONE : - AC_FUNC_ATTR_READONLY); - } else { - LLVMValueRef args[] = { - LLVMBuildBitCast(ctx->builder, rsrc, ctx->v16i8, ""), - voffset ? voffset : vindex, - soffset, - LLVMConstInt(ctx->i32, inst_offset, 0), - LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen - LLVMConstInt(ctx->i32, vindex ? 
1 : 0, 0), //idxen - LLVMConstInt(ctx->i32, glc, 0), - LLVMConstInt(ctx->i32, slc, 0), - LLVMConstInt(ctx->i32, 0, 0), // TFE - }; + if (num_channels == 3) + result[num_channels++] = LLVMGetUndef(ctx->f32); + return ac_build_gather_values(ctx, result, num_channels); + } - LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2), - ctx->v4i32}; - const char *type_names[] = {"i32", "v2i32", "v4i32"}; - const char *arg_type = "i32"; - char name[256]; + unsigned func = CLAMP(num_channels, 1, 3) - 1; - if (voffset && vindex) { - LLVMValueRef vaddr[] = {vindex, voffset}; + LLVMValueRef args[] = { + LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""), + vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0), + offset, + LLVMConstInt(ctx->i1, glc, 0), + LLVMConstInt(ctx->i1, slc, 0) + }; - arg_type = "v2i32"; - args[1] = ac_build_gather_values(ctx, vaddr, 2); - } + LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2), + ctx->v4f32}; + const char *type_names[] = {"f32", "v2f32", "v4f32"}; + char name[256]; - snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s", - type_names[func], arg_type); + snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s", + type_names[func]); - return ac_build_intrinsic(ctx, name, types[func], args, - ARRAY_SIZE(args), AC_FUNC_ATTR_READONLY); - } + return ac_build_intrinsic(ctx, name, types[func], args, + ARRAY_SIZE(args), + /* READNONE means writes can't affect it, while + * READONLY means that writes can affect it. */ + can_speculate && HAVE_LLVM >= 0x0400 ? + AC_FUNC_ATTR_READNONE : + AC_FUNC_ATTR_READONLY); } LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vindex, LLVMValueRef voffset, - bool readonly_memory) + bool can_speculate) { - if (HAVE_LLVM >= 0x0309) { - LLVMValueRef args [] = { - LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""), - vindex, - voffset, - LLVMConstInt(ctx->i1, 0, 0), /* glc */ - LLVMConstInt(ctx->i1, 0, 0), /* slc */ - }; - - return ac_build_intrinsic(ctx, - "llvm.amdgcn.buffer.load.format.v4f32", - ctx->v4f32, args, ARRAY_SIZE(args), - /* READNONE means writes can't - * affect it, while READONLY means - * that writes can affect it. */ - readonly_memory && HAVE_LLVM >= 0x0400 ? - AC_FUNC_ATTR_READNONE : - AC_FUNC_ATTR_READONLY); - } - - LLVMValueRef args[] = { - rsrc, - voffset, + LLVMValueRef args [] = { + LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""), vindex, + voffset, + ctx->i1false, /* glc */ + ctx->i1false, /* slc */ }; - return ac_build_intrinsic(ctx, "llvm.SI.vs.load.input", - ctx->v4f32, args, 3, - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_LEGACY); + + return ac_build_intrinsic(ctx, + "llvm.amdgcn.buffer.load.format.v4f32", + ctx->v4f32, args, ARRAY_SIZE(args), + /* READNONE means writes can't affect it, while + * READONLY means that writes can affect it. */ + can_speculate && HAVE_LLVM >= 0x0400 ? 
+ AC_FUNC_ATTR_READNONE : + AC_FUNC_ATTR_READONLY); } /** @@ -832,24 +1031,23 @@ ac_get_thread_id(struct ac_llvm_context *ctx) */ LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, - bool has_ds_bpermute, uint32_t mask, int idx, - LLVMValueRef lds, LLVMValueRef val) { - LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2]; + LLVMValueRef tl, trbl, args[2]; LLVMValueRef result; - thread_id = ac_get_thread_id(ctx); + if (ctx->chip_class >= VI) { + LLVMValueRef thread_id, tl_tid, trbl_tid; + thread_id = ac_get_thread_id(ctx); - tl_tid = LLVMBuildAnd(ctx->builder, thread_id, - LLVMConstInt(ctx->i32, mask, false), ""); + tl_tid = LLVMBuildAnd(ctx->builder, thread_id, + LLVMConstInt(ctx->i32, mask, false), ""); - trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid, - LLVMConstInt(ctx->i32, idx, false), ""); + trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid, + LLVMConstInt(ctx->i32, idx, false), ""); - if (has_ds_bpermute) { args[0] = LLVMBuildMul(ctx->builder, tl_tid, LLVMConstInt(ctx->i32, 4, false), ""); args[1] = val; @@ -867,15 +1065,44 @@ ac_build_ddxy(struct ac_llvm_context *ctx, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); } else { - LLVMValueRef store_ptr, load_ptr0, load_ptr1; + uint32_t masks[2] = {}; - store_ptr = ac_build_gep0(ctx, lds, thread_id); - load_ptr0 = ac_build_gep0(ctx, lds, tl_tid); - load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid); + switch (mask) { + case AC_TID_MASK_TOP_LEFT: + masks[0] = 0x8000; + if (idx == 1) + masks[1] = 0x8055; + else + masks[1] = 0x80aa; - LLVMBuildStore(ctx->builder, val, store_ptr); - tl = LLVMBuildLoad(ctx->builder, load_ptr0, ""); - trbl = LLVMBuildLoad(ctx->builder, load_ptr1, ""); + break; + case AC_TID_MASK_TOP: + masks[0] = 0x8044; + masks[1] = 0x80ee; + break; + case AC_TID_MASK_LEFT: + masks[0] = 0x80a0; + masks[1] = 0x80f5; + break; + default: + assert(0); + } + + args[0] = val; + args[1] = LLVMConstInt(ctx->i32, masks[0], false); + + tl = ac_build_intrinsic(ctx, + "llvm.amdgcn.ds.swizzle", ctx->i32, + args, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); + + args[1] = LLVMConstInt(ctx->i32, masks[1], false); + trbl = ac_build_intrinsic(ctx, + "llvm.amdgcn.ds.swizzle", ctx->i32, + args, 2, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); } tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, ""); @@ -929,7 +1156,7 @@ ac_build_umsb(struct ac_llvm_context *ctx, { LLVMValueRef args[2] = { arg, - LLVMConstInt(ctx->i1, 1, 0), + ctx->i1true, }; LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.ctlz.i32", dst_type, args, ARRAY_SIZE(args), @@ -947,6 +1174,13 @@ ac_build_umsb(struct ac_llvm_context *ctx, LLVMConstInt(ctx->i32, -1, true), msb, ""); } +LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, + LLVMValueRef b) +{ + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); +} + LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value) { if (HAVE_LLVM >= 0x0500) { @@ -1029,7 +1263,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, LLVMTypeRef dst_type; LLVMValueRef args[11]; unsigned num_args = 0; - const char *name; + const char *name = NULL; char intr_name[128], type[64]; if (HAVE_LLVM >= 0x0400) { @@ -1038,7 +1272,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, a->opcode == ac_image_get_lod; if (sample) - args[num_args++] = bitcast_to_float(ctx, a->addr); + args[num_args++] = ac_to_float(ctx, a->addr); else args[num_args++] = a->addr; @@ -1048,9 +1282,9 @@ LLVMValueRef 
ac_build_image_opcode(struct ac_llvm_context *ctx, args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0); if (sample) args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0); - args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* glc */ - args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* slc */ - args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* lwe */ + args[num_args++] = ctx->i1false; /* glc */ + args[num_args++] = ctx->i1false; /* slc */ + args[num_args++] = ctx->i1false; /* lwe */ args[num_args++] = LLVMConstInt(ctx->i1, a->da, 0); switch (a->opcode) { @@ -1072,6 +1306,8 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, case ac_image_get_resinfo: name = "llvm.amdgcn.image.getresinfo"; break; + default: + unreachable("invalid image opcode"); } ac_build_type_name_for_intr(LLVMTypeOf(args[0]), type, @@ -1175,20 +1411,26 @@ LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, AC_FUNC_ATTR_LEGACY); } -/** - * KILL, AKA discard in GLSL. - * - * \param value kill if value < 0.0 or value == NULL. - */ -void ac_build_kill(struct ac_llvm_context *ctx, LLVMValueRef value) +LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1) { - if (value) { - ac_build_intrinsic(ctx, "llvm.AMDGPU.kill", ctx->voidt, - &value, 1, AC_FUNC_ATTR_LEGACY); - } else { - ac_build_intrinsic(ctx, "llvm.AMDGPU.kilp", ctx->voidt, - NULL, 0, AC_FUNC_ATTR_LEGACY); + assert(HAVE_LLVM >= 0x0600); + return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, + &i1, 1, AC_FUNC_ATTR_READNONE); +} + +void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1) +{ + if (HAVE_LLVM >= 0x0600) { + ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, + &i1, 1, 0); + return; } + + LLVMValueRef value = LLVMBuildSelect(ctx->builder, i1, + LLVMConstReal(ctx->f32, 1), + LLVMConstReal(ctx->f32, -1), ""); + ac_build_intrinsic(ctx, "llvm.AMDGPU.kill", ctx->voidt, + &value, 1, AC_FUNC_ATTR_LEGACY); } LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, @@ -1216,3 +1458,353 @@ LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_LEGACY); } + +void ac_get_image_intr_name(const char *base_name, + LLVMTypeRef data_type, + LLVMTypeRef coords_type, + LLVMTypeRef rsrc_type, + char *out_name, unsigned out_len) +{ + char coords_type_name[8]; + + ac_build_type_name_for_intr(coords_type, coords_type_name, + sizeof(coords_type_name)); + + if (HAVE_LLVM <= 0x0309) { + snprintf(out_name, out_len, "%s.%s", base_name, coords_type_name); + } else { + char data_type_name[8]; + char rsrc_type_name[8]; + + ac_build_type_name_for_intr(data_type, data_type_name, + sizeof(data_type_name)); + ac_build_type_name_for_intr(rsrc_type, rsrc_type_name, + sizeof(rsrc_type_name)); + snprintf(out_name, out_len, "%s.%s.%s.%s", base_name, + data_type_name, coords_type_name, rsrc_type_name); + } +} + +#define AC_EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3) +#define AC_EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5) + +enum ac_ir_type { + AC_IR_UNDEF, + AC_IR_CONST, + AC_IR_VALUE, +}; + +struct ac_vs_exp_chan +{ + LLVMValueRef value; + float const_float; + enum ac_ir_type type; +}; + +struct ac_vs_exp_inst { + unsigned offset; + LLVMValueRef inst; + struct ac_vs_exp_chan chan[4]; +}; + +struct ac_vs_exports { + unsigned num; + struct ac_vs_exp_inst exp[VARYING_SLOT_MAX]; +}; + +/* Return true if the PARAM export has been eliminated. 
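+ *
+ * DEFAULT_VAL can encode only four constant vectors:
+ *
+ *    0 -> (0,0,0,0)   1 -> (0,0,0,1)   2 -> (1,1,1,0)   3 -> (1,1,1,1)
+ *
+ * which is why only those combinations of 0 and 1 channels qualify.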
 */
+static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
+				      uint32_t num_outputs,
+				      struct ac_vs_exp_inst *exp)
+{
+	unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
+	bool is_zero[4] = {}, is_one[4] = {};
+
+	for (i = 0; i < 4; i++) {
+		/* It's a constant expression. Undef outputs are eliminated too. */
+		if (exp->chan[i].type == AC_IR_UNDEF) {
+			is_zero[i] = true;
+			is_one[i] = true;
+		} else if (exp->chan[i].type == AC_IR_CONST) {
+			if (exp->chan[i].const_float == 0)
+				is_zero[i] = true;
+			else if (exp->chan[i].const_float == 1)
+				is_one[i] = true;
+			else
+				return false; /* other constant */
+		} else
+			return false;
+	}
+
+	/* Only certain combinations of 0 and 1 can be eliminated. */
+	if (is_zero[0] && is_zero[1] && is_zero[2])
+		default_val = is_zero[3] ? 0 : 1;
+	else if (is_one[0] && is_one[1] && is_one[2])
+		default_val = is_zero[3] ? 2 : 3;
+	else
+		return false;
+
+	/* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
+	LLVMInstructionEraseFromParent(exp->inst);
+
+	/* Change OFFSET to DEFAULT_VAL. */
+	for (i = 0; i < num_outputs; i++) {
+		if (vs_output_param_offset[i] == exp->offset) {
+			vs_output_param_offset[i] =
+				AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
+			break;
+		}
+	}
+	return true;
+}
+
+static bool ac_eliminate_duplicated_output(uint8_t *vs_output_param_offset,
+					   uint32_t num_outputs,
+					   struct ac_vs_exports *processed,
+					   struct ac_vs_exp_inst *exp)
+{
+	unsigned p, copy_back_channels = 0;
+
+	/* See if the output is already in the list of processed outputs.
+	 * The LLVMValueRef comparison relies on SSA.
+	 */
+	for (p = 0; p < processed->num; p++) {
+		bool different = false;
+
+		for (unsigned j = 0; j < 4; j++) {
+			struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
+			struct ac_vs_exp_chan *c2 = &exp->chan[j];
+
+			/* Treat undef as a match. */
+			if (c2->type == AC_IR_UNDEF)
+				continue;
+
+			/* If c1 is undef but c2 isn't, we can copy c2 to c1
+			 * and consider the instruction duplicated.
+			 */
+			if (c1->type == AC_IR_UNDEF) {
+				copy_back_channels |= 1 << j;
+				continue;
+			}
+
+			/* Test whether the channels are not equal. */
+			if (c1->type != c2->type ||
+			    (c1->type == AC_IR_CONST &&
+			     c1->const_float != c2->const_float) ||
+			    (c1->type == AC_IR_VALUE &&
+			     c1->value != c2->value)) {
+				different = true;
+				break;
+			}
+		}
+		if (!different)
+			break;
+
+		copy_back_channels = 0;
+	}
+	if (p == processed->num)
+		return false;
+
+	/* If a match was found, but the matching export has undef where the new
+	 * one has a normal value, copy the normal value to the undef channel.
+	 */
+	struct ac_vs_exp_inst *match = &processed->exp[p];
+
+	while (copy_back_channels) {
+		unsigned chan = u_bit_scan(&copy_back_channels);
+
+		assert(match->chan[chan].type == AC_IR_UNDEF);
+		LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
+			       exp->chan[chan].value);
+		match->chan[chan] = exp->chan[chan];
+	}
+
+	/* The PARAM export is duplicated. Kill it. */
+	LLVMInstructionEraseFromParent(exp->inst);
+
+	/* Change OFFSET to the matching export. */
+	for (unsigned i = 0; i < num_outputs; i++) {
+		if (vs_output_param_offset[i] == exp->offset) {
+			vs_output_param_offset[i] = match->offset;
+			break;
+		}
+	}
+	return true;
+}
+
+void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
+			    LLVMValueRef main_fn,
+			    uint8_t *vs_output_param_offset,
+			    uint32_t num_outputs,
+			    uint8_t *num_param_exports)
+{
+	LLVMBasicBlockRef bb;
+	bool removed_any = false;
+	struct ac_vs_exports exports;
+
+	exports.num = 0;
+
+	/* Process all LLVM instructions.
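+	 * The next instruction is fetched before the current one is
+	 * inspected, because a matched export may be erased from its basic
+	 * block while we walk.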
*/ + bb = LLVMGetFirstBasicBlock(main_fn); + while (bb) { + LLVMValueRef inst = LLVMGetFirstInstruction(bb); + + while (inst) { + LLVMValueRef cur = inst; + inst = LLVMGetNextInstruction(inst); + struct ac_vs_exp_inst exp; + + if (LLVMGetInstructionOpcode(cur) != LLVMCall) + continue; + + LLVMValueRef callee = ac_llvm_get_called_value(cur); + + if (!ac_llvm_is_function(callee)) + continue; + + const char *name = LLVMGetValueName(callee); + unsigned num_args = LLVMCountParams(callee); + + /* Check if this is an export instruction. */ + if ((num_args != 9 && num_args != 8) || + (strcmp(name, "llvm.SI.export") && + strcmp(name, "llvm.amdgcn.exp.f32"))) + continue; + + LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET); + unsigned target = LLVMConstIntGetZExtValue(arg); + + if (target < V_008DFC_SQ_EXP_PARAM) + continue; + + target -= V_008DFC_SQ_EXP_PARAM; + + /* Parse the instruction. */ + memset(&exp, 0, sizeof(exp)); + exp.offset = target; + exp.inst = cur; + + for (unsigned i = 0; i < 4; i++) { + LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i); + + exp.chan[i].value = v; + + if (LLVMIsUndef(v)) { + exp.chan[i].type = AC_IR_UNDEF; + } else if (LLVMIsAConstantFP(v)) { + LLVMBool loses_info; + exp.chan[i].type = AC_IR_CONST; + exp.chan[i].const_float = + LLVMConstRealGetDouble(v, &loses_info); + } else { + exp.chan[i].type = AC_IR_VALUE; + } + } + + /* Eliminate constant and duplicated PARAM exports. */ + if (ac_eliminate_const_output(vs_output_param_offset, + num_outputs, &exp) || + ac_eliminate_duplicated_output(vs_output_param_offset, + num_outputs, &exports, + &exp)) { + removed_any = true; + } else { + exports.exp[exports.num++] = exp; + } + } + bb = LLVMGetNextBasicBlock(bb); + } + + /* Remove holes in export memory due to removed PARAM exports. + * This is done by renumbering all PARAM exports. + */ + if (removed_any) { + uint8_t old_offset[VARYING_SLOT_MAX]; + unsigned out, i; + + /* Make a copy of the offsets. We need the old version while + * we are modifying some of them. */ + memcpy(old_offset, vs_output_param_offset, + sizeof(old_offset)); + + for (i = 0; i < exports.num; i++) { + unsigned offset = exports.exp[i].offset; + + /* Update vs_output_param_offset. Multiple outputs can + * have the same offset. + */ + for (out = 0; out < num_outputs; out++) { + if (old_offset[out] == offset) + vs_output_param_offset[out] = i; + } + + /* Change the PARAM offset in the instruction. */ + LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET, + LLVMConstInt(ctx->i32, + V_008DFC_SQ_EXP_PARAM + i, 0)); + } + *num_param_exports = exports.num; + } +} + +void ac_init_exec_full_mask(struct ac_llvm_context *ctx) +{ + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); + ac_build_intrinsic(ctx, + "llvm.amdgcn.init.exec", ctx->voidt, + &full_mask, 1, AC_FUNC_ATTR_CONVERGENT); +} + +void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx) +{ + unsigned lds_size = ctx->chip_class >= CIK ? 
65536 : 32768; + ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0, + LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_LOCAL_ADDR_SPACE), + "lds"); +} + +LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, + LLVMValueRef dw_addr) +{ + return ac_build_load(ctx, ctx->lds, dw_addr); +} + +void ac_lds_store(struct ac_llvm_context *ctx, + LLVMValueRef dw_addr, + LLVMValueRef value) +{ + value = ac_to_integer(ctx, value); + ac_build_indexed_store(ctx, ctx->lds, + dw_addr, value); +} + +LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, + LLVMTypeRef dst_type, + LLVMValueRef src0) +{ + LLVMValueRef params[2] = { + src0, + + /* The value of 1 means that ffs(x=0) = undef, so LLVM won't + * add special code to check for x=0. The reason is that + * the LLVM behavior for x=0 is different from what we + * need here. However, LLVM also assumes that ffs(x) is + * in [0, 31], but GLSL expects that ffs(0) = -1, so + * a conditional assignment to handle 0 is still required. + * + * The hardware already implements the correct behavior. + */ + LLVMConstInt(ctx->i1, 1, false), + }; + + LLVMValueRef lsb = ac_build_intrinsic(ctx, "llvm.cttz.i32", ctx->i32, + params, 2, + AC_FUNC_ATTR_READNONE); + + /* TODO: We need an intrinsic to skip this conditional. */ + /* Check for zero: */ + return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, + LLVMIntEQ, src0, + ctx->i32_0, ""), + LLVMConstInt(ctx->i32, -1, 0), lsb, ""); +}
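+
+/* Illustrative semantics of ac_find_lsb (constants shown for clarity, the
+ * actual arguments are LLVM IR values):
+ *
+ *    src0 = 0x00000000  ->  -1   (GLSL findLSB(0))
+ *    src0 = 0x00000001  ->   0
+ *    src0 = 0x80000000  ->  31
+ */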