X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fllvm%2Fac_llvm_build.c;h=77d3f7e73fbe84cf49548ad95d247b14a8a04d0b;hb=6b75262941b55960e2f73d93f85020fa6c9c2d2f;hp=13c093fb3e6ec15b8f8ed250c5efbc040f1d2aaa;hpb=c29514bd226028631b12ae92529d862f8b5de707;p=mesa.git diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c index 13c093fb3e6..77d3f7e73fb 100644 --- a/src/amd/llvm/ac_llvm_build.c +++ b/src/amd/llvm/ac_llvm_build.c @@ -65,8 +65,6 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, enum ac_float_mode float_mode, unsigned wave_size, unsigned ballot_mask_bits) { - LLVMValueRef args[1]; - ctx->context = LLVMContextCreate(); ctx->chip_class = chip_class; @@ -85,11 +83,15 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, ctx->i16 = LLVMIntTypeInContext(ctx->context, 16); ctx->i32 = LLVMIntTypeInContext(ctx->context, 32); ctx->i64 = LLVMIntTypeInContext(ctx->context, 64); + ctx->i128 = LLVMIntTypeInContext(ctx->context, 128); ctx->intptr = ctx->i32; ctx->f16 = LLVMHalfTypeInContext(ctx->context); ctx->f32 = LLVMFloatTypeInContext(ctx->context); ctx->f64 = LLVMDoubleTypeInContext(ctx->context); ctx->v2i16 = LLVMVectorType(ctx->i16, 2); + ctx->v4i16 = LLVMVectorType(ctx->i16, 4); + ctx->v2f16 = LLVMVectorType(ctx->f16, 2); + ctx->v4f16 = LLVMVectorType(ctx->f16, 4); ctx->v2i32 = LLVMVectorType(ctx->i32, 2); ctx->v3i32 = LLVMVectorType(ctx->i32, 3); ctx->v4i32 = LLVMVectorType(ctx->i32, 4); @@ -108,6 +110,8 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false); ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false); ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false); + ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false); + ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false); ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0); ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0); ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0); @@ -124,11 +128,6 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14); - ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6); - - args[0] = LLVMConstReal(ctx->f32, 2.5); - ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1); - ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14); @@ -432,11 +431,19 @@ ac_build_optimization_barrier(struct ac_llvm_context *ctx, } else { LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false); LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false); + LLVMTypeRef type = LLVMTypeOf(*pvgpr); + unsigned bitsize = ac_get_elem_bits(ctx, type); LLVMValueRef vgpr = *pvgpr; - LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr); - unsigned vgpr_size = ac_get_type_size(vgpr_type); + LLVMTypeRef vgpr_type; + unsigned vgpr_size; LLVMValueRef vgpr0; + if (bitsize < 32) + vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, ""); + + vgpr_type = LLVMTypeOf(vgpr); + vgpr_size = ac_get_type_size(vgpr_type); + assert(vgpr_size % 4 == 0); vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), ""); @@ -445,16 +452,18 @@ ac_build_optimization_barrier(struct ac_llvm_context *ctx, vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, ""); vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, ""); + if (bitsize < 32) + vgpr = LLVMBuildTrunc(builder, vgpr, type, ""); + *pvgpr = vgpr; } } LLVMValueRef -ac_build_shader_clock(struct ac_llvm_context *ctx) +ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope) { - const char *intr = LLVM_VERSION_MAJOR >= 9 && ctx->chip_class >= GFX8 ? - "llvm.amdgcn.s.memrealtime" : "llvm.readcyclecounter"; - LLVMValueRef tmp = ac_build_intrinsic(ctx, intr, ctx->i64, NULL, 0, 0); + const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : "llvm.amdgcn.s.memtime"; + LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0); return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, ""); } @@ -494,14 +503,23 @@ ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value) { - const char *name = LLVM_VERSION_MAJOR >= 9 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i1"; + const char *name; + + if (LLVM_VERSION_MAJOR >= 9) { + if (ctx->wave_size == 64) + name = "llvm.amdgcn.icmp.i64.i1"; + else + name = "llvm.amdgcn.icmp.i32.i1"; + } else { + name = "llvm.amdgcn.icmp.i1"; + } LLVMValueRef args[3] = { value, ctx->i1false, LLVMConstInt(ctx->i32, LLVMIntNE, 0), }; - return ac_build_intrinsic(ctx, name, ctx->i64, args, 3, + return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); @@ -684,20 +702,25 @@ ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den) { - /* If we do (num / den), LLVM >= 7.0 does: - * return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f)); - * - * If we do (num * (1 / den)), LLVM does: - * return num * v_rcp_f32(den); - */ - LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0); - LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, ""); - LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, ""); + unsigned type_size = ac_get_type_size(LLVMTypeOf(den)); + const char *name; - /* Use v_rcp_f32 instead of precise division. */ - if (!LLVMIsConstant(ret)) - LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp); - return ret; + /* For doubles, we need precise division to pass GLCTS. */ + if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && + type_size == 8) + return LLVMBuildFDiv(ctx->builder, num, den, ""); + + if (type_size == 2) + name = "llvm.amdgcn.rcp.f16"; + else if (type_size == 4) + name = "llvm.amdgcn.rcp.f32"; + else + name = "llvm.amdgcn.rcp.f64"; + + LLVMValueRef rcp = ac_build_intrinsic(ctx, name, LLVMTypeOf(den), + &den, 1, AC_FUNC_ATTR_READNONE); + + return LLVMBuildFMul(ctx->builder, num, rcp, ""); } /* See fast_idiv_by_const.h. */ @@ -1165,8 +1188,6 @@ ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, - unsigned num_channels, - LLVMTypeRef return_channel_type, unsigned cache_policy, bool use_format, bool structurized) @@ -1180,12 +1201,10 @@ ac_build_buffer_store_common(struct ac_llvm_context *ctx, args[idx++] = voffset ? voffset : ctx->i32_0; args[idx++] = soffset ? soffset : ctx->i32_0; args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0); - unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels; const char *indexing_kind = structurized ? "struct" : "raw"; char name[256], type_name[8]; - LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type; - ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name)); if (use_format) { snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", @@ -1205,13 +1224,10 @@ ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef data, LLVMValueRef vindex, LLVMValueRef voffset, - unsigned num_channels, unsigned cache_policy) { - ac_build_buffer_store_common(ctx, rsrc, data, vindex, - voffset, NULL, num_channels, - ctx->f32, cache_policy, - true, true); + ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, + cache_policy, true, true); } /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4. @@ -1226,8 +1242,7 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef voffset, LLVMValueRef soffset, unsigned inst_offset, - unsigned cache_policy, - bool swizzle_enable_hint) + unsigned cache_policy) { /* Split 3 channel stores, because only LLVM 9+ support 3-channel * intrinsics. */ @@ -1241,12 +1256,10 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx, v01 = ac_build_gather_values(ctx, v, 2); ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, - soffset, inst_offset, cache_policy, - swizzle_enable_hint); + soffset, inst_offset, cache_policy); ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8, - cache_policy, - swizzle_enable_hint); + cache_policy); return; } @@ -1254,7 +1267,7 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx, * (voffset is swizzled, but soffset isn't swizzled). * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter. */ - if (!swizzle_enable_hint) { + if (!(cache_policy & ac_swizzled)) { LLVMValueRef offset = soffset; if (inst_offset) @@ -1263,7 +1276,6 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx, ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), ctx->i32_0, voffset, offset, - num_channels, ctx->f32, cache_policy, false, false); return; } @@ -1307,6 +1319,11 @@ ac_build_buffer_load_common(struct ac_llvm_context *ctx, const char *indexing_kind = structurized ? "struct" : "raw"; char name[256], type_name[8]; + /* D16 is only supported on gfx8+ */ + assert(!use_format || + (channel_type != ctx->f16 && channel_type != ctx->i16) || + ctx->chip_class >= GFX8); + LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type; ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); @@ -1382,10 +1399,12 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef voffset, unsigned num_channels, unsigned cache_policy, - bool can_speculate) + bool can_speculate, + bool d16) { return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, - ctx->i32_0, num_channels, ctx->f32, + ctx->i32_0, num_channels, + d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true, true); } @@ -1637,7 +1656,7 @@ ac_build_opencoded_load_format(struct ac_llvm_context *ctx, } int log_recombine = 0; - if (ctx->chip_class == GFX6 && !known_aligned) { + if ((ctx->chip_class == GFX6 || ctx->chip_class == GFX10) && !known_aligned) { /* Avoid alignment restrictions by loading one byte at a time. */ load_num_channels <<= load_log_size; log_recombine = load_log_size; @@ -1921,8 +1940,7 @@ ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, if (LLVM_VERSION_MAJOR >= 9) { /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, - voffset, soffset, 1, - ctx->i16, cache_policy, + voffset, soffset, cache_policy, false, false); } else { unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; @@ -1948,8 +1966,7 @@ ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, if (LLVM_VERSION_MAJOR >= 9) { /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, - voffset, soffset, 1, - ctx->i8, cache_policy, + voffset, soffset, cache_policy, false, false); } else { unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; @@ -2044,6 +2061,8 @@ ac_build_ddxy(struct ac_llvm_context *ctx, if (result_type == ctx->f16) val = LLVMBuildZExt(ctx->builder, val, ctx->i32, ""); + else if (result_type == ctx->v2f16) + val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, ""); for (unsigned i = 0; i < 4; ++i) { tl_lanes[i] = i & mask; @@ -2177,8 +2196,10 @@ ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { - char name[64]; - snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); + char name[64], type[64]; + + ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.minnum.%s", type); LLVMValueRef args[2] = {a, b}; return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE); @@ -2187,8 +2208,10 @@ LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { - char name[64]; - snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a))); + char name[64], type[64]; + + ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.maxnum.%s", type); LLVMValueRef args[2] = {a, b}; return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE); @@ -2237,13 +2260,10 @@ void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a) args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0); if (a->compr) { - LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context); - LLVMTypeRef v2i16 = LLVMVectorType(i16, 2); - args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], - v2i16, ""); + ctx->v2i16, ""); args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], - v2i16, ""); + ctx->v2i16, ""); args[4] = LLVMConstInt(ctx->i1, a->done, 0); args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0); @@ -2360,6 +2380,14 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <= 1); + assert((a->min_lod ? 1 : 0) + + (a->lod ? 1 : 0) + + (a->level_zero ? 1 : 0) <= 1); + assert(!a->d16 || (ctx->chip_class >= GFX8 && + a->opcode != ac_image_atomic && + a->opcode != ac_image_atomic_cmpswap && + a->opcode != ac_image_get_lod && + a->opcode != ac_image_get_resinfo)); if (a->opcode == ac_image_get_lod) { switch (dim) { @@ -2415,6 +2443,9 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, ""); if (a->lod) args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, ""); + if (a->min_lod) + args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, ""); + overload[num_overloads++] = sample ? ".f32" : ".i32"; args[num_args++] = a->resource; @@ -2468,7 +2499,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, char intr_name[96]; snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.image.%s%s" /* base name */ - "%s%s%s" /* sample/gather modifiers */ + "%s%s%s%s" /* sample/gather modifiers */ ".%s.%s%s%s%s", /* dimension and type overloads */ name, atomic_subop, a->compare ? ".c" : "", @@ -2476,9 +2507,10 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "", + a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname, - atomic ? "i32" : "v4f32", + atomic ? "i32" : (a->d16 ? "v4f16" : "v4f32"), overload[0], overload[1], overload[2]); LLVMTypeRef retty; @@ -2487,15 +2519,14 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) retty = ctx->voidt; else - retty = ctx->v4f32; + retty = a->d16 ? ctx->v4f16 : ctx->v4f32; LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes); - if (!sample && retty == ctx->v4f32) { - result = LLVMBuildBitCast(ctx->builder, result, - ctx->v4i32, ""); - } + if (!sample && !atomic && retty != ctx->voidt) + result = ac_to_integer(ctx, result); + return result; } @@ -2521,10 +2552,7 @@ LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2]) { - LLVMTypeRef v2f16 = - LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2); - - return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16, + return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2, AC_FUNC_ATTR_READNONE); } @@ -2699,33 +2727,6 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags) ctx->voidt, args, 1, 0); } -LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0, - LLVMValueRef src1, LLVMValueRef src2, - unsigned bitsize) -{ - LLVMTypeRef type; - char *intr; - - if (bitsize == 16) { - intr = "llvm.amdgcn.fmed3.f16"; - type = ctx->f16; - } else if (bitsize == 32) { - intr = "llvm.amdgcn.fmed3.f32"; - type = ctx->f32; - } else { - intr = "llvm.amdgcn.fmed3.f64"; - type = ctx->f64; - } - - LLVMValueRef params[] = { - src0, - src1, - src2, - }; - return ac_build_intrinsic(ctx, intr, type, params, 3, - AC_FUNC_ATTR_READNONE); -} - LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) { @@ -2800,6 +2801,12 @@ LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0) bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); switch (bitsize) { + case 128: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, + (LLVMValueRef []) { src0 }, 1, + AC_FUNC_ATTR_READNONE); + result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); + break; case 64: result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef []) { src0 }, 1, @@ -3041,6 +3048,7 @@ void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, LLVMValueRef main_fn, uint8_t *vs_output_param_offset, uint32_t num_outputs, + uint32_t skip_output_mask, uint8_t *num_param_exports) { LLVMBasicBlockRef bb; @@ -3107,12 +3115,13 @@ void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, } /* Eliminate constant and duplicated PARAM exports. */ - if (ac_eliminate_const_output(vs_output_param_offset, - num_outputs, &exp) || - ac_eliminate_duplicated_output(ctx, - vs_output_param_offset, - num_outputs, &exports, - &exp)) { + if (!((1u << target) & skip_output_mask) && + (ac_eliminate_const_output(vs_output_param_offset, + num_outputs, &exp) || + ac_eliminate_duplicated_output(ctx, + vs_output_param_offset, + num_outputs, &exports, + &exp))) { removed_any = true; } else { exports.exp[exports.num++] = exp; @@ -3564,12 +3573,14 @@ void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, } static LLVMValueRef -_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) +_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef lane, bool with_opt_barrier) { LLVMTypeRef type = LLVMTypeOf(src); LLVMValueRef result; - ac_build_optimization_barrier(ctx, &src); + if (with_opt_barrier) + ac_build_optimization_barrier(ctx, &src); src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); if (lane) @@ -3585,15 +3596,10 @@ _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef l return LLVMBuildTrunc(ctx->builder, result, type, ""); } -/** - * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic. - * @param ctx - * @param src - * @param lane - id of the lane or NULL for the first active lane - * @return value of the lane - */ -LLVMValueRef -ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) +static LLVMValueRef +ac_build_readlane_common(struct ac_llvm_context *ctx, + LLVMValueRef src, LLVMValueRef lane, + bool with_opt_barrier) { LLVMTypeRef src_type = LLVMTypeOf(src); src = ac_to_integer(ctx, src); @@ -3607,14 +3613,19 @@ ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef la LLVMBuildBitCast(ctx->builder, src, vec_type, ""); ret = LLVMGetUndef(vec_type); for (unsigned i = 0; i < bits / 32; i++) { + LLVMValueRef ret_comp; + src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), ""); - LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane); + + ret_comp = _ac_build_readlane(ctx, src, lane, + with_opt_barrier); + ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), ""); } } else { - ret = _ac_build_readlane(ctx, src, lane); + ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier); } if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind) @@ -3622,6 +3633,30 @@ ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef la return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); } +/** + * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic. + * + * The optimization barrier is not needed if the value is the same in all lanes + * or if this is called in the outermost block. + * + * @param ctx + * @param src + * @param lane - id of the lane or NULL for the first active lane + * @return value of the lane + */ +LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, + LLVMValueRef src, LLVMValueRef lane) +{ + return ac_build_readlane_common(ctx, src, lane, false); +} + + +LLVMValueRef +ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) +{ + return ac_build_readlane_common(ctx, src, lane, true); +} + LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane) { @@ -3638,9 +3673,7 @@ ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask) (LLVMValueRef []) { mask, ctx->i32_0 }, 2, AC_FUNC_ATTR_READNONE); } - LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, - LLVMVectorType(ctx->i32, 2), - ""); + LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, ""); LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, ""); LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, @@ -3696,15 +3729,22 @@ _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl) { - return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", - LLVMTypeOf(old), - (LLVMValueRef[]) { - old, src, - LLVMConstInt(ctx->i32, dpp_ctrl, 0), - LLVMConstInt(ctx->i32, row_mask, 0), - LLVMConstInt(ctx->i32, bank_mask, 0), - LLVMConstInt(ctx->i1, bound_ctrl, 0) }, - 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + LLVMTypeRef type = LLVMTypeOf(src); + LLVMValueRef res; + + old = LLVMBuildZExt(ctx->builder, old, ctx->i32, ""); + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + res = ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, + (LLVMValueRef[]) { + old, src, + LLVMConstInt(ctx->i32, dpp_ctrl, 0), + LLVMConstInt(ctx->i32, row_mask, 0), + LLVMConstInt(ctx->i32, bank_mask, 0), + LLVMConstInt(ctx->i1, bound_ctrl, 0) }, + 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + return LLVMBuildTrunc(ctx->builder, res, type, ""); } static LLVMValueRef @@ -3717,10 +3757,7 @@ ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, old = ac_to_integer(ctx, old); unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); LLVMValueRef ret; - if (bits == 32) { - ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, - bank_mask, bound_ctrl); - } else { + if (bits > 32) { assert(bits % 32 == 0); LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); LLVMValueRef src_vector = @@ -3745,6 +3782,9 @@ ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, LLVMConstInt(ctx->i32, i, 0), ""); } + } else { + ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, + bank_mask, bound_ctrl); } return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); } @@ -3753,6 +3793,11 @@ static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, bool exchange_rows, bool bound_ctrl) { + LLVMTypeRef type = LLVMTypeOf(src); + LLVMValueRef result; + + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + LLVMValueRef args[6] = { src, src, @@ -3761,10 +3806,13 @@ _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel ctx->i1true, /* fi */ bound_ctrl ? ctx->i1true : ctx->i1false, }; - return ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" - : "llvm.amdgcn.permlane16", - ctx->i32, args, 6, - AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + result = ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" + : "llvm.amdgcn.permlane16", + ctx->i32, args, 6, + AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + return LLVMBuildTrunc(ctx->builder, result, type, ""); } static LLVMValueRef @@ -3775,10 +3823,7 @@ ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, src = ac_to_integer(ctx, src); unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); LLVMValueRef ret; - if (bits == 32) { - ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, - bound_ctrl); - } else { + if (bits > 32) { assert(bits % 32 == 0); LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); LLVMValueRef src_vector = @@ -3797,6 +3842,9 @@ ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, LLVMConstInt(ctx->i32, i, 0), ""); } + } else { + ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, + bound_ctrl); } return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); } @@ -3811,10 +3859,17 @@ ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) { - return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", - LLVMTypeOf(src), (LLVMValueRef []) { + LLVMTypeRef src_type = LLVMTypeOf(src); + LLVMValueRef ret; + + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, + (LLVMValueRef []) { src, LLVMConstInt(ctx->i32, mask, 0) }, - 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + return LLVMBuildTrunc(ctx->builder, ret, src_type, ""); } LLVMValueRef @@ -3824,9 +3879,7 @@ ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask src = ac_to_integer(ctx, src); unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); LLVMValueRef ret; - if (bits == 32) { - ret = _ac_build_ds_swizzle(ctx, src, mask); - } else { + if (bits > 32) { assert(bits % 32 == 0); LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); LLVMValueRef src_vector = @@ -3843,6 +3896,8 @@ ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask LLVMConstInt(ctx->i32, i, 0), ""); } + } else { + ret = _ac_build_ds_swizzle(ctx, src, mask); } return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); } @@ -3850,12 +3905,27 @@ ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src) { + LLVMTypeRef src_type = LLVMTypeOf(src); + unsigned bitsize = ac_get_elem_bits(ctx, src_type); char name[32], type[8]; + LLVMValueRef ret; + + src = ac_to_integer(ctx, src); + + if (bitsize < 32) + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type); - return ac_build_intrinsic(ctx, name, LLVMTypeOf(src), - (LLVMValueRef []) { src }, 1, - AC_FUNC_ATTR_READNONE); + ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), + (LLVMValueRef []) { src }, 1, + AC_FUNC_ATTR_READNONE); + + if (bitsize < 32) + ret = LLVMBuildTrunc(ctx->builder, ret, + ac_to_integer_type(ctx, src_type), ""); + + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); } static LLVMValueRef @@ -3864,8 +3934,15 @@ ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src, { char name[33], type[8]; LLVMTypeRef src_type = LLVMTypeOf(src); + unsigned bitsize = ac_get_elem_bits(ctx, src_type); src = ac_to_integer(ctx, src); inactive = ac_to_integer(ctx, inactive); + + if (bitsize < 32) { + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, ""); + } + ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type); LLVMValueRef ret = @@ -3874,6 +3951,8 @@ ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src, src, inactive }, 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + if (bitsize < 32) + ret = LLVMBuildTrunc(ctx->builder, ret, src_type, ""); return ret; } @@ -3881,7 +3960,39 @@ ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src, static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size) { - if (type_size == 4) { + if (type_size == 1) { + switch (op) { + case nir_op_iadd: return ctx->i8_0; + case nir_op_imul: return ctx->i8_1; + case nir_op_imin: return LLVMConstInt(ctx->i8, INT8_MAX, 0); + case nir_op_umin: return LLVMConstInt(ctx->i8, UINT8_MAX, 0); + case nir_op_imax: return LLVMConstInt(ctx->i8, INT8_MIN, 0); + case nir_op_umax: return ctx->i8_0; + case nir_op_iand: return LLVMConstInt(ctx->i8, -1, 0); + case nir_op_ior: return ctx->i8_0; + case nir_op_ixor: return ctx->i8_0; + default: + unreachable("bad reduction intrinsic"); + } + } else if (type_size == 2) { + switch (op) { + case nir_op_iadd: return ctx->i16_0; + case nir_op_fadd: return ctx->f16_0; + case nir_op_imul: return ctx->i16_1; + case nir_op_fmul: return ctx->f16_1; + case nir_op_imin: return LLVMConstInt(ctx->i16, INT16_MAX, 0); + case nir_op_umin: return LLVMConstInt(ctx->i16, UINT16_MAX, 0); + case nir_op_fmin: return LLVMConstReal(ctx->f16, INFINITY); + case nir_op_imax: return LLVMConstInt(ctx->i16, INT16_MIN, 0); + case nir_op_umax: return ctx->i16_0; + case nir_op_fmax: return LLVMConstReal(ctx->f16, -INFINITY); + case nir_op_iand: return LLVMConstInt(ctx->i16, -1, 0); + case nir_op_ior: return ctx->i16_0; + case nir_op_ixor: return ctx->i16_0; + default: + unreachable("bad reduction intrinsic"); + } + } else if (type_size == 4) { switch (op) { case nir_op_iadd: return ctx->i32_0; case nir_op_fadd: return ctx->f32_0; @@ -3924,6 +4035,7 @@ static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op) { bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8; + bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4; switch (op) { case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, ""); case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, ""); @@ -3936,8 +4048,8 @@ ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""), lhs, rhs, ""); case nir_op_fmin: return ac_build_intrinsic(ctx, - _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32", - _64bit ? ctx->f64 : ctx->f32, + _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16", + _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); case nir_op_imax: return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""), @@ -3946,8 +4058,8 @@ ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""), lhs, rhs, ""); case nir_op_fmax: return ac_build_intrinsic(ctx, - _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32", - _64bit ? ctx->f64 : ctx->f32, + _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16", + _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, ""); case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, ""); @@ -3957,11 +4069,80 @@ ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, } } +/** + * \param src The value to shift. + * \param identity The value to use the first lane. + * \param maxprefix specifies that the result only needs to be correct for a + * prefix of this many threads + * \return src, shifted 1 lane up, and identity shifted into lane 0. + */ +static LLVMValueRef +ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef identity, unsigned maxprefix) +{ + if (ctx->chip_class >= GFX10) { + /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */ + LLVMValueRef active, tmp1, tmp2; + LLVMValueRef tid = ac_get_thread_id(ctx); + + tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); + + tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false); + + if (maxprefix > 32) { + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, + LLVMConstInt(ctx->i32, 32, false), ""); + + tmp2 = LLVMBuildSelect(ctx->builder, active, + ac_build_readlane(ctx, src, + LLVMConstInt(ctx->i32, 31, false)), + tmp2, ""); + + active = LLVMBuildOr(ctx->builder, active, + LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, + LLVMConstInt(ctx->i32, 0x1f, false), ""), + LLVMConstInt(ctx->i32, 0x10, false), ""), ""); + return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + } else if (maxprefix > 16) { + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, + LLVMConstInt(ctx->i32, 16, false), ""); + + return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + } + } else if (ctx->chip_class >= GFX8) { + return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false); + } + + /* wavefront shift_right by 1 on SI/CI */ + LLVMValueRef active, tmp1, tmp2; + LLVMValueRef tid = ac_get_thread_id(ctx); + tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2)); + tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""), + LLVMConstInt(ctx->i32, 0x4, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""), + LLVMConstInt(ctx->i32, 0x8, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""), + LLVMConstInt(ctx->i32, 0x10, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), ""); + return LLVMBuildSelect(ctx->builder, active, identity, tmp1, ""); +} + /** * \param maxprefix specifies that the result only needs to be correct for a * prefix of this many threads - * - * TODO: add inclusive and excluse scan functions for GFX6. */ static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity, @@ -3969,13 +4150,54 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu { LLVMValueRef result, tmp; - if (ctx->chip_class >= GFX10) { - result = inclusive ? src : identity; - } else { - if (!inclusive) - src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false); - result = src; + if (!inclusive) + src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix); + + result = src; + + if (ctx->chip_class <= GFX7) { + assert(maxprefix == 64); + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMValueRef active; + tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + return result; } + if (maxprefix <= 1) return result; tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); @@ -4000,33 +4222,31 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu return result; if (ctx->chip_class >= GFX10) { - /* dpp_row_bcast{15,31} are not supported on gfx10. */ - LLVMBuilderRef builder = ctx->builder; LLVMValueRef tid = ac_get_thread_id(ctx); - LLVMValueRef cc; - /* TODO-GFX10: Can we get better code-gen by putting this into - * a branch so that LLVM generates EXEC mask manipulations? */ - if (inclusive) - tmp = result; - else - tmp = ac_build_alu_op(ctx, result, src, op); - tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false); - tmp = ac_build_alu_op(ctx, result, tmp, op); - cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), ""); - cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, ""); - result = LLVMBuildSelect(builder, cc, tmp, result, ""); + LLVMValueRef active; + + tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false); + + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, + LLVMConstInt(ctx->i32, 16, false), ""), + ctx->i32_0, ""); + + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 32) return result; - if (inclusive) - tmp = result; - else - tmp = ac_build_alu_op(ctx, result, src, op); - tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false)); - tmp = ac_build_alu_op(ctx, result, tmp, op); - cc = LLVMBuildICmp(builder, LLVMIntUGE, tid, - LLVMConstInt(ctx->i32, 32, false), ""); - result = LLVMBuildSelect(builder, cc, tmp, result, ""); + tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); + + active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, + LLVMConstInt(ctx->i32, 32, false), ""); + + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + + result = ac_build_alu_op(ctx, result, tmp, op); return result; } @@ -4131,12 +4351,15 @@ ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsign if (cluster_size == 32) return ac_build_wwm(ctx, result); if (ctx->chip_class >= GFX8) { - if (ctx->chip_class >= GFX10) - swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); - else - swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); - result = ac_build_alu_op(ctx, result, swap, op); - result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0)); + if (ctx->wave_size == 64) { + if (ctx->chip_class >= GFX10) + swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); + else + swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); + result = ac_build_alu_op(ctx, result, swap, op); + result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0)); + } + return ac_build_wwm(ctx, result); } else { swap = ac_build_readlane(ctx, result, ctx->i32_0); @@ -4405,7 +4628,7 @@ ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, } else if (bitsize == 32) { intr = "llvm.canonicalize.f32"; type = ctx->f32; - } else if (bitsize == 64) { + } else { intr = "llvm.canonicalize.f64"; type = ctx->f64; } @@ -4447,6 +4670,24 @@ ac_build_load_helper_invocation(struct ac_llvm_context *ctx) return LLVMBuildSExt(ctx->builder, result, ctx->i32, ""); } +LLVMValueRef +ac_build_is_helper_invocation(struct ac_llvm_context *ctx) +{ + if (!ctx->postponed_kill) + return ac_build_load_helper_invocation(ctx); + + /* !(exact && postponed) */ + LLVMValueRef exact = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", + ctx->i1, NULL, 0, + AC_FUNC_ATTR_READNONE); + + LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, ""); + LLVMValueRef result = LLVMBuildAnd(ctx->builder, exact, postponed, ""); + + return LLVMBuildSelect(ctx->builder, result, ctx->i32_0, + LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), ""); +} + LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args, unsigned num_args) { @@ -4524,3 +4765,295 @@ ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, args->enabled_channels = mask; } +/* Send GS Alloc Req message from the first wave of the group to SPI. + * Message payload is: + * - bits 0..10: vertices in group + * - bits 12..22: primitives in group + */ +void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id, + LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tmp; + bool export_dummy_prim = false; + + /* HW workaround for a GPU hang with 100% culling. + * We always have to export at least 1 primitive. + * Export a degenerate triangle using vertex 0 for all 3 vertices. + */ + if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) { + assert(vtx_cnt == ctx->i32_0); + prim_cnt = ctx->i32_1; + vtx_cnt = ctx->i32_1; + export_dummy_prim = true; + } + + ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020); + + tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false),""); + tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); + ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp); + + if (export_dummy_prim) { + struct ac_ngg_prim prim = {}; + /* The vertex indices are 0,0,0. */ + prim.passthrough = ctx->i32_0; + + struct ac_export_args pos = {}; + pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = ctx->f32_0; + pos.target = V_008DFC_SQ_EXP_POS; + pos.enabled_channels = 0xf; + pos.done = true; + + ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), + ctx->i32_0, ""), 5021); + ac_build_export_prim(ctx, &prim); + ac_build_export(ctx, &pos); + ac_build_endif(ctx, 5021); + } + + ac_build_endif(ctx, 5020); +} + +LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, + const struct ac_ngg_prim *prim) +{ + /* The prim export format is: + * - bits 0..8: index 0 + * - bit 9: edge flag 0 + * - bits 10..18: index 1 + * - bit 19: edge flag 1 + * - bits 20..28: index 2 + * - bit 29: edge flag 2 + * - bit 31: null primitive (skip) + */ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, ""); + LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), ""); + + for (unsigned i = 0; i < prim->num_vertices; ++i) { + tmp = LLVMBuildShl(builder, prim->index[i], + LLVMConstInt(ctx->i32, 10 * i, false), ""); + result = LLVMBuildOr(builder, result, tmp, ""); + tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, ""); + tmp = LLVMBuildShl(builder, tmp, + LLVMConstInt(ctx->i32, 10 * i + 9, false), ""); + result = LLVMBuildOr(builder, result, tmp, ""); + } + return result; +} + +void ac_build_export_prim(struct ac_llvm_context *ctx, + const struct ac_ngg_prim *prim) +{ + struct ac_export_args args; + + if (prim->passthrough) { + args.out[0] = prim->passthrough; + } else { + args.out[0] = ac_pack_prim_export(ctx, prim); + } + + args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, ""); + args.out[1] = LLVMGetUndef(ctx->f32); + args.out[2] = LLVMGetUndef(ctx->f32); + args.out[3] = LLVMGetUndef(ctx->f32); + + args.target = V_008DFC_SQ_EXP_PRIM; + args.enabled_channels = 1; + args.done = true; + args.valid_mask = false; + args.compr = false; + + ac_build_export(ctx, &args); +} + +static LLVMTypeRef +arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx) +{ + if (type == AC_ARG_FLOAT) { + return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size); + } else if (type == AC_ARG_INT) { + return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size); + } else { + LLVMTypeRef ptr_type; + switch (type) { + case AC_ARG_CONST_PTR: + ptr_type = ctx->i8; + break; + case AC_ARG_CONST_FLOAT_PTR: + ptr_type = ctx->f32; + break; + case AC_ARG_CONST_PTR_PTR: + ptr_type = ac_array_in_const32_addr_space(ctx->i8); + break; + case AC_ARG_CONST_DESC_PTR: + ptr_type = ctx->v4i32; + break; + case AC_ARG_CONST_IMAGE_PTR: + ptr_type = ctx->v8i32; + break; + default: + unreachable("unknown arg type"); + } + if (size == 1) { + return ac_array_in_const32_addr_space(ptr_type); + } else { + assert(size == 2); + return ac_array_in_const_addr_space(ptr_type); + } + } +} + +LLVMValueRef +ac_build_main(const struct ac_shader_args *args, + struct ac_llvm_context *ctx, + enum ac_llvm_calling_convention convention, + const char *name, LLVMTypeRef ret_type, + LLVMModuleRef module) +{ + LLVMTypeRef arg_types[AC_MAX_ARGS]; + + for (unsigned i = 0; i < args->arg_count; i++) { + arg_types[i] = arg_llvm_type(args->args[i].type, + args->args[i].size, ctx); + } + + LLVMTypeRef main_function_type = + LLVMFunctionType(ret_type, arg_types, args->arg_count, 0); + + LLVMValueRef main_function = + LLVMAddFunction(module, name, main_function_type); + LLVMBasicBlockRef main_function_body = + LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body"); + LLVMPositionBuilderAtEnd(ctx->builder, main_function_body); + + LLVMSetFunctionCallConv(main_function, convention); + for (unsigned i = 0; i < args->arg_count; ++i) { + LLVMValueRef P = LLVMGetParam(main_function, i); + + if (args->args[i].file != AC_ARG_SGPR) + continue; + + ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG); + + if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { + ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS); + ac_add_attr_dereferenceable(P, UINT64_MAX); + ac_add_attr_alignment(P, 32); + } + } + + ctx->main_function = main_function; + + if (LLVM_VERSION_MAJOR >= 11) { + /* Enable denormals for FP16 and FP64: */ + LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", + "ieee,ieee"); + /* Disable denormals for FP32: */ + LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32", + "preserve-sign,preserve-sign"); + } + return main_function; +} + +void ac_build_s_endpgm(struct ac_llvm_context *ctx) +{ + LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false); + LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false); + LLVMBuildCall(ctx->builder, code, NULL, 0, ""); +} + +LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, + LLVMValueRef mask, LLVMValueRef index) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMTypeRef type = LLVMTypeOf(mask); + + LLVMValueRef bit = LLVMBuildShl(builder, LLVMConstInt(type, 1, 0), + LLVMBuildZExt(builder, index, type, ""), ""); + LLVMValueRef prefix_bits = LLVMBuildSub(builder, bit, LLVMConstInt(type, 1, 0), ""); + LLVMValueRef prefix_mask = LLVMBuildAnd(builder, mask, prefix_bits, ""); + return ac_build_bit_count(ctx, prefix_mask); +} + +/* Compute the prefix sum of the "mask" bit array with 128 elements (bits). */ +LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, + LLVMValueRef mask[2], LLVMValueRef index) +{ + LLVMBuilderRef builder = ctx->builder; +#if 0 + /* Reference version using i128. */ + LLVMValueRef input_mask = + LLVMBuildBitCast(builder, ac_build_gather_values(ctx, mask, 2), ctx->i128, ""); + + return ac_prefix_bitcount(ctx, input_mask, index); +#else + /* Optimized version using 2 64-bit masks. */ + LLVMValueRef is_hi, is_0, c64, c128, all_bits; + LLVMValueRef prefix_mask[2], shift[2], mask_bcnt0, prefix_bcnt[2]; + + /* Compute the 128-bit prefix mask. */ + c64 = LLVMConstInt(ctx->i32, 64, 0); + c128 = LLVMConstInt(ctx->i32, 128, 0); + all_bits = LLVMConstInt(ctx->i64, UINT64_MAX, 0); + /* The first index that can have non-zero high bits in the prefix mask is 65. */ + is_hi = LLVMBuildICmp(builder, LLVMIntUGT, index, c64, ""); + is_0 = LLVMBuildICmp(builder, LLVMIntEQ, index, ctx->i32_0, ""); + mask_bcnt0 = ac_build_bit_count(ctx, mask[0]); + + for (unsigned i = 0; i < 2; i++) { + shift[i] = LLVMBuildSub(builder, i ? c128 : c64, index, ""); + /* For i==0, index==0, the right shift by 64 doesn't give the desired result, + * so we handle it by the is_0 select. + * For i==1, index==64, same story, so we handle it by the last is_hi select. + * For i==0, index==64, we shift by 0, which is what we want. + */ + prefix_mask[i] = LLVMBuildLShr(builder, all_bits, + LLVMBuildZExt(builder, shift[i], ctx->i64, ""), ""); + prefix_mask[i] = LLVMBuildAnd(builder, mask[i], prefix_mask[i], ""); + prefix_bcnt[i] = ac_build_bit_count(ctx, prefix_mask[i]); + } + + prefix_bcnt[0] = LLVMBuildSelect(builder, is_0, ctx->i32_0, prefix_bcnt[0], ""); + prefix_bcnt[0] = LLVMBuildSelect(builder, is_hi, mask_bcnt0, prefix_bcnt[0], ""); + prefix_bcnt[1] = LLVMBuildSelect(builder, is_hi, prefix_bcnt[1], ctx->i32_0, ""); + + return LLVMBuildAdd(builder, prefix_bcnt[0], prefix_bcnt[1], ""); +#endif +} + +/** + * Convert triangle strip indices to triangle indices. This is used to decompose + * triangle strips into triangles. + */ +void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, + LLVMValueRef is_odd, + LLVMValueRef flatshade_first, + LLVMValueRef index[3]) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef out[3]; + + /* We need to change the vertex order for odd triangles to get correct + * front/back facing by swapping 2 vertex indices, but we also have to + * keep the provoking vertex in the same place. + * + * If the first vertex is provoking, swap index 1 and 2. + * If the last vertex is provoking, swap index 0 and 1. + */ + out[0] = LLVMBuildSelect(builder, flatshade_first, + index[0], + LLVMBuildSelect(builder, is_odd, + index[1], index[0], ""), ""); + out[1] = LLVMBuildSelect(builder, flatshade_first, + LLVMBuildSelect(builder, is_odd, + index[2], index[1], ""), + LLVMBuildSelect(builder, is_odd, + index[0], index[1], ""), ""); + out[2] = LLVMBuildSelect(builder, flatshade_first, + LLVMBuildSelect(builder, is_odd, + index[1], index[2], ""), + index[2], ""); + memcpy(index, out, sizeof(out)); +}