From: Pierre-Eric Pelloux-Prayer
Date: Mon, 7 Sep 2020 07:56:01 +0000 (+0200)
Subject: amd/llvm: switch to 3-spaces style
X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=commitdiff_plain;h=ab4a77bc744ae1c9b29450318beb09134134a8dc

amd/llvm: switch to 3-spaces style

Follow-up of !4319 using the same clang-format config.

Acked-by: Samuel Pitoiset
Acked-by: Marek Olšák
Part-of:
---

diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c
index 79a5f8fbb2b..f1ab80e7f09 100644
--- a/src/amd/llvm/ac_llvm_build.c
+++ b/src/amd/llvm/ac_llvm_build.c
@@ -25,324 +25,294 @@
 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
 #include "ac_llvm_build.h"
 
-#include <llvm-c/Core.h>
-#include <llvm-c/TargetMachine.h>
-
-#include "c11/threads.h"
-
-#include <assert.h>
-#include <stdio.h>
-
+#include "ac_exp_param.h"
 #include "ac_llvm_util.h"
 #include "ac_shader_util.h"
-#include "ac_exp_param.h"
+#include "c11/threads.h"
+#include "shader_enums.h"
+#include "sid.h"
 #include "util/bitscan.h"
 #include "util/macros.h"
 #include "util/u_atomic.h"
 #include "util/u_math.h"
-#include "sid.h"
+#include <llvm-c/Core.h>
+#include <llvm-c/TargetMachine.h>
 
-#include "shader_enums.h"
+#include <assert.h>
+#include <stdio.h>
 
 #define AC_LLVM_INITIAL_CF_DEPTH 4
 
 /* Data for if/else/endif and bgnloop/endloop control flow structures.
  */
 struct ac_llvm_flow {
-	/* Loop exit or next part of if/else/endif. */
-	LLVMBasicBlockRef next_block;
-	LLVMBasicBlockRef loop_entry_block;
+   /* Loop exit or next part of if/else/endif. */
+   LLVMBasicBlockRef next_block;
+   LLVMBasicBlockRef loop_entry_block;
 };
 
 /* Initialize module-independent parts of the context.
  *
  * The caller is responsible for initializing ctx::module and ctx::builder.
  */
-void
-ac_llvm_context_init(struct ac_llvm_context *ctx,
-		     struct ac_llvm_compiler *compiler,
-		     enum chip_class chip_class, enum radeon_family family,
-		     enum ac_float_mode float_mode, unsigned wave_size,
-		     unsigned ballot_mask_bits)
-{
-	ctx->context = LLVMContextCreate();
-
-	ctx->chip_class = chip_class;
-	ctx->family = family;
-	ctx->wave_size = wave_size;
-	ctx->ballot_mask_bits = ballot_mask_bits;
-	ctx->float_mode = float_mode;
-	ctx->module = ac_create_module(wave_size == 32 ?
compiler->tm_wave32 - : compiler->tm, - ctx->context); - ctx->builder = ac_create_builder(ctx->context, float_mode); - - ctx->voidt = LLVMVoidTypeInContext(ctx->context); - ctx->i1 = LLVMInt1TypeInContext(ctx->context); - ctx->i8 = LLVMInt8TypeInContext(ctx->context); - ctx->i16 = LLVMIntTypeInContext(ctx->context, 16); - ctx->i32 = LLVMIntTypeInContext(ctx->context, 32); - ctx->i64 = LLVMIntTypeInContext(ctx->context, 64); - ctx->i128 = LLVMIntTypeInContext(ctx->context, 128); - ctx->intptr = ctx->i32; - ctx->f16 = LLVMHalfTypeInContext(ctx->context); - ctx->f32 = LLVMFloatTypeInContext(ctx->context); - ctx->f64 = LLVMDoubleTypeInContext(ctx->context); - ctx->v2i16 = LLVMVectorType(ctx->i16, 2); - ctx->v4i16 = LLVMVectorType(ctx->i16, 4); - ctx->v2f16 = LLVMVectorType(ctx->f16, 2); - ctx->v4f16 = LLVMVectorType(ctx->f16, 4); - ctx->v2i32 = LLVMVectorType(ctx->i32, 2); - ctx->v3i32 = LLVMVectorType(ctx->i32, 3); - ctx->v4i32 = LLVMVectorType(ctx->i32, 4); - ctx->v2f32 = LLVMVectorType(ctx->f32, 2); - ctx->v3f32 = LLVMVectorType(ctx->f32, 3); - ctx->v4f32 = LLVMVectorType(ctx->f32, 4); - ctx->v8i32 = LLVMVectorType(ctx->i32, 8); - ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size); - ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits); - - ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false); - ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false); - ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false); - ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false); - ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false); - ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false); - ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false); - ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false); - ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false); - ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false); - ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0); - ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0); - ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0); - ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0); - ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0); - ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0); - - ctx->i1false = LLVMConstInt(ctx->i1, 0, false); - ctx->i1true = LLVMConstInt(ctx->i1, 1, false); - - ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, - "range", 5); - - ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, - "invariant.load", 14); - - ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, - "amdgpu.uniform", 14); - - ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0); - ctx->flow = calloc(1, sizeof(*ctx->flow)); -} - -void -ac_llvm_context_dispose(struct ac_llvm_context *ctx) -{ - free(ctx->flow->stack); - free(ctx->flow); - ctx->flow = NULL; -} - -int -ac_get_llvm_num_components(LLVMValueRef value) -{ - LLVMTypeRef type = LLVMTypeOf(value); - unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind - ? 
LLVMGetVectorSize(type) - : 1; - return num_components; -} - -LLVMValueRef -ac_llvm_extract_elem(struct ac_llvm_context *ac, - LLVMValueRef value, - int index) -{ - if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) { - assert(index == 0); - return value; - } - - return LLVMBuildExtractElement(ac->builder, value, - LLVMConstInt(ac->i32, index, false), ""); -} - -int -ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type) -{ - if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) - type = LLVMGetElementType(type); - - if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind) - return LLVMGetIntTypeWidth(type); - - if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { - if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS) - return 32; - } - - if (type == ctx->f16) - return 16; - if (type == ctx->f32) - return 32; - if (type == ctx->f64) - return 64; - - unreachable("Unhandled type kind in get_elem_bits"); -} - -unsigned -ac_get_type_size(LLVMTypeRef type) -{ - LLVMTypeKind kind = LLVMGetTypeKind(type); - - switch (kind) { - case LLVMIntegerTypeKind: - return LLVMGetIntTypeWidth(type) / 8; - case LLVMHalfTypeKind: - return 2; - case LLVMFloatTypeKind: - return 4; - case LLVMDoubleTypeKind: - return 8; - case LLVMPointerTypeKind: - if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT) - return 4; - return 8; - case LLVMVectorTypeKind: - return LLVMGetVectorSize(type) * - ac_get_type_size(LLVMGetElementType(type)); - case LLVMArrayTypeKind: - return LLVMGetArrayLength(type) * - ac_get_type_size(LLVMGetElementType(type)); - default: - assert(0); - return 0; - } +void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler, + enum chip_class chip_class, enum radeon_family family, + enum ac_float_mode float_mode, unsigned wave_size, + unsigned ballot_mask_bits) +{ + ctx->context = LLVMContextCreate(); + + ctx->chip_class = chip_class; + ctx->family = family; + ctx->wave_size = wave_size; + ctx->ballot_mask_bits = ballot_mask_bits; + ctx->float_mode = float_mode; + ctx->module = + ac_create_module(wave_size == 32 ? 
compiler->tm_wave32 : compiler->tm, ctx->context); + ctx->builder = ac_create_builder(ctx->context, float_mode); + + ctx->voidt = LLVMVoidTypeInContext(ctx->context); + ctx->i1 = LLVMInt1TypeInContext(ctx->context); + ctx->i8 = LLVMInt8TypeInContext(ctx->context); + ctx->i16 = LLVMIntTypeInContext(ctx->context, 16); + ctx->i32 = LLVMIntTypeInContext(ctx->context, 32); + ctx->i64 = LLVMIntTypeInContext(ctx->context, 64); + ctx->i128 = LLVMIntTypeInContext(ctx->context, 128); + ctx->intptr = ctx->i32; + ctx->f16 = LLVMHalfTypeInContext(ctx->context); + ctx->f32 = LLVMFloatTypeInContext(ctx->context); + ctx->f64 = LLVMDoubleTypeInContext(ctx->context); + ctx->v2i16 = LLVMVectorType(ctx->i16, 2); + ctx->v4i16 = LLVMVectorType(ctx->i16, 4); + ctx->v2f16 = LLVMVectorType(ctx->f16, 2); + ctx->v4f16 = LLVMVectorType(ctx->f16, 4); + ctx->v2i32 = LLVMVectorType(ctx->i32, 2); + ctx->v3i32 = LLVMVectorType(ctx->i32, 3); + ctx->v4i32 = LLVMVectorType(ctx->i32, 4); + ctx->v2f32 = LLVMVectorType(ctx->f32, 2); + ctx->v3f32 = LLVMVectorType(ctx->f32, 3); + ctx->v4f32 = LLVMVectorType(ctx->f32, 4); + ctx->v8i32 = LLVMVectorType(ctx->i32, 8); + ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size); + ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits); + + ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false); + ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false); + ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false); + ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false); + ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false); + ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false); + ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false); + ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false); + ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false); + ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false); + ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0); + ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0); + ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0); + ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0); + ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0); + ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0); + + ctx->i1false = LLVMConstInt(ctx->i1, 0, false); + ctx->i1true = LLVMConstInt(ctx->i1, 1, false); + + ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5); + + ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14); + + ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14); + + ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0); + ctx->flow = calloc(1, sizeof(*ctx->flow)); +} + +void ac_llvm_context_dispose(struct ac_llvm_context *ctx) +{ + free(ctx->flow->stack); + free(ctx->flow); + ctx->flow = NULL; +} + +int ac_get_llvm_num_components(LLVMValueRef value) +{ + LLVMTypeRef type = LLVMTypeOf(value); + unsigned num_components = + LLVMGetTypeKind(type) == LLVMVectorTypeKind ? 
LLVMGetVectorSize(type) : 1; + return num_components; +} + +LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index) +{ + if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) { + assert(index == 0); + return value; + } + + return LLVMBuildExtractElement(ac->builder, value, LLVMConstInt(ac->i32, index, false), ""); +} + +int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type) +{ + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) + type = LLVMGetElementType(type); + + if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind) + return LLVMGetIntTypeWidth(type); + + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { + if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS) + return 32; + } + + if (type == ctx->f16) + return 16; + if (type == ctx->f32) + return 32; + if (type == ctx->f64) + return 64; + + unreachable("Unhandled type kind in get_elem_bits"); +} + +unsigned ac_get_type_size(LLVMTypeRef type) +{ + LLVMTypeKind kind = LLVMGetTypeKind(type); + + switch (kind) { + case LLVMIntegerTypeKind: + return LLVMGetIntTypeWidth(type) / 8; + case LLVMHalfTypeKind: + return 2; + case LLVMFloatTypeKind: + return 4; + case LLVMDoubleTypeKind: + return 8; + case LLVMPointerTypeKind: + if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT) + return 4; + return 8; + case LLVMVectorTypeKind: + return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type)); + case LLVMArrayTypeKind: + return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type)); + default: + assert(0); + return 0; + } } static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) { - if (t == ctx->i8) - return ctx->i8; - else if (t == ctx->f16 || t == ctx->i16) - return ctx->i16; - else if (t == ctx->f32 || t == ctx->i32) - return ctx->i32; - else if (t == ctx->f64 || t == ctx->i64) - return ctx->i64; - else - unreachable("Unhandled integer size"); -} - -LLVMTypeRef -ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t) -{ - if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { - LLVMTypeRef elem_type = LLVMGetElementType(t); - return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), - LLVMGetVectorSize(t)); - } - if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) { - switch (LLVMGetPointerAddressSpace(t)) { - case AC_ADDR_SPACE_GLOBAL: - return ctx->i64; - case AC_ADDR_SPACE_CONST_32BIT: - case AC_ADDR_SPACE_LDS: - return ctx->i32; - default: - unreachable("unhandled address space"); - } - } - return to_integer_type_scalar(ctx, t); -} - -LLVMValueRef -ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v) -{ - LLVMTypeRef type = LLVMTypeOf(v); - if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { - return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), ""); - } - return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), ""); -} - -LLVMValueRef -ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v) -{ - LLVMTypeRef type = LLVMTypeOf(v); - if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) - return v; - return ac_to_integer(ctx, v); + if (t == ctx->i8) + return ctx->i8; + else if (t == ctx->f16 || t == ctx->i16) + return ctx->i16; + else if (t == ctx->f32 || t == ctx->i32) + return ctx->i32; + else if (t == ctx->f64 || t == ctx->i64) + return ctx->i64; + else + unreachable("Unhandled integer size"); } -static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) +LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + 
if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { + LLVMTypeRef elem_type = LLVMGetElementType(t); + return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), LLVMGetVectorSize(t)); + } + if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) { + switch (LLVMGetPointerAddressSpace(t)) { + case AC_ADDR_SPACE_GLOBAL: + return ctx->i64; + case AC_ADDR_SPACE_CONST_32BIT: + case AC_ADDR_SPACE_LDS: + return ctx->i32; + default: + unreachable("unhandled address space"); + } + } + return to_integer_type_scalar(ctx, t); +} + +LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v) { - if (t == ctx->i8) - return ctx->i8; - else if (t == ctx->i16 || t == ctx->f16) - return ctx->f16; - else if (t == ctx->i32 || t == ctx->f32) - return ctx->f32; - else if (t == ctx->i64 || t == ctx->f64) - return ctx->f64; - else - unreachable("Unhandled float size"); + LLVMTypeRef type = LLVMTypeOf(v); + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { + return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), ""); + } + return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), ""); } -LLVMTypeRef -ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t) +LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v) { - if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { - LLVMTypeRef elem_type = LLVMGetElementType(t); - return LLVMVectorType(to_float_type_scalar(ctx, elem_type), - LLVMGetVectorSize(t)); - } - return to_float_type_scalar(ctx, t); + LLVMTypeRef type = LLVMTypeOf(v); + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) + return v; + return ac_to_integer(ctx, v); } -LLVMValueRef -ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v) +static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t) { - LLVMTypeRef type = LLVMTypeOf(v); - return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), ""); + if (t == ctx->i8) + return ctx->i8; + else if (t == ctx->i16 || t == ctx->f16) + return ctx->f16; + else if (t == ctx->i32 || t == ctx->f32) + return ctx->f32; + else if (t == ctx->i64 || t == ctx->f64) + return ctx->f64; + else + unreachable("Unhandled float size"); } +LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t) +{ + if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) { + LLVMTypeRef elem_type = LLVMGetElementType(t); + return LLVMVectorType(to_float_type_scalar(ctx, elem_type), LLVMGetVectorSize(t)); + } + return to_float_type_scalar(ctx, t); +} -LLVMValueRef -ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, - LLVMTypeRef return_type, LLVMValueRef *params, - unsigned param_count, unsigned attrib_mask) +LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v) { - LLVMValueRef function, call; - bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY); + LLVMTypeRef type = LLVMTypeOf(v); + return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), ""); +} - function = LLVMGetNamedFunction(ctx->module, name); - if (!function) { - LLVMTypeRef param_types[32], function_type; - unsigned i; +LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, + LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count, + unsigned attrib_mask) +{ + LLVMValueRef function, call; + bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY); - assert(param_count <= 32); + function = LLVMGetNamedFunction(ctx->module, name); + if (!function) { + LLVMTypeRef param_types[32], function_type; + unsigned i; - for (i = 0; i < param_count; 
++i) { - assert(params[i]); - param_types[i] = LLVMTypeOf(params[i]); - } - function_type = - LLVMFunctionType(return_type, param_types, param_count, 0); - function = LLVMAddFunction(ctx->module, name, function_type); + assert(param_count <= 32); - LLVMSetFunctionCallConv(function, LLVMCCallConv); - LLVMSetLinkage(function, LLVMExternalLinkage); + for (i = 0; i < param_count; ++i) { + assert(params[i]); + param_types[i] = LLVMTypeOf(params[i]); + } + function_type = LLVMFunctionType(return_type, param_types, param_count, 0); + function = LLVMAddFunction(ctx->module, name, function_type); - if (!set_callsite_attrs) - ac_add_func_attributes(ctx->context, function, attrib_mask); - } + LLVMSetFunctionCallConv(function, LLVMCCallConv); + LLVMSetLinkage(function, LLVMExternalLinkage); - call = LLVMBuildCall(ctx->builder, function, params, param_count, ""); - if (set_callsite_attrs) - ac_add_func_attributes(ctx->context, call, attrib_mask); - return call; + if (!set_callsite_attrs) + ac_add_func_attributes(ctx->context, function, attrib_mask); + } + + call = LLVMBuildCall(ctx->builder, function, params, param_count, ""); + if (set_callsite_attrs) + ac_add_func_attributes(ctx->context, call, attrib_mask); + return call; } /** @@ -351,59 +321,55 @@ ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, */ void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize) { - LLVMTypeRef elem_type = type; - - assert(bufsize >= 8); - - if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { - int ret = snprintf(buf, bufsize, "v%u", - LLVMGetVectorSize(type)); - if (ret < 0) { - char *type_name = LLVMPrintTypeToString(type); - fprintf(stderr, "Error building type name for: %s\n", - type_name); - LLVMDisposeMessage(type_name); - return; - } - elem_type = LLVMGetElementType(type); - buf += ret; - bufsize -= ret; - } - switch (LLVMGetTypeKind(elem_type)) { - default: break; - case LLVMIntegerTypeKind: - snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type)); - break; - case LLVMHalfTypeKind: - snprintf(buf, bufsize, "f16"); - break; - case LLVMFloatTypeKind: - snprintf(buf, bufsize, "f32"); - break; - case LLVMDoubleTypeKind: - snprintf(buf, bufsize, "f64"); - break; - } + LLVMTypeRef elem_type = type; + + assert(bufsize >= 8); + + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { + int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type)); + if (ret < 0) { + char *type_name = LLVMPrintTypeToString(type); + fprintf(stderr, "Error building type name for: %s\n", type_name); + LLVMDisposeMessage(type_name); + return; + } + elem_type = LLVMGetElementType(type); + buf += ret; + bufsize -= ret; + } + switch (LLVMGetTypeKind(elem_type)) { + default: + break; + case LLVMIntegerTypeKind: + snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type)); + break; + case LLVMHalfTypeKind: + snprintf(buf, bufsize, "f16"); + break; + case LLVMFloatTypeKind: + snprintf(buf, bufsize, "f32"); + break; + case LLVMDoubleTypeKind: + snprintf(buf, bufsize, "f64"); + break; + } } /** * Helper function that builds an LLVM IR PHI node and immediately adds * incoming edges. 
*/ -LLVMValueRef -ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, - unsigned count_incoming, LLVMValueRef *values, - LLVMBasicBlockRef *blocks) +LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming, + LLVMValueRef *values, LLVMBasicBlockRef *blocks) { - LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, ""); - LLVMAddIncoming(phi, values, blocks, count_incoming); - return phi; + LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, ""); + LLVMAddIncoming(phi, values, blocks, count_incoming); + return phi; } void ac_build_s_barrier(struct ac_llvm_context *ctx) { - ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, - 0, AC_FUNC_ATTR_CONVERGENT); + ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT); } /* Prevent optimizations (at least of memory accesses) across the current @@ -413,375 +379,328 @@ void ac_build_s_barrier(struct ac_llvm_context *ctx) * Optionally, a value can be passed through the inline assembly to prevent * LLVM from hoisting calls to ReadNone functions. */ -void -ac_build_optimization_barrier(struct ac_llvm_context *ctx, - LLVMValueRef *pvgpr) -{ - static int counter = 0; - - LLVMBuilderRef builder = ctx->builder; - char code[16]; - - snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter)); - - if (!pvgpr) { - LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false); - LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false); - LLVMBuildCall(builder, inlineasm, NULL, 0, ""); - } else { - LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false); - LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false); - LLVMTypeRef type = LLVMTypeOf(*pvgpr); - unsigned bitsize = ac_get_elem_bits(ctx, type); - LLVMValueRef vgpr = *pvgpr; - LLVMTypeRef vgpr_type; - unsigned vgpr_size; - LLVMValueRef vgpr0; +void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pvgpr) +{ + static int counter = 0; + + LLVMBuilderRef builder = ctx->builder; + char code[16]; + + snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter)); - if (bitsize < 32) - vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, ""); + if (!pvgpr) { + LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false); + LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false); + LLVMBuildCall(builder, inlineasm, NULL, 0, ""); + } else { + LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false); + LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false); + LLVMTypeRef type = LLVMTypeOf(*pvgpr); + unsigned bitsize = ac_get_elem_bits(ctx, type); + LLVMValueRef vgpr = *pvgpr; + LLVMTypeRef vgpr_type; + unsigned vgpr_size; + LLVMValueRef vgpr0; - vgpr_type = LLVMTypeOf(vgpr); - vgpr_size = ac_get_type_size(vgpr_type); + if (bitsize < 32) + vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, ""); - assert(vgpr_size % 4 == 0); + vgpr_type = LLVMTypeOf(vgpr); + vgpr_size = ac_get_type_size(vgpr_type); - vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), ""); - vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, ""); - vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, ""); - vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, ""); - vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, ""); + assert(vgpr_size % 4 == 0); - if (bitsize < 32) - vgpr = LLVMBuildTrunc(builder, vgpr, type, ""); + vgpr = LLVMBuildBitCast(builder, vgpr, 
LLVMVectorType(ctx->i32, vgpr_size / 4), ""); + vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, ""); + vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, ""); + vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, ""); + vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, ""); - *pvgpr = vgpr; - } + if (bitsize < 32) + vgpr = LLVMBuildTrunc(builder, vgpr, type, ""); + + *pvgpr = vgpr; + } } -LLVMValueRef -ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope) +LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope) { - const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : "llvm.amdgcn.s.memtime"; - LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0); - return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, ""); + const char *name = + scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : "llvm.amdgcn.s.memtime"; + LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0); + return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, ""); } -LLVMValueRef -ac_build_ballot(struct ac_llvm_context *ctx, - LLVMValueRef value) +LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value) { - const char *name; + const char *name; - if (LLVM_VERSION_MAJOR >= 9) { - if (ctx->wave_size == 64) - name = "llvm.amdgcn.icmp.i64.i32"; - else - name = "llvm.amdgcn.icmp.i32.i32"; - } else { - name = "llvm.amdgcn.icmp.i32"; - } - LLVMValueRef args[3] = { - value, - ctx->i32_0, - LLVMConstInt(ctx->i32, LLVMIntNE, 0) - }; + if (LLVM_VERSION_MAJOR >= 9) { + if (ctx->wave_size == 64) + name = "llvm.amdgcn.icmp.i64.i32"; + else + name = "llvm.amdgcn.icmp.i32.i32"; + } else { + name = "llvm.amdgcn.icmp.i32"; + } + LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)}; - /* We currently have no other way to prevent LLVM from lifting the icmp - * calls to a dominating basic block. - */ - ac_build_optimization_barrier(ctx, &args[0]); + /* We currently have no other way to prevent LLVM from lifting the icmp + * calls to a dominating basic block. 
+ */ + ac_build_optimization_barrier(ctx, &args[0]); - args[0] = ac_to_integer(ctx, args[0]); + args[0] = ac_to_integer(ctx, args[0]); - return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, - AC_FUNC_ATTR_NOUNWIND | - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); + return ac_build_intrinsic( + ctx, name, ctx->iN_wavemask, args, 3, + AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); } -LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, - LLVMValueRef value) +LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value) { - const char *name; - - if (LLVM_VERSION_MAJOR >= 9) { - if (ctx->wave_size == 64) - name = "llvm.amdgcn.icmp.i64.i1"; - else - name = "llvm.amdgcn.icmp.i32.i1"; - } else { - name = "llvm.amdgcn.icmp.i1"; - } - LLVMValueRef args[3] = { - value, - ctx->i1false, - LLVMConstInt(ctx->i32, LLVMIntNE, 0), - }; + const char *name; + + if (LLVM_VERSION_MAJOR >= 9) { + if (ctx->wave_size == 64) + name = "llvm.amdgcn.icmp.i64.i1"; + else + name = "llvm.amdgcn.icmp.i32.i1"; + } else { + name = "llvm.amdgcn.icmp.i1"; + } + LLVMValueRef args[3] = { + value, + ctx->i1false, + LLVMConstInt(ctx->i32, LLVMIntNE, 0), + }; - return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, - AC_FUNC_ATTR_NOUNWIND | - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); + return ac_build_intrinsic( + ctx, name, ctx->iN_wavemask, args, 3, + AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); } -LLVMValueRef -ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value) +LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value) { - LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); - LLVMValueRef vote_set = ac_build_ballot(ctx, value); - return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, ""); + LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); + LLVMValueRef vote_set = ac_build_ballot(ctx, value); + return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, ""); } -LLVMValueRef -ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value) +LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value) { - LLVMValueRef vote_set = ac_build_ballot(ctx, value); - return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, - LLVMConstInt(ctx->iN_wavemask, 0, 0), ""); + LLVMValueRef vote_set = ac_build_ballot(ctx, value); + return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0), + ""); } -LLVMValueRef -ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value) +LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value) { - LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); - LLVMValueRef vote_set = ac_build_ballot(ctx, value); + LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); + LLVMValueRef vote_set = ac_build_ballot(ctx, value); - LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, - vote_set, active_set, ""); - LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ, - vote_set, - LLVMConstInt(ctx->iN_wavemask, 0, 0), ""); - return LLVMBuildOr(ctx->builder, all, none, ""); + LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, ""); + LLVMValueRef none = + LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0), ""); + return LLVMBuildOr(ctx->builder, all, none, ""); } -LLVMValueRef -ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, - 
unsigned value_count, unsigned component)
+LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
+                                            unsigned value_count, unsigned component)
 {
-	LLVMValueRef vec = NULL;
+   LLVMValueRef vec = NULL;
 
-	if (value_count == 1) {
-		return values[component];
-	} else if (!value_count)
-		unreachable("value_count is 0");
+   if (value_count == 1) {
+      return values[component];
+   } else if (!value_count)
+      unreachable("value_count is 0");
 
-	for (unsigned i = component; i < value_count + component; i++) {
-		LLVMValueRef value = values[i];
+   for (unsigned i = component; i < value_count + component; i++) {
+      LLVMValueRef value = values[i];
 
-		if (i == component)
-			vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
-		LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
-		vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
-	}
-	return vec;
+      if (i == component)
+         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
+      LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
+      vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
+   }
+   return vec;
 }
 
-LLVMValueRef
-ac_build_gather_values_extended(struct ac_llvm_context *ctx,
-				LLVMValueRef *values,
-				unsigned value_count,
-				unsigned value_stride,
-				bool load,
-				bool always_vector)
+LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
+                                             unsigned value_count, unsigned value_stride, bool load,
+                                             bool always_vector)
 {
-	LLVMBuilderRef builder = ctx->builder;
-	LLVMValueRef vec = NULL;
-	unsigned i;
+   LLVMBuilderRef builder = ctx->builder;
+   LLVMValueRef vec = NULL;
+   unsigned i;
 
-	if (value_count == 1 && !always_vector) {
-		if (load)
-			return LLVMBuildLoad(builder, values[0], "");
-		return values[0];
-	} else if (!value_count)
-		unreachable("value_count is 0");
+   if (value_count == 1 && !always_vector) {
+      if (load)
+         return LLVMBuildLoad(builder, values[0], "");
+      return values[0];
+   } else if (!value_count)
+      unreachable("value_count is 0");
 
-	for (i = 0; i < value_count; i++) {
-		LLVMValueRef value = values[i * value_stride];
-		if (load)
-			value = LLVMBuildLoad(builder, value, "");
+   for (i = 0; i < value_count; i++) {
+      LLVMValueRef value = values[i * value_stride];
+      if (load)
+         value = LLVMBuildLoad(builder, value, "");
 
-		if (!i)
-			vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
-		LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
-		vec = LLVMBuildInsertElement(builder, vec, value, index, "");
-	}
-	return vec;
+      if (!i)
+         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
+      LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
+      vec = LLVMBuildInsertElement(builder, vec, value, index, "");
+   }
+   return vec;
 }
 
-LLVMValueRef
-ac_build_gather_values(struct ac_llvm_context *ctx,
-		       LLVMValueRef *values,
-		       unsigned value_count)
+LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
+                                    unsigned value_count)
 {
-	return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
+   return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
 }
 
 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
  * channels with undef. Extract at most src_channels components from the input.
*/ -static LLVMValueRef -ac_build_expand(struct ac_llvm_context *ctx, - LLVMValueRef value, - unsigned src_channels, - unsigned dst_channels) +static LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value, + unsigned src_channels, unsigned dst_channels) { - LLVMTypeRef elemtype; - LLVMValueRef chan[dst_channels]; + LLVMTypeRef elemtype; + LLVMValueRef chan[dst_channels]; - if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) { - unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value)); + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) { + unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value)); - if (src_channels == dst_channels && vec_size == dst_channels) - return value; + if (src_channels == dst_channels && vec_size == dst_channels) + return value; - src_channels = MIN2(src_channels, vec_size); + src_channels = MIN2(src_channels, vec_size); - for (unsigned i = 0; i < src_channels; i++) - chan[i] = ac_llvm_extract_elem(ctx, value, i); + for (unsigned i = 0; i < src_channels; i++) + chan[i] = ac_llvm_extract_elem(ctx, value, i); - elemtype = LLVMGetElementType(LLVMTypeOf(value)); - } else { - if (src_channels) { - assert(src_channels == 1); - chan[0] = value; - } - elemtype = LLVMTypeOf(value); - } + elemtype = LLVMGetElementType(LLVMTypeOf(value)); + } else { + if (src_channels) { + assert(src_channels == 1); + chan[0] = value; + } + elemtype = LLVMTypeOf(value); + } - for (unsigned i = src_channels; i < dst_channels; i++) - chan[i] = LLVMGetUndef(elemtype); + for (unsigned i = src_channels; i < dst_channels; i++) + chan[i] = LLVMGetUndef(elemtype); - return ac_build_gather_values(ctx, chan, dst_channels); + return ac_build_gather_values(ctx, chan, dst_channels); } /* Extract components [start, start + channels) from a vector. */ -LLVMValueRef -ac_extract_components(struct ac_llvm_context *ctx, - LLVMValueRef value, - unsigned start, - unsigned channels) +LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start, + unsigned channels) { - LLVMValueRef chan[channels]; + LLVMValueRef chan[channels]; - for (unsigned i = 0; i < channels; i++) - chan[i] = ac_llvm_extract_elem(ctx, value, i + start); + for (unsigned i = 0; i < channels; i++) + chan[i] = ac_llvm_extract_elem(ctx, value, i + start); - return ac_build_gather_values(ctx, chan, channels); + return ac_build_gather_values(ctx, chan, channels); } /* Expand a scalar or vector to <4 x type> by filling the remaining channels * with undef. Extract at most num_channels components from the input. 
*/ -LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, - LLVMValueRef value, - unsigned num_channels) +LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value, + unsigned num_channels) { - return ac_build_expand(ctx, value, num_channels, 4); + return ac_build_expand(ctx, value, num_channels, 4); } LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value) { - unsigned type_size = ac_get_type_size(LLVMTypeOf(value)); - const char *name; + unsigned type_size = ac_get_type_size(LLVMTypeOf(value)); + const char *name; - if (type_size == 2) - name = "llvm.rint.f16"; - else if (type_size == 4) - name = "llvm.rint.f32"; - else - name = "llvm.rint.f64"; + if (type_size == 2) + name = "llvm.rint.f16"; + else if (type_size == 4) + name = "llvm.rint.f32"; + else + name = "llvm.rint.f64"; - return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, - AC_FUNC_ATTR_READNONE); + return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, AC_FUNC_ATTR_READNONE); } -LLVMValueRef -ac_build_fdiv(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef den) +LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den) { - unsigned type_size = ac_get_type_size(LLVMTypeOf(den)); - const char *name; + unsigned type_size = ac_get_type_size(LLVMTypeOf(den)); + const char *name; - /* For doubles, we need precise division to pass GLCTS. */ - if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && - type_size == 8) - return LLVMBuildFDiv(ctx->builder, num, den, ""); + /* For doubles, we need precise division to pass GLCTS. */ + if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && type_size == 8) + return LLVMBuildFDiv(ctx->builder, num, den, ""); - if (type_size == 2) - name = "llvm.amdgcn.rcp.f16"; - else if (type_size == 4) - name = "llvm.amdgcn.rcp.f32"; - else - name = "llvm.amdgcn.rcp.f64"; + if (type_size == 2) + name = "llvm.amdgcn.rcp.f16"; + else if (type_size == 4) + name = "llvm.amdgcn.rcp.f32"; + else + name = "llvm.amdgcn.rcp.f64"; - LLVMValueRef rcp = ac_build_intrinsic(ctx, name, LLVMTypeOf(den), - &den, 1, AC_FUNC_ATTR_READNONE); + LLVMValueRef rcp = + ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE); - return LLVMBuildFMul(ctx->builder, num, rcp, ""); + return LLVMBuildFMul(ctx->builder, num, rcp, ""); } /* See fast_idiv_by_const.h. */ /* Set: increment = util_fast_udiv_info::increment ? 
multiplier : 0; */ -LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef pre_shift, - LLVMValueRef post_shift, - LLVMValueRef increment) -{ - LLVMBuilderRef builder = ctx->builder; - - num = LLVMBuildLShr(builder, num, pre_shift, ""); - num = LLVMBuildMul(builder, - LLVMBuildZExt(builder, num, ctx->i64, ""), - LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); - num = LLVMBuildAdd(builder, num, - LLVMBuildZExt(builder, increment, ctx->i64, ""), ""); - num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); - num = LLVMBuildTrunc(builder, num, ctx->i32, ""); - return LLVMBuildLShr(builder, num, post_shift, ""); +LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num, + LLVMValueRef multiplier, LLVMValueRef pre_shift, + LLVMValueRef post_shift, LLVMValueRef increment) +{ + LLVMBuilderRef builder = ctx->builder; + + num = LLVMBuildLShr(builder, num, pre_shift, ""); + num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""), + LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); + num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), ""); + num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); + num = LLVMBuildTrunc(builder, num, ctx->i32, ""); + return LLVMBuildLShr(builder, num, post_shift, ""); } /* See fast_idiv_by_const.h. */ /* If num != UINT_MAX, this more efficient version can be used. */ /* Set: increment = util_fast_udiv_info::increment; */ -LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef pre_shift, - LLVMValueRef post_shift, - LLVMValueRef increment) +LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num, + LLVMValueRef multiplier, LLVMValueRef pre_shift, + LLVMValueRef post_shift, LLVMValueRef increment) { - LLVMBuilderRef builder = ctx->builder; + LLVMBuilderRef builder = ctx->builder; - num = LLVMBuildLShr(builder, num, pre_shift, ""); - num = LLVMBuildNUWAdd(builder, num, increment, ""); - num = LLVMBuildMul(builder, - LLVMBuildZExt(builder, num, ctx->i64, ""), - LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); - num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); - num = LLVMBuildTrunc(builder, num, ctx->i32, ""); - return LLVMBuildLShr(builder, num, post_shift, ""); + num = LLVMBuildLShr(builder, num, pre_shift, ""); + num = LLVMBuildNUWAdd(builder, num, increment, ""); + num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""), + LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); + num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); + num = LLVMBuildTrunc(builder, num, ctx->i32, ""); + return LLVMBuildLShr(builder, num, post_shift, ""); } /* See fast_idiv_by_const.h. */ /* Both operands must fit in 31 bits and the divisor must not be 1. 
*/ -LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef post_shift) +LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num, + LLVMValueRef multiplier, LLVMValueRef post_shift) { - LLVMBuilderRef builder = ctx->builder; + LLVMBuilderRef builder = ctx->builder; - num = LLVMBuildMul(builder, - LLVMBuildZExt(builder, num, ctx->i64, ""), - LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); - num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); - num = LLVMBuildTrunc(builder, num, ctx->i32, ""); - return LLVMBuildLShr(builder, num, post_shift, ""); + num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""), + LLVMBuildZExt(builder, multiplier, ctx->i64, ""), ""); + num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), ""); + num = LLVMBuildTrunc(builder, num, ctx->i32, ""); + return LLVMBuildLShr(builder, num, post_shift, ""); } /* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27 @@ -789,26 +708,20 @@ LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, * already multiplied by two. id is the cube face number. */ struct cube_selection_coords { - LLVMValueRef stc[2]; - LLVMValueRef ma; - LLVMValueRef id; + LLVMValueRef stc[2]; + LLVMValueRef ma; + LLVMValueRef id; }; -static void -build_cube_intrinsic(struct ac_llvm_context *ctx, - LLVMValueRef in[3], - struct cube_selection_coords *out) +static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3], + struct cube_selection_coords *out) { - LLVMTypeRef f32 = ctx->f32; + LLVMTypeRef f32 = ctx->f32; - out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", - f32, in, 3, AC_FUNC_ATTR_READNONE); - out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", - f32, in, 3, AC_FUNC_ATTR_READNONE); - out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", - f32, in, 3, AC_FUNC_ATTR_READNONE); - out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", - f32, in, 3, AC_FUNC_ATTR_READNONE); + out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", f32, in, 3, AC_FUNC_ATTR_READNONE); + out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", f32, in, 3, AC_FUNC_ATTR_READNONE); + out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", f32, in, 3, AC_FUNC_ATTR_READNONE); + out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", f32, in, 3, AC_FUNC_ATTR_READNONE); } /** @@ -821,281 +734,250 @@ build_cube_intrinsic(struct ac_llvm_context *ctx, * the selcoords major axis. 
 */
 static void build_cube_select(struct ac_llvm_context *ctx,
-			      const struct cube_selection_coords *selcoords,
-			      const LLVMValueRef *coords,
-			      LLVMValueRef *out_st,
-			      LLVMValueRef *out_ma)
-{
-	LLVMBuilderRef builder = ctx->builder;
-	LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
-	LLVMValueRef is_ma_positive;
-	LLVMValueRef sgn_ma;
-	LLVMValueRef is_ma_z, is_not_ma_z;
-	LLVMValueRef is_ma_y;
-	LLVMValueRef is_ma_x;
-	LLVMValueRef sgn;
-	LLVMValueRef tmp;
-
-	is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
-				       selcoords->ma, LLVMConstReal(f32, 0.0), "");
-	sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
-				 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");
-
-	is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
-	is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
-	is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
-			       LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
-	is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
-
-	/* Select sc */
-	tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
-	sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
-			      LLVMBuildSelect(builder, is_ma_z, sgn_ma,
-					      LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
-	out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
-
-	/* Select tc */
-	tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
-	sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
-			      LLVMConstReal(f32, -1.0), "");
-	out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
-
-	/* Select ma */
-	tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
-			      LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
-	tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
-				 ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
-	*out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
-}
-
-void
-ac_prepare_cube_coords(struct ac_llvm_context *ctx,
-		       bool is_deriv, bool is_array, bool is_lod,
-		       LLVMValueRef *coords_arg,
-		       LLVMValueRef *derivs_arg)
-{
-
-	LLVMBuilderRef builder = ctx->builder;
-	struct cube_selection_coords selcoords;
-	LLVMValueRef coords[3];
-	LLVMValueRef invma;
-
-	if (is_array && !is_lod) {
-		LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
-
-		/* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
-		 *
-		 *    "For Array forms, the array layer used will be
-		 *
-		 *       max(0, min(d−1, floor(layer+0.5)))
-		 *
-		 *     where d is the depth of the texture array and layer
-		 *     comes from the component indicated in the tables below.
-		 *     Workaround for an issue where the layer is taken from a
-		 *     helper invocation which happens to fall on a different
-		 *     layer due to extrapolation."
-		 *
-		 * GFX8 and earlier attempt to implement this in hardware by
-		 * clamping the value of coords[2] = (8 * layer) + face.
-		 * Unfortunately, this means that we end up with the wrong
-		 * face when clamping occurs.
-		 *
-		 * Clamp the layer earlier to work around the issue.
-		 */
-		if (ctx->chip_class <= GFX8) {
-			LLVMValueRef ge0;
-			ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
-			tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
-		}
-
-		coords_arg[3] = tmp;
-	}
-
-	build_cube_intrinsic(ctx, coords_arg, &selcoords);
-
-	invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
-				   ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
-	invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
-
-	for (int i = 0; i < 2; ++i)
-		coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
-
-	coords[2] = selcoords.id;
-
-	if (is_deriv && derivs_arg) {
-		LLVMValueRef derivs[4];
-		int axis;
-
-		/* Convert cube derivatives to 2D derivatives. */
-		for (axis = 0; axis < 2; axis++) {
-			LLVMValueRef deriv_st[2];
-			LLVMValueRef deriv_ma;
-
-			/* Transform the derivative alongside the texture
-			 * coordinate. Mathematically, the correct formula is
-			 * as follows. Assume we're projecting onto the +Z face
-			 * and denote by dx/dh the derivative of the (original)
-			 * X texture coordinate with respect to horizontal
-			 * window coordinates. The projection onto the +Z face
-			 * plane is:
-			 *
-			 *   f(x,z) = x/z
-			 *
-			 * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
-			 *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
-			 *
-			 * This motivates the implementation below.
-			 *
-			 * Whether this actually gives the expected results for
-			 * apps that might feed in derivatives obtained via
-			 * finite differences is anyone's guess. The OpenGL spec
-			 * seems awfully quiet about how textureGrad for cube
-			 * maps should be handled.
-			 */
-			build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
-					  deriv_st, &deriv_ma);
-
-			deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
-
-			for (int i = 0; i < 2; ++i)
-				derivs[axis * 2 + i] =
-					LLVMBuildFSub(builder,
-						      LLVMBuildFMul(builder, deriv_st[i], invma, ""),
-						      LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
-		}
-
-		memcpy(derivs_arg, derivs, sizeof(derivs));
-	}
-
-	/* Shift the texture coordinate. This must be applied after the
-	 * derivative calculation.
-	 */
-	for (int i = 0; i < 2; ++i)
-		coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
-
-	if (is_array) {
-		/* for cube arrays coord.z = coord.w(array_index) * 8 + face */
-		/* coords_arg.w component - array_index for cube arrays */
-		coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
-	}
-
-	memcpy(coords_arg, coords, sizeof(coords));
-}
-
-
-LLVMValueRef
-ac_build_fs_interp(struct ac_llvm_context *ctx,
-		   LLVMValueRef llvm_chan,
-		   LLVMValueRef attr_number,
-		   LLVMValueRef params,
-		   LLVMValueRef i,
-		   LLVMValueRef j)
-{
-	LLVMValueRef args[5];
-	LLVMValueRef p1;
-
-	args[0] = i;
-	args[1] = llvm_chan;
-	args[2] = attr_number;
-	args[3] = params;
-
-	p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
-				ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
-
-	args[0] = p1;
-	args[1] = j;
-	args[2] = llvm_chan;
-	args[3] = attr_number;
-	args[4] = params;
+                              const struct cube_selection_coords *selcoords,
+                              const LLVMValueRef *coords, LLVMValueRef *out_st,
+                              LLVMValueRef *out_ma)
+{
+   LLVMBuilderRef builder = ctx->builder;
+   LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
+   LLVMValueRef is_ma_positive;
+   LLVMValueRef sgn_ma;
+   LLVMValueRef is_ma_z, is_not_ma_z;
+   LLVMValueRef is_ma_y;
+   LLVMValueRef is_ma_x;
+   LLVMValueRef sgn;
+   LLVMValueRef tmp;
+
+   is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
+   sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
+                            LLVMConstReal(f32, -1.0), "");
+
+   is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
+   is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
+   is_ma_y = LLVMBuildAnd(
+      builder, is_not_ma_z,
+      LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
+   is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
+
+   /* Select sc */
+   tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
+   sgn = LLVMBuildSelect(
+      builder, is_ma_y, LLVMConstReal(f32, 1.0),
+      LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
+   out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
+
+   /* Select tc */
+   tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
+   sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
+   out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
+
+   /* Select ma */
+   tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
+                         LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
+   tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
+   *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
+}
+
+void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
+                            LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
+{
+
+   LLVMBuilderRef builder = ctx->builder;
+   struct cube_selection_coords selcoords;
+   LLVMValueRef coords[3];
+   LLVMValueRef invma;
+
+   if (is_array && !is_lod) {
+      LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
+
+      /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
+       *
+       *    "For Array forms, the array layer used will be
+       *
+       *       max(0, min(d−1, floor(layer+0.5)))
+       *
+       *     where d is the depth of the texture array and layer
+       *     comes from the component indicated in the tables below.
+       *     Workaround for an issue where the layer is taken from a
+       *     helper invocation which happens to fall on a different
+       *     layer due to extrapolation."
+       *
+       * GFX8 and earlier attempt to implement this in hardware by
+       * clamping the value of coords[2] = (8 * layer) + face.
+       * Unfortunately, this means that we end up with the wrong
+       * face when clamping occurs.
+       *
+       * Clamp the layer earlier to work around the issue.
+       */
+      if (ctx->chip_class <= GFX8) {
+         LLVMValueRef ge0;
+         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
+         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
+      }
+
+      coords_arg[3] = tmp;
+   }
+
+   build_cube_intrinsic(ctx, coords_arg, &selcoords);
+
+   invma =
+      ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
+   invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
+
+   for (int i = 0; i < 2; ++i)
+      coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
+
+   coords[2] = selcoords.id;
+
+   if (is_deriv && derivs_arg) {
+      LLVMValueRef derivs[4];
+      int axis;
+
+      /* Convert cube derivatives to 2D derivatives. */
+      for (axis = 0; axis < 2; axis++) {
+         LLVMValueRef deriv_st[2];
+         LLVMValueRef deriv_ma;
+
+         /* Transform the derivative alongside the texture
+          * coordinate. Mathematically, the correct formula is
+          * as follows. Assume we're projecting onto the +Z face
+          * and denote by dx/dh the derivative of the (original)
+          * X texture coordinate with respect to horizontal
+          * window coordinates. The projection onto the +Z face
+          * plane is:
+          *
+          *   f(x,z) = x/z
+          *
+          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
+          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
+          *
+          * This motivates the implementation below.
+          *
+          * Whether this actually gives the expected results for
+          * apps that might feed in derivatives obtained via
+          * finite differences is anyone's guess. The OpenGL spec
+          * seems awfully quiet about how textureGrad for cube
+          * maps should be handled.
+          */
+         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);
+
+         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
+
+         for (int i = 0; i < 2; ++i)
+            derivs[axis * 2 + i] =
+               LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
+                             LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
+      }
+
+      memcpy(derivs_arg, derivs, sizeof(derivs));
+   }
+
+   /* Shift the texture coordinate. This must be applied after the
+    * derivative calculation.
+ */ + for (int i = 0; i < 2; ++i) + coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), ""); + + if (is_array) { + /* for cube arrays coord.z = coord.w(array_index) * 8 + face */ + /* coords_arg.w component - array_index for cube arrays */ + coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]); + } - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + memcpy(coords_arg, coords, sizeof(coords)); } -LLVMValueRef -ac_build_fs_interp_f16(struct ac_llvm_context *ctx, - LLVMValueRef llvm_chan, - LLVMValueRef attr_number, - LLVMValueRef params, - LLVMValueRef i, - LLVMValueRef j) +LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan, + LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, + LLVMValueRef j) { - LLVMValueRef args[6]; - LLVMValueRef p1; - - args[0] = i; - args[1] = llvm_chan; - args[2] = attr_number; - args[3] = ctx->i1false; - args[4] = params; - - p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); - - args[0] = p1; - args[1] = j; - args[2] = llvm_chan; - args[3] = attr_number; - args[4] = ctx->i1false; - args[5] = params; + LLVMValueRef args[5]; + LLVMValueRef p1; + + args[0] = i; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = params; + + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", - ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); + args[0] = p1; + args[1] = j; + args[2] = llvm_chan; + args[3] = attr_number; + args[4] = params; + + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, args, 5, + AC_FUNC_ATTR_READNONE); } -LLVMValueRef -ac_build_fs_interp_mov(struct ac_llvm_context *ctx, - LLVMValueRef parameter, - LLVMValueRef llvm_chan, - LLVMValueRef attr_number, - LLVMValueRef params) +LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan, + LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, + LLVMValueRef j) { - LLVMValueRef args[4]; + LLVMValueRef args[6]; + LLVMValueRef p1; + + args[0] = i; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = ctx->i1false; + args[4] = params; + + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5, + AC_FUNC_ATTR_READNONE); - args[0] = parameter; - args[1] = llvm_chan; - args[2] = attr_number; - args[3] = params; + args[0] = p1; + args[1] = j; + args[2] = llvm_chan; + args[3] = attr_number; + args[4] = ctx->i1false; + args[5] = params; - return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", - ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6, + AC_FUNC_ATTR_READNONE); } -LLVMValueRef -ac_build_gep_ptr(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, - LLVMValueRef index) +LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter, + LLVMValueRef llvm_chan, LLVMValueRef attr_number, + LLVMValueRef params) { - return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); + LLVMValueRef args[4]; + + args[0] = parameter; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = params; + + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4, + AC_FUNC_ATTR_READNONE); } -LLVMValueRef -ac_build_gep0(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, - LLVMValueRef index) +LLVMValueRef ac_build_gep_ptr(struct 
ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index) { - LLVMValueRef indices[2] = { - ctx->i32_0, - index, - }; - return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, ""); + return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); } -LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMValueRef index) +LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index) { - return LLVMBuildPointerCast(ctx->builder, - LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""), - LLVMTypeOf(ptr), ""); + LLVMValueRef indices[2] = { + ctx->i32_0, + index, + }; + return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, ""); } -void -ac_build_indexed_store(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index, - LLVMValueRef value) +LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index) { - LLVMBuildStore(ctx->builder, value, - ac_build_gep0(ctx, base_ptr, index)); + return LLVMBuildPointerCast(ctx->builder, LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""), + LLVMTypeOf(ptr), ""); +} + +void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index, + LLVMValueRef value) +{ + LLVMBuildStore(ctx->builder, value, ac_build_gep0(ctx, base_ptr, index)); } /** @@ -1126,425 +1008,328 @@ ac_build_indexed_store(struct ac_llvm_context *ctx, * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize); * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds */ -static LLVMValueRef -ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, - LLVMValueRef index, bool uniform, bool invariant, - bool no_unsigned_wraparound) +static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index, bool uniform, bool invariant, + bool no_unsigned_wraparound) { - LLVMValueRef pointer, result; + LLVMValueRef pointer, result; - if (no_unsigned_wraparound && - LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT) - pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, ""); - else - pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); + if (no_unsigned_wraparound && + LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT) + pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, ""); + else + pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); - if (uniform) - LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md); - result = LLVMBuildLoad(ctx->builder, pointer, ""); - if (invariant) - LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md); - return result; + if (uniform) + LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md); + result = LLVMBuildLoad(ctx->builder, pointer, ""); + if (invariant) + LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md); + return result; } -LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, - LLVMValueRef index) +LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index) { - return ac_build_load_custom(ctx, base_ptr, index, false, false, false); + return ac_build_load_custom(ctx, base_ptr, index, false, false, false); } -LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index) +LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index) { - return 
ac_build_load_custom(ctx, base_ptr, index, false, true, false); + return ac_build_load_custom(ctx, base_ptr, index, false, true, false); } /* This assumes that there is no unsigned integer wraparound during the address * computation, excluding all GEPs within base_ptr. */ -LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index) +LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index) { - return ac_build_load_custom(ctx, base_ptr, index, true, true, true); + return ac_build_load_custom(ctx, base_ptr, index, true, true, true); } /* See ac_build_load_custom() documentation. */ LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index) + LLVMValueRef base_ptr, LLVMValueRef index) { - return ac_build_load_custom(ctx, base_ptr, index, true, true, false); + return ac_build_load_custom(ctx, base_ptr, index, true, true, false); } -static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, - unsigned cache_policy) +static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy) { - return cache_policy | - (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0); + return cache_policy | (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0); } -static void -ac_build_buffer_store_common(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef data, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy, - bool use_format, - bool structurized) +static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef data, LLVMValueRef vindex, + LLVMValueRef voffset, LLVMValueRef soffset, + unsigned cache_policy, bool use_format, bool structurized) { - LLVMValueRef args[6]; - int idx = 0; - args[idx++] = data; - args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); - if (structurized) - args[idx++] = vindex ? vindex : ctx->i32_0; - args[idx++] = voffset ? voffset : ctx->i32_0; - args[idx++] = soffset ? soffset : ctx->i32_0; - args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0); - const char *indexing_kind = structurized ? "struct" : "raw"; - char name[256], type_name[8]; + LLVMValueRef args[6]; + int idx = 0; + args[idx++] = data; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0); + const char *indexing_kind = structurized ? 
"struct" : "raw"; + char name[256], type_name[8]; - ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name)); + ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name)); - if (use_format) { - snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", - indexing_kind, type_name); - } else { - snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", - indexing_kind, type_name); - } + if (use_format) { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind, + type_name); + } else { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name); + } - ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, - AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); + ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); } -void -ac_build_buffer_store_format(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef data, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned cache_policy) +void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data, + LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy) { - ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, - cache_policy, true, true); + ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true, true); } /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4. * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2), * or v4i32 (num_channels=3,4). */ -void -ac_build_buffer_store_dword(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - unsigned num_channels, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned inst_offset, - unsigned cache_policy) -{ - /* Split 3 channel stores, because only LLVM 9+ support 3-channel - * intrinsics. */ - if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) { - LLVMValueRef v[3], v01; - - for (int i = 0; i < 3; i++) { - v[i] = LLVMBuildExtractElement(ctx->builder, vdata, - LLVMConstInt(ctx->i32, i, 0), ""); - } - v01 = ac_build_gather_values(ctx, v, 2); - - ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, - soffset, inst_offset, cache_policy); - ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, - soffset, inst_offset + 8, - cache_policy); - return; - } - - /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset - * (voffset is swizzled, but soffset isn't swizzled). - * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter. 
- */ - if (!(cache_policy & ac_swizzled)) { - LLVMValueRef offset = soffset; - - if (inst_offset) - offset = LLVMBuildAdd(ctx->builder, offset, - LLVMConstInt(ctx->i32, inst_offset, 0), ""); - - ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), - ctx->i32_0, voffset, offset, - cache_policy, false, false); - return; - } - - static const unsigned dfmts[] = { - V_008F0C_BUF_DATA_FORMAT_32, - V_008F0C_BUF_DATA_FORMAT_32_32, - V_008F0C_BUF_DATA_FORMAT_32_32_32, - V_008F0C_BUF_DATA_FORMAT_32_32_32_32 - }; - unsigned dfmt = dfmts[num_channels - 1]; - unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; - LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0); - - ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, - immoffset, num_channels, dfmt, nfmt, cache_policy); -} - -static LLVMValueRef -ac_build_buffer_load_common(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned num_channels, - LLVMTypeRef channel_type, - unsigned cache_policy, - bool can_speculate, - bool use_format, - bool structurized) -{ - LLVMValueRef args[5]; - int idx = 0; - args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); - if (structurized) - args[idx++] = vindex ? vindex : ctx->i32_0; - args[idx++] = voffset ? voffset : ctx->i32_0; - args[idx++] = soffset ? soffset : ctx->i32_0; - args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); - unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels; - const char *indexing_kind = structurized ? "struct" : "raw"; - char name[256], type_name[8]; - - /* D16 is only supported on gfx8+ */ - assert(!use_format || - (channel_type != ctx->f16 && channel_type != ctx->i16) || - ctx->chip_class >= GFX8); - - LLVMTypeRef type = func > 1 ? 
LLVMVectorType(channel_type, func) : channel_type; - ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); - - if (use_format) { - snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", - indexing_kind, type_name); - } else { - snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", - indexing_kind, type_name); - } - - return ac_build_intrinsic(ctx, name, type, args, idx, - ac_get_load_intr_attribs(can_speculate)); -} - -LLVMValueRef -ac_build_buffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - int num_channels, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned inst_offset, - unsigned cache_policy, - bool can_speculate, - bool allow_smem) -{ - LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0); - if (voffset) - offset = LLVMBuildAdd(ctx->builder, offset, voffset, ""); - if (soffset) - offset = LLVMBuildAdd(ctx->builder, offset, soffset, ""); - - if (allow_smem && !(cache_policy & ac_slc) && - (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) { - assert(vindex == NULL); - - LLVMValueRef result[8]; - - for (int i = 0; i < num_channels; i++) { - if (i) { - offset = LLVMBuildAdd(ctx->builder, offset, - LLVMConstInt(ctx->i32, 4, 0), ""); - } - LLVMValueRef args[3] = { - rsrc, - offset, - LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0), - }; - result[i] = ac_build_intrinsic(ctx, - "llvm.amdgcn.s.buffer.load.f32", - ctx->f32, args, 3, - AC_FUNC_ATTR_READNONE); - } - if (num_channels == 1) - return result[0]; - - if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) - result[num_channels++] = LLVMGetUndef(ctx->f32); - return ac_build_gather_values(ctx, result, num_channels); - } - - return ac_build_buffer_load_common(ctx, rsrc, vindex, - offset, ctx->i32_0, - num_channels, ctx->f32, - cache_policy, - can_speculate, false, false); -} - -LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned num_channels, - unsigned cache_policy, - bool can_speculate, - bool d16) -{ - return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, - ctx->i32_0, num_channels, - d16 ? ctx->f16 : ctx->f32, - cache_policy, can_speculate, - true, true); -} - -static LLVMValueRef -ac_build_tbuffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool can_speculate, - bool structurized) -{ - voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); - - LLVMValueRef args[6]; - int idx = 0; - args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); - if (structurized) - args[idx++] = vindex ? vindex : ctx->i32_0; - args[idx++] = voffset ? voffset : ctx->i32_0; - args[idx++] = soffset ? soffset : ctx->i32_0; - args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0); - args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); - unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels; - const char *indexing_kind = structurized ? "struct" : "raw"; - char name[256], type_name[8]; - - LLVMTypeRef type = func > 1 ? 
LLVMVectorType(ctx->i32, func) : ctx->i32; - ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); - - snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", - indexing_kind, type_name); - - return ac_build_intrinsic(ctx, name, type, args, idx, - ac_get_load_intr_attribs(can_speculate)); -} - -LLVMValueRef -ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool can_speculate) -{ - return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, - immoffset, num_channels, dfmt, nfmt, - cache_policy, can_speculate, true); -} - -LLVMValueRef -ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool can_speculate) -{ - return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset, - immoffset, num_channels, dfmt, nfmt, - cache_policy, can_speculate, false); -} - -LLVMValueRef -ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned cache_policy) -{ - LLVMValueRef res; - - if (LLVM_VERSION_MAJOR >= 9) { - voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); - - /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ - res = ac_build_buffer_load_common(ctx, rsrc, NULL, - voffset, soffset, - 1, ctx->i16, cache_policy, - false, false, false); - } else { - unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; - unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; - - res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, - immoffset, 1, dfmt, nfmt, cache_policy, - false); - - res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, ""); - } - - return res; -} - -LLVMValueRef -ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned cache_policy) -{ - LLVMValueRef res; - - if (LLVM_VERSION_MAJOR >= 9) { - voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); - - /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ - res = ac_build_buffer_load_common(ctx, rsrc, NULL, - voffset, soffset, - 1, ctx->i8, cache_policy, - false, false, false); - } else { - unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; - unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; +void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, + unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset, + unsigned inst_offset, unsigned cache_policy) +{ + /* Split 3 channel stores, because only LLVM 9+ support 3-channel + * intrinsics. */ + if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) { + LLVMValueRef v[3], v01; + + for (int i = 0; i < 3; i++) { + v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), ""); + } + v01 = ac_build_gather_values(ctx, v, 2); + + ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, soffset, inst_offset, cache_policy); + ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8, + cache_policy); + return; + } + + /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset + * (voffset is swizzled, but soffset isn't swizzled). 
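+    * Keeping the two offsets in separate operands is why the tbuffer
+    * path below is taken for swizzled stores: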
+ * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter. + */ + if (!(cache_policy & ac_swizzled)) { + LLVMValueRef offset = soffset; + + if (inst_offset) + offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, inst_offset, 0), ""); + + ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), ctx->i32_0, voffset, offset, + cache_policy, false, false); + return; + } + + static const unsigned dfmts[] = {V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32, + V_008F0C_BUF_DATA_FORMAT_32_32_32, + V_008F0C_BUF_DATA_FORMAT_32_32_32_32}; + unsigned dfmt = dfmts[num_channels - 1]; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0); + + ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, immoffset, num_channels, dfmt, + nfmt, cache_policy); +} + +static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef vindex, LLVMValueRef voffset, + LLVMValueRef soffset, unsigned num_channels, + LLVMTypeRef channel_type, unsigned cache_policy, + bool can_speculate, bool use_format, + bool structurized) +{ + LLVMValueRef args[5]; + int idx = 0; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); + unsigned func = + !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels; + const char *indexing_kind = structurized ? "struct" : "raw"; + char name[256], type_name[8]; + + /* D16 is only supported on gfx8+ */ + assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) || + ctx->chip_class >= GFX8); + + LLVMTypeRef type = func > 1 ? 
LLVMVectorType(channel_type, func) : channel_type; + ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + + if (use_format) { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind, + type_name); + } else { + snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name); + } + + return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate)); +} + +LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels, + LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, + unsigned inst_offset, unsigned cache_policy, bool can_speculate, + bool allow_smem) +{ + LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0); + if (voffset) + offset = LLVMBuildAdd(ctx->builder, offset, voffset, ""); + if (soffset) + offset = LLVMBuildAdd(ctx->builder, offset, soffset, ""); + + if (allow_smem && !(cache_policy & ac_slc) && + (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) { + assert(vindex == NULL); + + LLVMValueRef result[8]; + + for (int i = 0; i < num_channels; i++) { + if (i) { + offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), ""); + } + LLVMValueRef args[3] = { + rsrc, + offset, + LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0), + }; + result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3, + AC_FUNC_ATTR_READNONE); + } + if (num_channels == 1) + return result[0]; + + if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) + result[num_channels++] = LLVMGetUndef(ctx->f32); + return ac_build_gather_values(ctx, result, num_channels); + } + + return ac_build_buffer_load_common(ctx, rsrc, vindex, offset, ctx->i32_0, num_channels, ctx->f32, + cache_policy, can_speculate, false, false); +} + +LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef vindex, LLVMValueRef voffset, + unsigned num_channels, unsigned cache_policy, + bool can_speculate, bool d16) +{ + return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels, + d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true, + true); +} + +static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef vindex, LLVMValueRef voffset, + LLVMValueRef soffset, LLVMValueRef immoffset, + unsigned num_channels, unsigned dfmt, unsigned nfmt, + unsigned cache_policy, bool can_speculate, + bool structurized) +{ + voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); + + LLVMValueRef args[6]; + int idx = 0; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0); + args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0); + unsigned func = + !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels; + const char *indexing_kind = structurized ? "struct" : "raw"; + char name[256], type_name[8]; + + LLVMTypeRef type = func > 1 ? 
LLVMVectorType(ctx->i32, func) : ctx->i32; + ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + + snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name); + + return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate)); +} - res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, - immoffset, 1, dfmt, nfmt, cache_policy, - false); +LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef vindex, LLVMValueRef voffset, + LLVMValueRef soffset, LLVMValueRef immoffset, + unsigned num_channels, unsigned dfmt, unsigned nfmt, + unsigned cache_policy, bool can_speculate) +{ + return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, immoffset, num_channels, dfmt, + nfmt, cache_policy, can_speculate, true); +} + +LLVMValueRef ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef voffset, LLVMValueRef soffset, + LLVMValueRef immoffset, unsigned num_channels, unsigned dfmt, + unsigned nfmt, unsigned cache_policy, bool can_speculate) +{ + return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset, immoffset, num_channels, dfmt, + nfmt, cache_policy, can_speculate, false); +} + +LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef voffset, LLVMValueRef soffset, + LLVMValueRef immoffset, unsigned cache_policy) +{ + LLVMValueRef res; + + if (LLVM_VERSION_MAJOR >= 9) { + voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); + + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ + res = ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16, + cache_policy, false, false, false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1, dfmt, nfmt, + cache_policy, false); + + res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, ""); + } + + return res; +} + +LLVMValueRef ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef voffset, LLVMValueRef soffset, + LLVMValueRef immoffset, unsigned cache_policy) +{ + LLVMValueRef res; + + if (LLVM_VERSION_MAJOR >= 9) { + voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, ""); + + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ + res = ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, cache_policy, + false, false, false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1, dfmt, nfmt, + cache_policy, false); - res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, ""); - } - - return res; + res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, ""); + } + + return res; } /** @@ -1553,62 +1338,63 @@ ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, * The input exponent is expected to be biased analogous to IEEE-754, i.e. by * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs). 
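 *
 * Worked example (illustrative values, not taken from this change): with
 * exp_bits=5 and mant_bits=5, the 10-bit unsigned float used for the third
 * channel of the 10_11_11 path below, src = 0x1E0 carries a biased exponent
 * of 15 and a mantissa of 0, so it decodes to 2^(15-15) * 1.0 = 1.0f.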
*/ -static LLVMValueRef -ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned exp_bits, unsigned mant_bits) +static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned exp_bits, unsigned mant_bits) { - assert(LLVMTypeOf(src) == ctx->i32); + assert(LLVMTypeOf(src) == ctx->i32); - LLVMValueRef tmp; - LLVMValueRef mantissa; - mantissa = LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), ""); + LLVMValueRef tmp; + LLVMValueRef mantissa; + mantissa = + LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), ""); - /* Converting normal numbers is just a shift + correcting the exponent bias */ - unsigned normal_shift = 23 - mant_bits; - unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1); - LLVMValueRef shifted, normal; + /* Converting normal numbers is just a shift + correcting the exponent bias */ + unsigned normal_shift = 23 - mant_bits; + unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1); + LLVMValueRef shifted, normal; - shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), ""); - normal = LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), ""); + shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), ""); + normal = + LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), ""); - /* Converting nan/inf numbers is the same, but with a different exponent update */ - LLVMValueRef naninf; - naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), ""); + /* Converting nan/inf numbers is the same, but with a different exponent update */ + LLVMValueRef naninf; + naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), ""); - /* Converting denormals is the complex case: determine the leading zeros of the - * mantissa to obtain the correct shift for the mantissa and exponent correction. - */ - LLVMValueRef denormal; - LLVMValueRef params[2] = { - mantissa, - ctx->i1true, /* result can be undef when arg is 0 */ - }; - LLVMValueRef ctlz = ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, - params, 2, AC_FUNC_ATTR_READNONE); + /* Converting denormals is the complex case: determine the leading zeros of the + * mantissa to obtain the correct shift for the mantissa and exponent correction. + */ + LLVMValueRef denormal; + LLVMValueRef params[2] = { + mantissa, ctx->i1true, /* result can be undef when arg is 0 */ + }; + LLVMValueRef ctlz = + ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE); - /* Shift such that the leading 1 ends up as the LSB of the exponent field. */ - tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), ""); - denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, ""); + /* Shift such that the leading 1 ends up as the LSB of the exponent field. 
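+    * (The leading 1 of the mantissa sits at bit 31 - ctlz, so shifting
+    * left by ctlz - 8 moves it to bit 23.)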
*/ + tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), ""); + denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, ""); - unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1; - tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, ""); - tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), ""); - denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, ""); + unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1; + tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, ""); + tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), ""); + denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, ""); - /* Select the final result. */ - LLVMValueRef result; + /* Select the final result. */ + LLVMValueRef result; - tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, - LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), ""); - result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, ""); + tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, + LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), ""); + result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, ""); - tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, - LLVMConstInt(ctx->i32, 1 << mant_bits, false), ""); - result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, ""); + tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, LLVMConstInt(ctx->i32, 1 << mant_bits, false), + ""); + result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, ""); - tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, ""); - result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, ""); + tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, ""); + result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, ""); - return ac_to_float(ctx, result); + return ac_to_float(ctx, result); } /** @@ -1629,354 +1415,305 @@ ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned exp_bits * \param rsrc buffer resource descriptor * \return the resulting vector of floats or integers bitcast to <4 x i32> */ -LLVMValueRef -ac_build_opencoded_load_format(struct ac_llvm_context *ctx, - unsigned log_size, - unsigned num_channels, - unsigned format, - bool reverse, - bool known_aligned, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy, - bool can_speculate) -{ - LLVMValueRef tmp; - unsigned load_log_size = log_size; - unsigned load_num_channels = num_channels; - if (log_size == 3) { - load_log_size = 2; - if (format == AC_FETCH_FORMAT_FLOAT) { - load_num_channels = 2 * num_channels; - } else { - load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */ - } - } - - int log_recombine = 0; - if ((ctx->chip_class == GFX6 || ctx->chip_class >= GFX10) && !known_aligned) { - /* Avoid alignment restrictions by loading one byte at a time. */ - load_num_channels <<= load_log_size; - log_recombine = load_log_size; - load_log_size = 0; - } else if (load_num_channels == 2 || load_num_channels == 4) { - log_recombine = -util_logbase2(load_num_channels); - load_num_channels = 1; - load_log_size += -log_recombine; - } - - assert(load_log_size >= 2 || LLVM_VERSION_MAJOR >= 9); - - LLVMValueRef loads[32]; /* up to 32 bytes */ - for (unsigned i = 0; i < load_num_channels; ++i) { - tmp = LLVMBuildAdd(ctx->builder, soffset, - LLVMConstInt(ctx->i32, i << load_log_size, false), ""); - LLVMTypeRef channel_type = load_log_size == 0 ? 
ctx->i8 : - load_log_size == 1 ? ctx->i16 : ctx->i32; - unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2); - loads[i] = ac_build_buffer_load_common( - ctx, rsrc, vindex, voffset, tmp, - num_channels, channel_type, cache_policy, - can_speculate, false, true); - if (load_log_size >= 2) - loads[i] = ac_to_integer(ctx, loads[i]); - } - - if (log_recombine > 0) { - /* Recombine bytes if necessary (GFX6 only) */ - LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16; - - for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) { - LLVMValueRef accum = NULL; - for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) { - tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, ""); - if (i == 0) { - accum = tmp; - } else { - tmp = LLVMBuildShl(ctx->builder, tmp, - LLVMConstInt(dst_type, 8 * i, false), ""); - accum = LLVMBuildOr(ctx->builder, accum, tmp, ""); - } - } - loads[dst] = accum; - } - } else if (log_recombine < 0) { - /* Split vectors of dwords */ - if (load_log_size > 2) { - assert(load_num_channels == 1); - LLVMValueRef loaded = loads[0]; - unsigned log_split = load_log_size - 2; - log_recombine += log_split; - load_num_channels = 1 << log_split; - load_log_size = 2; - for (unsigned i = 0; i < load_num_channels; ++i) { - tmp = LLVMConstInt(ctx->i32, i, false); - loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, ""); - } - } - - /* Further split dwords and shorts if required */ - if (log_recombine < 0) { - for (unsigned src = load_num_channels, - dst = load_num_channels << -log_recombine; - src > 0; --src) { - unsigned dst_bits = 1 << (3 + load_log_size + log_recombine); - LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits); - LLVMValueRef loaded = loads[src - 1]; - LLVMTypeRef loaded_type = LLVMTypeOf(loaded); - for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) { - tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false); - tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, ""); - loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, ""); - } - } - } - } - - if (log_size == 3) { - if (format == AC_FETCH_FORMAT_FLOAT) { - for (unsigned i = 0; i < num_channels; ++i) { - tmp = ac_build_gather_values(ctx, &loads[2 * i], 2); - loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, ""); - } - } else if (format == AC_FETCH_FORMAT_FIXED) { - /* 10_11_11_FLOAT */ - LLVMValueRef data = loads[0]; - LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false); - LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, ""); - tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), ""); - LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, ""); - LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), ""); - - loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6)); - loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6)); - loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5)); - - num_channels = 3; - log_size = 2; - format = AC_FETCH_FORMAT_FLOAT; - } else { - /* 2_10_10_10 data formats */ - LLVMValueRef data = loads[0]; - LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10); - LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2); - loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, ""); - tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), ""); - loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, ""); - tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), ""); - loads[2] = LLVMBuildTrunc(ctx->builder, 
tmp, i10, ""); - tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), ""); - loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, ""); - - num_channels = 4; - } - } - - if (format == AC_FETCH_FORMAT_FLOAT) { - if (log_size != 2) { - for (unsigned chan = 0; chan < num_channels; ++chan) { - tmp = ac_to_float(ctx, loads[chan]); - if (log_size == 3) - tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, ""); - else if (log_size == 1) - tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, ""); - loads[chan] = ac_to_integer(ctx, tmp); - } - } - } else if (format == AC_FETCH_FORMAT_UINT) { - if (log_size != 2) { - for (unsigned chan = 0; chan < num_channels; ++chan) - loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, ""); - } - } else if (format == AC_FETCH_FORMAT_SINT) { - if (log_size != 2) { - for (unsigned chan = 0; chan < num_channels; ++chan) - loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, ""); - } - } else { - bool unsign = format == AC_FETCH_FORMAT_UNORM || - format == AC_FETCH_FORMAT_USCALED || - format == AC_FETCH_FORMAT_UINT; - - for (unsigned chan = 0; chan < num_channels; ++chan) { - if (unsign) { - tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, ""); - } else { - tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, ""); - } - - LLVMValueRef scale = NULL; - if (format == AC_FETCH_FORMAT_FIXED) { - assert(log_size == 2); - scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000); - } else if (format == AC_FETCH_FORMAT_UNORM) { - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); - scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1)); - } else if (format == AC_FETCH_FORMAT_SNORM) { - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); - scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1)); - } - if (scale) - tmp = LLVMBuildFMul(ctx->builder, tmp, scale, ""); - - if (format == AC_FETCH_FORMAT_SNORM) { - /* Clamp to [-1, 1] */ - LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); - LLVMValueRef clamp = - LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, ""); - tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, ""); - } - - loads[chan] = ac_to_integer(ctx, tmp); - } - } - - while (num_channels < 4) { - if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) { - loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0; - } else { - loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0); - } - num_channels++; - } - - if (reverse) { - tmp = loads[0]; - loads[0] = loads[2]; - loads[2] = tmp; - } - - return ac_build_gather_values(ctx, loads, 4); -} - -static void -ac_build_tbuffer_store(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool structurized) -{ - voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, - immoffset, ""); - - LLVMValueRef args[7]; - int idx = 0; - args[idx++] = vdata; - args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); - if (structurized) - args[idx++] = vindex ? vindex : ctx->i32_0; - args[idx++] = voffset ? voffset : ctx->i32_0; - args[idx++] = soffset ? 
soffset : ctx->i32_0; - args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0); - args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0); - unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels; - const char *indexing_kind = structurized ? "struct" : "raw"; - char name[256], type_name[8]; - - LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32; - ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); - - snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", - indexing_kind, type_name); - - ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, - AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); -} - -void -ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy) -{ - ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, - immoffset, num_channels, dfmt, nfmt, cache_policy, - true); -} - -void -ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy) -{ - ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, - immoffset, num_channels, dfmt, nfmt, cache_policy, - false); -} - -void -ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy) -{ - vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, ""); - - if (LLVM_VERSION_MAJOR >= 9) { - /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ - ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, - voffset, soffset, cache_policy, - false, false); - } else { - unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; - unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; - - vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); - - ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, - ctx->i32_0, 1, dfmt, nfmt, cache_policy); - } -} - -void -ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy) -{ - vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, ""); - - if (LLVM_VERSION_MAJOR >= 9) { - /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. 
*/ - ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, - voffset, soffset, cache_policy, - false, false); - } else { - unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; - unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; - - vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); - - ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, - ctx->i32_0, 1, dfmt, nfmt, cache_policy); - } +LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size, + unsigned num_channels, unsigned format, bool reverse, + bool known_aligned, LLVMValueRef rsrc, + LLVMValueRef vindex, LLVMValueRef voffset, + LLVMValueRef soffset, unsigned cache_policy, + bool can_speculate) +{ + LLVMValueRef tmp; + unsigned load_log_size = log_size; + unsigned load_num_channels = num_channels; + if (log_size == 3) { + load_log_size = 2; + if (format == AC_FETCH_FORMAT_FLOAT) { + load_num_channels = 2 * num_channels; + } else { + load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */ + } + } + + int log_recombine = 0; + if ((ctx->chip_class == GFX6 || ctx->chip_class >= GFX10) && !known_aligned) { + /* Avoid alignment restrictions by loading one byte at a time. */ + load_num_channels <<= load_log_size; + log_recombine = load_log_size; + load_log_size = 0; + } else if (load_num_channels == 2 || load_num_channels == 4) { + log_recombine = -util_logbase2(load_num_channels); + load_num_channels = 1; + load_log_size += -log_recombine; + } + + assert(load_log_size >= 2 || LLVM_VERSION_MAJOR >= 9); + + LLVMValueRef loads[32]; /* up to 32 bytes */ + for (unsigned i = 0; i < load_num_channels; ++i) { + tmp = + LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), ""); + LLVMTypeRef channel_type = + load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32; + unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2); + loads[i] = + ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type, + cache_policy, can_speculate, false, true); + if (load_log_size >= 2) + loads[i] = ac_to_integer(ctx, loads[i]); + } + + if (log_recombine > 0) { + /* Recombine bytes if necessary (GFX6 only) */ + LLVMTypeRef dst_type = log_recombine == 2 ? 
ctx->i32 : ctx->i16; + + for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) { + LLVMValueRef accum = NULL; + for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) { + tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, ""); + if (i == 0) { + accum = tmp; + } else { + tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), ""); + accum = LLVMBuildOr(ctx->builder, accum, tmp, ""); + } + } + loads[dst] = accum; + } + } else if (log_recombine < 0) { + /* Split vectors of dwords */ + if (load_log_size > 2) { + assert(load_num_channels == 1); + LLVMValueRef loaded = loads[0]; + unsigned log_split = load_log_size - 2; + log_recombine += log_split; + load_num_channels = 1 << log_split; + load_log_size = 2; + for (unsigned i = 0; i < load_num_channels; ++i) { + tmp = LLVMConstInt(ctx->i32, i, false); + loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, ""); + } + } + + /* Further split dwords and shorts if required */ + if (log_recombine < 0) { + for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0; + --src) { + unsigned dst_bits = 1 << (3 + load_log_size + log_recombine); + LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits); + LLVMValueRef loaded = loads[src - 1]; + LLVMTypeRef loaded_type = LLVMTypeOf(loaded); + for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) { + tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false); + tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, ""); + loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, ""); + } + } + } + } + + if (log_size == 3) { + if (format == AC_FETCH_FORMAT_FLOAT) { + for (unsigned i = 0; i < num_channels; ++i) { + tmp = ac_build_gather_values(ctx, &loads[2 * i], 2); + loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, ""); + } + } else if (format == AC_FETCH_FORMAT_FIXED) { + /* 10_11_11_FLOAT */ + LLVMValueRef data = loads[0]; + LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false); + LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), ""); + LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, ""); + LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), ""); + + loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6)); + loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6)); + loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5)); + + num_channels = 3; + log_size = 2; + format = AC_FETCH_FORMAT_FLOAT; + } else { + /* 2_10_10_10 data formats */ + LLVMValueRef data = loads[0]; + LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10); + LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2); + loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), ""); + loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), ""); + loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, ""); + tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), ""); + loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, ""); + + num_channels = 4; + } + } + + if (format == AC_FETCH_FORMAT_FLOAT) { + if (log_size != 2) { + for (unsigned chan = 0; chan < num_channels; ++chan) { + tmp = ac_to_float(ctx, loads[chan]); + if (log_size == 3) + tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, ""); + else if (log_size == 1) + tmp = 
LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, ""); + loads[chan] = ac_to_integer(ctx, tmp); + } + } + } else if (format == AC_FETCH_FORMAT_UINT) { + if (log_size != 2) { + for (unsigned chan = 0; chan < num_channels; ++chan) + loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, ""); + } + } else if (format == AC_FETCH_FORMAT_SINT) { + if (log_size != 2) { + for (unsigned chan = 0; chan < num_channels; ++chan) + loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, ""); + } + } else { + bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED || + format == AC_FETCH_FORMAT_UINT; + + for (unsigned chan = 0; chan < num_channels; ++chan) { + if (unsign) { + tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, ""); + } else { + tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, ""); + } + + LLVMValueRef scale = NULL; + if (format == AC_FETCH_FORMAT_FIXED) { + assert(log_size == 2); + scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000); + } else if (format == AC_FETCH_FORMAT_UNORM) { + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); + scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1)); + } else if (format == AC_FETCH_FORMAT_SNORM) { + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan])); + scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1)); + } + if (scale) + tmp = LLVMBuildFMul(ctx->builder, tmp, scale, ""); + + if (format == AC_FETCH_FORMAT_SNORM) { + /* Clamp to [-1, 1] */ + LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); + LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, ""); + tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, ""); + } + + loads[chan] = ac_to_integer(ctx, tmp); + } + } + + while (num_channels < 4) { + if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) { + loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0; + } else { + loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0); + } + num_channels++; + } + + if (reverse) { + tmp = loads[0]; + loads[0] = loads[2]; + loads[2] = tmp; + } + + return ac_build_gather_values(ctx, loads, 4); +} + +static void ac_build_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset, + LLVMValueRef soffset, LLVMValueRef immoffset, + unsigned num_channels, unsigned dfmt, unsigned nfmt, + unsigned cache_policy, bool structurized) +{ + voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, immoffset, ""); + + LLVMValueRef args[7]; + int idx = 0; + args[idx++] = vdata; + args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""); + if (structurized) + args[idx++] = vindex ? vindex : ctx->i32_0; + args[idx++] = voffset ? voffset : ctx->i32_0; + args[idx++] = soffset ? soffset : ctx->i32_0; + args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0); + args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0); + unsigned func = + !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels; + const char *indexing_kind = structurized ? "struct" : "raw"; + char name[256], type_name[8]; + + LLVMTypeRef type = func > 1 ? 
LLVMVectorType(ctx->i32, func) : ctx->i32; + ac_build_type_name_for_intr(type, type_name, sizeof(type_name)); + + snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", indexing_kind, type_name); + + ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY); +} + +void ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset, + LLVMValueRef soffset, LLVMValueRef immoffset, + unsigned num_channels, unsigned dfmt, unsigned nfmt, + unsigned cache_policy) +{ + ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, immoffset, num_channels, dfmt, + nfmt, cache_policy, true); +} + +void ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, + LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset, + unsigned num_channels, unsigned dfmt, unsigned nfmt, + unsigned cache_policy) +{ + ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, immoffset, num_channels, dfmt, + nfmt, cache_policy, false); +} + +void ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset, + unsigned cache_policy) +{ + vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, ""); + + if (LLVM_VERSION_MAJOR >= 9) { + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ + ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false, + false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); + + ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1, dfmt, nfmt, + cache_policy); + } +} + +void ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, + LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy) +{ + vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, ""); + + if (LLVM_VERSION_MAJOR >= 9) { + /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */ + ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false, + false); + } else { + unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8; + unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + + vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, ""); + + ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1, dfmt, nfmt, + cache_policy); + } } /** * Set range metadata on an instruction. This can only be used on load and @@ -1985,40 +1722,37 @@ ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, * \p lo is the minimum value inclusive. * \p hi is the maximum value exclusive. 
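 *
 * For example, ac_get_thread_id() below calls
 * set_range_metadata(ctx, tid, 0, ctx->wave_size) so LLVM knows the
 * mbcnt result is a lane index inside the wave.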
*/ -static void set_range_metadata(struct ac_llvm_context *ctx, - LLVMValueRef value, unsigned lo, unsigned hi) +static void set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo, + unsigned hi) { - LLVMValueRef range_md, md_args[2]; - LLVMTypeRef type = LLVMTypeOf(value); - LLVMContextRef context = LLVMGetTypeContext(type); + LLVMValueRef range_md, md_args[2]; + LLVMTypeRef type = LLVMTypeOf(value); + LLVMContextRef context = LLVMGetTypeContext(type); - md_args[0] = LLVMConstInt(type, lo, false); - md_args[1] = LLVMConstInt(type, hi, false); - range_md = LLVMMDNodeInContext(context, md_args, 2); - LLVMSetMetadata(value, ctx->range_md_kind, range_md); + md_args[0] = LLVMConstInt(type, lo, false); + md_args[1] = LLVMConstInt(type, hi, false); + range_md = LLVMMDNodeInContext(context, md_args, 2); + LLVMSetMetadata(value, ctx->range_md_kind, range_md); } -LLVMValueRef -ac_get_thread_id(struct ac_llvm_context *ctx) +LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx) { - LLVMValueRef tid; + LLVMValueRef tid; - LLVMValueRef tid_args[2]; - tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false); - tid_args[1] = ctx->i32_0; - tid_args[1] = ac_build_intrinsic(ctx, - "llvm.amdgcn.mbcnt.lo", ctx->i32, - tid_args, 2, AC_FUNC_ATTR_READNONE); + LLVMValueRef tid_args[2]; + tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false); + tid_args[1] = ctx->i32_0; + tid_args[1] = + ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, tid_args, 2, AC_FUNC_ATTR_READNONE); - if (ctx->wave_size == 32) { - tid = tid_args[1]; - } else { - tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", - ctx->i32, tid_args, - 2, AC_FUNC_ATTR_READNONE); - } - set_range_metadata(ctx, tid, 0, ctx->wave_size); - return tid; + if (ctx->wave_size == 32) { + tid = tid_args[1]; + } else { + tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, tid_args, 2, + AC_FUNC_ATTR_READNONE); + } + set_range_metadata(ctx, tid, 0, ctx->wave_size); + return tid; } /* @@ -2045,1506 +1779,1395 @@ ac_get_thread_id(struct ac_llvm_context *ctx) * Adding 1 yields the TID of the pixel to the right of the left pixel, and * adding 2 yields the TID of the pixel below the top pixel. 
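 *
 * In the helper below, each lane i therefore reads val from lane
 * (i & mask) and from lane ((i & mask) + idx) via quad swizzles, and the
 * FSub of the two values yields the screen-space derivative.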
*/ -LLVMValueRef -ac_build_ddxy(struct ac_llvm_context *ctx, - uint32_t mask, - int idx, - LLVMValueRef val) -{ - unsigned tl_lanes[4], trbl_lanes[4]; - char name[32], type[8]; - LLVMValueRef tl, trbl; - LLVMTypeRef result_type; - LLVMValueRef result; - - result_type = ac_to_float_type(ctx, LLVMTypeOf(val)); - - if (result_type == ctx->f16) - val = LLVMBuildZExt(ctx->builder, val, ctx->i32, ""); - else if (result_type == ctx->v2f16) - val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, ""); - - for (unsigned i = 0; i < 4; ++i) { - tl_lanes[i] = i & mask; - trbl_lanes[i] = (i & mask) + idx; - } - - tl = ac_build_quad_swizzle(ctx, val, - tl_lanes[0], tl_lanes[1], - tl_lanes[2], tl_lanes[3]); - trbl = ac_build_quad_swizzle(ctx, val, - trbl_lanes[0], trbl_lanes[1], - trbl_lanes[2], trbl_lanes[3]); - - if (result_type == ctx->f16) { - tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, ""); - trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, ""); - } - - tl = LLVMBuildBitCast(ctx->builder, tl, result_type, ""); - trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, ""); - result = LLVMBuildFSub(ctx->builder, trbl, tl, ""); - - ac_build_type_name_for_intr(result_type, type, sizeof(type)); - snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type); - - return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0); -} - -void -ac_build_sendmsg(struct ac_llvm_context *ctx, - uint32_t msg, - LLVMValueRef wave_id) -{ - LLVMValueRef args[2]; - args[0] = LLVMConstInt(ctx->i32, msg, false); - args[1] = wave_id; - ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0); -} - -LLVMValueRef -ac_build_imsb(struct ac_llvm_context *ctx, - LLVMValueRef arg, - LLVMTypeRef dst_type) -{ - LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", - dst_type, &arg, 1, - AC_FUNC_ATTR_READNONE); - - /* The HW returns the last bit index from MSB, but NIR/TGSI wants - * the index from LSB. Invert it by doing "31 - msb". 
*/ - msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), - msb, ""); - - LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true); - LLVMValueRef cond = LLVMBuildOr(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntEQ, - arg, ctx->i32_0, ""), - LLVMBuildICmp(ctx->builder, LLVMIntEQ, - arg, all_ones, ""), ""); - - return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, ""); -} - -LLVMValueRef -ac_build_umsb(struct ac_llvm_context *ctx, - LLVMValueRef arg, - LLVMTypeRef dst_type) -{ - const char *intrin_name; - LLVMTypeRef type; - LLVMValueRef highest_bit; - LLVMValueRef zero; - unsigned bitsize; - - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg)); - switch (bitsize) { - case 64: - intrin_name = "llvm.ctlz.i64"; - type = ctx->i64; - highest_bit = LLVMConstInt(ctx->i64, 63, false); - zero = ctx->i64_0; - break; - case 32: - intrin_name = "llvm.ctlz.i32"; - type = ctx->i32; - highest_bit = LLVMConstInt(ctx->i32, 31, false); - zero = ctx->i32_0; - break; - case 16: - intrin_name = "llvm.ctlz.i16"; - type = ctx->i16; - highest_bit = LLVMConstInt(ctx->i16, 15, false); - zero = ctx->i16_0; - break; - case 8: - intrin_name = "llvm.ctlz.i8"; - type = ctx->i8; - highest_bit = LLVMConstInt(ctx->i8, 7, false); - zero = ctx->i8_0; - break; - default: - unreachable(!"invalid bitsize"); - break; - } +LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val) +{ + unsigned tl_lanes[4], trbl_lanes[4]; + char name[32], type[8]; + LLVMValueRef tl, trbl; + LLVMTypeRef result_type; + LLVMValueRef result; + + result_type = ac_to_float_type(ctx, LLVMTypeOf(val)); + + if (result_type == ctx->f16) + val = LLVMBuildZExt(ctx->builder, val, ctx->i32, ""); + else if (result_type == ctx->v2f16) + val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, ""); + + for (unsigned i = 0; i < 4; ++i) { + tl_lanes[i] = i & mask; + trbl_lanes[i] = (i & mask) + idx; + } + + tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]); + trbl = + ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]); + + if (result_type == ctx->f16) { + tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, ""); + trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, ""); + } + + tl = LLVMBuildBitCast(ctx->builder, tl, result_type, ""); + trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, ""); + result = LLVMBuildFSub(ctx->builder, trbl, tl, ""); + + ac_build_type_name_for_intr(result_type, type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type); + + return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0); +} - LLVMValueRef params[2] = { - arg, - ctx->i1true, - }; +void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id) +{ + LLVMValueRef args[2]; + args[0] = LLVMConstInt(ctx->i32, msg, false); + args[1] = wave_id; + ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0); +} + +LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type) +{ + LLVMValueRef msb = + ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE); + + /* The HW returns the last bit index from MSB, but NIR/TGSI wants + * the index from LSB. Invert it by doing "31 - msb". 
*/ + msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, ""); + + LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true); + LLVMValueRef cond = + LLVMBuildOr(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, ""), + LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, ""), ""); + + return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, ""); +} + +LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type) +{ + const char *intrin_name; + LLVMTypeRef type; + LLVMValueRef highest_bit; + LLVMValueRef zero; + unsigned bitsize; + + bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg)); + switch (bitsize) { + case 64: + intrin_name = "llvm.ctlz.i64"; + type = ctx->i64; + highest_bit = LLVMConstInt(ctx->i64, 63, false); + zero = ctx->i64_0; + break; + case 32: + intrin_name = "llvm.ctlz.i32"; + type = ctx->i32; + highest_bit = LLVMConstInt(ctx->i32, 31, false); + zero = ctx->i32_0; + break; + case 16: + intrin_name = "llvm.ctlz.i16"; + type = ctx->i16; + highest_bit = LLVMConstInt(ctx->i16, 15, false); + zero = ctx->i16_0; + break; + case 8: + intrin_name = "llvm.ctlz.i8"; + type = ctx->i8; + highest_bit = LLVMConstInt(ctx->i8, 7, false); + zero = ctx->i8_0; + break; + default: + unreachable(!"invalid bitsize"); + break; + } - LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, - params, 2, - AC_FUNC_ATTR_READNONE); + LLVMValueRef params[2] = { + arg, + ctx->i1true, + }; - /* The HW returns the last bit index from MSB, but TGSI/NIR wants - * the index from LSB. Invert it by doing "31 - msb". */ - msb = LLVMBuildSub(ctx->builder, highest_bit, msb, ""); + LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE); - if (bitsize == 64) { - msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, ""); - } else if (bitsize < 32) { - msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, ""); - } + /* The HW returns the last bit index from MSB, but TGSI/NIR wants + * the index from LSB. Invert it by doing "31 - msb". 
*/ + msb = LLVMBuildSub(ctx->builder, highest_bit, msb, ""); - /* check for zero */ - return LLVMBuildSelect(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""), - LLVMConstInt(ctx->i32, -1, true), msb, ""); + if (bitsize == 64) { + msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, ""); + } else if (bitsize < 32) { + msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, ""); + } + + /* check for zero */ + return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""), + LLVMConstInt(ctx->i32, -1, true), msb, ""); } -LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) +LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { - char name[64], type[64]; + char name[64], type[64]; - ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type)); - snprintf(name, sizeof(name), "llvm.minnum.%s", type); - LLVMValueRef args[2] = {a, b}; - return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, - AC_FUNC_ATTR_READNONE); + ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.minnum.%s", type); + LLVMValueRef args[2] = {a, b}; + return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE); } -LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) +LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { - char name[64], type[64]; + char name[64], type[64]; - ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type)); - snprintf(name, sizeof(name), "llvm.maxnum.%s", type); - LLVMValueRef args[2] = {a, b}; - return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, - AC_FUNC_ATTR_READNONE); + ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.maxnum.%s", type); + LLVMValueRef args[2] = {a, b}; + return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE); } -LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) +LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { - LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, ""); - return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); } -LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) +LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { - LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, ""); - return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); } -LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) +LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { - LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, ""); - return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); } -LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b) +LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b) { - LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, 
LLVMIntUGE, a, b, ""); - return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); + LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, ""); + return LLVMBuildSelect(ctx->builder, cmp, a, b, ""); } LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value) { - LLVMTypeRef t = LLVMTypeOf(value); - return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)), - LLVMConstReal(t, 1.0)); + LLVMTypeRef t = LLVMTypeOf(value); + return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)), + LLVMConstReal(t, 1.0)); } void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a) { - LLVMValueRef args[9]; + LLVMValueRef args[9]; - args[0] = LLVMConstInt(ctx->i32, a->target, 0); - args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0); + args[0] = LLVMConstInt(ctx->i32, a->target, 0); + args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0); - if (a->compr) { - args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], - ctx->v2i16, ""); - args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], - ctx->v2i16, ""); - args[4] = LLVMConstInt(ctx->i1, a->done, 0); - args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0); + if (a->compr) { + args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, ""); + args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, ""); + args[4] = LLVMConstInt(ctx->i1, a->done, 0); + args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0); - ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", - ctx->voidt, args, 6, 0); - } else { - args[2] = a->out[0]; - args[3] = a->out[1]; - args[4] = a->out[2]; - args[5] = a->out[3]; - args[6] = LLVMConstInt(ctx->i1, a->done, 0); - args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0); + ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0); + } else { + args[2] = a->out[0]; + args[3] = a->out[1]; + args[4] = a->out[2]; + args[5] = a->out[3]; + args[6] = LLVMConstInt(ctx->i1, a->done, 0); + args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0); - ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", - ctx->voidt, args, 8, 0); - } + ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0); + } } void ac_build_export_null(struct ac_llvm_context *ctx) { - struct ac_export_args args; + struct ac_export_args args; - args.enabled_channels = 0x0; /* enabled channels */ - args.valid_mask = 1; /* whether the EXEC mask is valid */ - args.done = 1; /* DONE bit */ - args.target = V_008DFC_SQ_EXP_NULL; - args.compr = 0; /* COMPR flag (0 = 32-bit export) */ - args.out[0] = LLVMGetUndef(ctx->f32); /* R */ - args.out[1] = LLVMGetUndef(ctx->f32); /* G */ - args.out[2] = LLVMGetUndef(ctx->f32); /* B */ - args.out[3] = LLVMGetUndef(ctx->f32); /* A */ + args.enabled_channels = 0x0; /* enabled channels */ + args.valid_mask = 1; /* whether the EXEC mask is valid */ + args.done = 1; /* DONE bit */ + args.target = V_008DFC_SQ_EXP_NULL; + args.compr = 0; /* COMPR flag (0 = 32-bit export) */ + args.out[0] = LLVMGetUndef(ctx->f32); /* R */ + args.out[1] = LLVMGetUndef(ctx->f32); /* G */ + args.out[2] = LLVMGetUndef(ctx->f32); /* B */ + args.out[3] = LLVMGetUndef(ctx->f32); /* A */ - ac_build_export(ctx, &args); + ac_build_export(ctx, &args); } static unsigned ac_num_coords(enum ac_image_dim dim) { - switch (dim) { - case ac_image_1d: - return 1; - case ac_image_2d: - case ac_image_1darray: - return 2; - case ac_image_3d: - case ac_image_cube: - case ac_image_2darray: - case ac_image_2dmsaa: - return 3; - case ac_image_2darraymsaa: - return 4; - default: - 
unreachable("ac_num_coords: bad dim"); - } + switch (dim) { + case ac_image_1d: + return 1; + case ac_image_2d: + case ac_image_1darray: + return 2; + case ac_image_3d: + case ac_image_cube: + case ac_image_2darray: + case ac_image_2dmsaa: + return 3; + case ac_image_2darraymsaa: + return 4; + default: + unreachable("ac_num_coords: bad dim"); + } } static unsigned ac_num_derivs(enum ac_image_dim dim) { - switch (dim) { - case ac_image_1d: - case ac_image_1darray: - return 2; - case ac_image_2d: - case ac_image_2darray: - case ac_image_cube: - return 4; - case ac_image_3d: - return 6; - case ac_image_2dmsaa: - case ac_image_2darraymsaa: - default: - unreachable("derivatives not supported"); - } + switch (dim) { + case ac_image_1d: + case ac_image_1darray: + return 2; + case ac_image_2d: + case ac_image_2darray: + case ac_image_cube: + return 4; + case ac_image_3d: + return 6; + case ac_image_2dmsaa: + case ac_image_2darraymsaa: + default: + unreachable("derivatives not supported"); + } } static const char *get_atomic_name(enum ac_atomic_op op) { - switch (op) { - case ac_atomic_swap: return "swap"; - case ac_atomic_add: return "add"; - case ac_atomic_sub: return "sub"; - case ac_atomic_smin: return "smin"; - case ac_atomic_umin: return "umin"; - case ac_atomic_smax: return "smax"; - case ac_atomic_umax: return "umax"; - case ac_atomic_and: return "and"; - case ac_atomic_or: return "or"; - case ac_atomic_xor: return "xor"; - case ac_atomic_inc_wrap: return "inc"; - case ac_atomic_dec_wrap: return "dec"; - } - unreachable("bad atomic op"); -} - -LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, - struct ac_image_args *a) -{ - const char *overload[3] = { "", "", "" }; - unsigned num_overloads = 0; - LLVMValueRef args[18]; - unsigned num_args = 0; - enum ac_image_dim dim = a->dim; - - assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || - !a->level_zero); - assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip && - a->opcode != ac_image_store_mip) || - a->lod); - assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || - (!a->compare && !a->offset)); - assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || - a->opcode == ac_image_get_lod) || - !a->bias); - assert((a->bias ? 1 : 0) + - (a->lod ? 1 : 0) + - (a->level_zero ? 1 : 0) + - (a->derivs[0] ? 1 : 0) <= 1); - assert((a->min_lod ? 1 : 0) + - (a->lod ? 1 : 0) + - (a->level_zero ? 1 : 0) <= 1); - assert(!a->d16 || (ctx->chip_class >= GFX8 && - a->opcode != ac_image_atomic && - a->opcode != ac_image_atomic_cmpswap && - a->opcode != ac_image_get_lod && - a->opcode != ac_image_get_resinfo)); - - if (a->opcode == ac_image_get_lod) { - switch (dim) { - case ac_image_1darray: - dim = ac_image_1d; - break; - case ac_image_2darray: - case ac_image_cube: - dim = ac_image_2d; - break; - default: - break; - } - } - - bool sample = a->opcode == ac_image_sample || - a->opcode == ac_image_gather4 || - a->opcode == ac_image_get_lod; - bool atomic = a->opcode == ac_image_atomic || - a->opcode == ac_image_atomic_cmpswap; - bool load = a->opcode == ac_image_sample || - a->opcode == ac_image_gather4 || - a->opcode == ac_image_load || - a->opcode == ac_image_load_mip; - LLVMTypeRef coord_type = sample ? 
ctx->f32 : ctx->i32; - - if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) { - args[num_args++] = a->data[0]; - if (a->opcode == ac_image_atomic_cmpswap) - args[num_args++] = a->data[1]; - } - - if (!atomic) - args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false); - - if (a->offset) - args[num_args++] = ac_to_integer(ctx, a->offset); - if (a->bias) { - args[num_args++] = ac_to_float(ctx, a->bias); - overload[num_overloads++] = ".f32"; - } - if (a->compare) - args[num_args++] = ac_to_float(ctx, a->compare); - if (a->derivs[0]) { - unsigned count = ac_num_derivs(dim); - for (unsigned i = 0; i < count; ++i) - args[num_args++] = ac_to_float(ctx, a->derivs[i]); - overload[num_overloads++] = ".f32"; - } - unsigned num_coords = - a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0; - for (unsigned i = 0; i < num_coords; ++i) - args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, ""); - if (a->lod) - args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, ""); - if (a->min_lod) - args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, ""); - - overload[num_overloads++] = sample ? ".f32" : ".i32"; - - args[num_args++] = a->resource; - if (sample) { - args[num_args++] = a->sampler; - args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false); - } - - args[num_args++] = ctx->i32_0; /* texfailctrl */ - args[num_args++] = LLVMConstInt(ctx->i32, - load ? get_load_cache_policy(ctx, a->cache_policy) : - a->cache_policy, false); - - const char *name; - const char *atomic_subop = ""; - switch (a->opcode) { - case ac_image_sample: name = "sample"; break; - case ac_image_gather4: name = "gather4"; break; - case ac_image_load: name = "load"; break; - case ac_image_load_mip: name = "load.mip"; break; - case ac_image_store: name = "store"; break; - case ac_image_store_mip: name = "store.mip"; break; - case ac_image_atomic: - name = "atomic."; - atomic_subop = get_atomic_name(a->atomic); - break; - case ac_image_atomic_cmpswap: - name = "atomic."; - atomic_subop = "cmpswap"; - break; - case ac_image_get_lod: name = "getlod"; break; - case ac_image_get_resinfo: name = "getresinfo"; break; - default: unreachable("invalid image opcode"); - } - - const char *dimname; - switch (dim) { - case ac_image_1d: dimname = "1d"; break; - case ac_image_2d: dimname = "2d"; break; - case ac_image_3d: dimname = "3d"; break; - case ac_image_cube: dimname = "cube"; break; - case ac_image_1darray: dimname = "1darray"; break; - case ac_image_2darray: dimname = "2darray"; break; - case ac_image_2dmsaa: dimname = "2dmsaa"; break; - case ac_image_2darraymsaa: dimname = "2darraymsaa"; break; - default: unreachable("invalid dim"); - } - - bool lod_suffix = - a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4); - char intr_name[96]; - snprintf(intr_name, sizeof(intr_name), - "llvm.amdgcn.image.%s%s" /* base name */ - "%s%s%s%s" /* sample/gather modifiers */ - ".%s.%s%s%s%s", /* dimension and type overloads */ - name, atomic_subop, - a->compare ? ".c" : "", - a->bias ? ".b" : - lod_suffix ? ".l" : - a->derivs[0] ? ".d" : - a->level_zero ? ".lz" : "", - a->min_lod ? ".cl" : "", - a->offset ? ".o" : "", - dimname, - atomic ? "i32" : (a->d16 ? "v4f16" : "v4f32"), - overload[0], overload[1], overload[2]); - - LLVMTypeRef retty; - if (atomic) - retty = ctx->i32; - else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) - retty = ctx->voidt; - else - retty = a->d16 ? 
ctx->v4f16 : ctx->v4f32; - - LLVMValueRef result = - ac_build_intrinsic(ctx, intr_name, retty, args, num_args, - a->attributes); - if (!sample && !atomic && retty != ctx->voidt) - result = ac_to_integer(ctx, result); - - return result; -} - -LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, - LLVMValueRef rsrc) -{ - LLVMValueRef samples; - - /* Read the samples from the descriptor directly. - * Hardware doesn't have any instruction for this. - */ - samples = LLVMBuildExtractElement(ctx->builder, rsrc, - LLVMConstInt(ctx->i32, 3, 0), ""); - samples = LLVMBuildLShr(ctx->builder, samples, - LLVMConstInt(ctx->i32, 16, 0), ""); - samples = LLVMBuildAnd(ctx->builder, samples, - LLVMConstInt(ctx->i32, 0xf, 0), ""); - samples = LLVMBuildShl(ctx->builder, ctx->i32_1, - samples, ""); - return samples; -} - -LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]) -{ - return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, - args, 2, AC_FUNC_ATTR_READNONE); -} - -LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]) -{ - LLVMValueRef res = - ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", - ctx->v2i16, args, 2, - AC_FUNC_ATTR_READNONE); - return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); -} - -LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]) -{ - LLVMValueRef res = - ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", - ctx->v2i16, args, 2, - AC_FUNC_ATTR_READNONE); - return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); + switch (op) { + case ac_atomic_swap: + return "swap"; + case ac_atomic_add: + return "add"; + case ac_atomic_sub: + return "sub"; + case ac_atomic_smin: + return "smin"; + case ac_atomic_umin: + return "umin"; + case ac_atomic_smax: + return "smax"; + case ac_atomic_umax: + return "umax"; + case ac_atomic_and: + return "and"; + case ac_atomic_or: + return "or"; + case ac_atomic_xor: + return "xor"; + case ac_atomic_inc_wrap: + return "inc"; + case ac_atomic_dec_wrap: + return "dec"; + } + unreachable("bad atomic op"); +} + +LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a) +{ + const char *overload[3] = {"", "", ""}; + unsigned num_overloads = 0; + LLVMValueRef args[18]; + unsigned num_args = 0; + enum ac_image_dim dim = a->dim; + + assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero); + assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip && + a->opcode != ac_image_store_mip) || + a->lod); + assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || + (!a->compare && !a->offset)); + assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || + a->opcode == ac_image_get_lod) || + !a->bias); + assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <= + 1); + assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 
1 : 0) <= 1); + assert(!a->d16 || (ctx->chip_class >= GFX8 && a->opcode != ac_image_atomic && + a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod && + a->opcode != ac_image_get_resinfo)); + + if (a->opcode == ac_image_get_lod) { + switch (dim) { + case ac_image_1darray: + dim = ac_image_1d; + break; + case ac_image_2darray: + case ac_image_cube: + dim = ac_image_2d; + break; + default: + break; + } + } + + bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || + a->opcode == ac_image_get_lod; + bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap; + bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || + a->opcode == ac_image_load || a->opcode == ac_image_load_mip; + LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32; + + if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) { + args[num_args++] = a->data[0]; + if (a->opcode == ac_image_atomic_cmpswap) + args[num_args++] = a->data[1]; + } + + if (!atomic) + args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false); + + if (a->offset) + args[num_args++] = ac_to_integer(ctx, a->offset); + if (a->bias) { + args[num_args++] = ac_to_float(ctx, a->bias); + overload[num_overloads++] = ".f32"; + } + if (a->compare) + args[num_args++] = ac_to_float(ctx, a->compare); + if (a->derivs[0]) { + unsigned count = ac_num_derivs(dim); + for (unsigned i = 0; i < count; ++i) + args[num_args++] = ac_to_float(ctx, a->derivs[i]); + overload[num_overloads++] = ".f32"; + } + unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0; + for (unsigned i = 0; i < num_coords; ++i) + args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, ""); + if (a->lod) + args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, ""); + if (a->min_lod) + args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, ""); + + overload[num_overloads++] = sample ? ".f32" : ".i32"; + + args[num_args++] = a->resource; + if (sample) { + args[num_args++] = a->sampler; + args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false); + } + + args[num_args++] = ctx->i32_0; /* texfailctrl */ + args[num_args++] = LLVMConstInt( + ctx->i32, load ? 
get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false); + + const char *name; + const char *atomic_subop = ""; + switch (a->opcode) { + case ac_image_sample: + name = "sample"; + break; + case ac_image_gather4: + name = "gather4"; + break; + case ac_image_load: + name = "load"; + break; + case ac_image_load_mip: + name = "load.mip"; + break; + case ac_image_store: + name = "store"; + break; + case ac_image_store_mip: + name = "store.mip"; + break; + case ac_image_atomic: + name = "atomic."; + atomic_subop = get_atomic_name(a->atomic); + break; + case ac_image_atomic_cmpswap: + name = "atomic."; + atomic_subop = "cmpswap"; + break; + case ac_image_get_lod: + name = "getlod"; + break; + case ac_image_get_resinfo: + name = "getresinfo"; + break; + default: + unreachable("invalid image opcode"); + } + + const char *dimname; + switch (dim) { + case ac_image_1d: + dimname = "1d"; + break; + case ac_image_2d: + dimname = "2d"; + break; + case ac_image_3d: + dimname = "3d"; + break; + case ac_image_cube: + dimname = "cube"; + break; + case ac_image_1darray: + dimname = "1darray"; + break; + case ac_image_2darray: + dimname = "2darray"; + break; + case ac_image_2dmsaa: + dimname = "2dmsaa"; + break; + case ac_image_2darraymsaa: + dimname = "2darraymsaa"; + break; + default: + unreachable("invalid dim"); + } + + bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4); + char intr_name[96]; + snprintf(intr_name, sizeof(intr_name), + "llvm.amdgcn.image.%s%s" /* base name */ + "%s%s%s%s" /* sample/gather modifiers */ + ".%s.%s%s%s%s", /* dimension and type overloads */ + name, atomic_subop, a->compare ? ".c" : "", + a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "", + a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname, + atomic ? "i32" : (a->d16 ? "v4f16" : "v4f32"), overload[0], overload[1], overload[2]); + + LLVMTypeRef retty; + if (atomic) + retty = ctx->i32; + else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) + retty = ctx->voidt; + else + retty = a->d16 ? ctx->v4f16 : ctx->v4f32; + + LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes); + if (!sample && !atomic && retty != ctx->voidt) + result = ac_to_integer(ctx, result); + + return result; +} + +LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc) +{ + LLVMValueRef samples; + + /* Read the samples from the descriptor directly. + * Hardware doesn't have any instruction for this. 
+ */ + samples = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), ""); + samples = LLVMBuildLShr(ctx->builder, samples, LLVMConstInt(ctx->i32, 16, 0), ""); + samples = LLVMBuildAnd(ctx->builder, samples, LLVMConstInt(ctx->i32, 0xf, 0), ""); + samples = LLVMBuildShl(ctx->builder, ctx->i32_1, samples, ""); + return samples; +} + +LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2]) +{ + return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2, + AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2]) +{ + LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2, + AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); +} + +LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2]) +{ + LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2, + AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); } /* The 8-bit and 10-bit clamping is for HW workarounds. */ -LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, - LLVMValueRef args[2], unsigned bits, bool hi) -{ - assert(bits == 8 || bits == 10 || bits == 16); - - LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, - bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0); - LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, - bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0); - LLVMValueRef max_alpha = - bits != 10 ? max_rgb : ctx->i32_1; - LLVMValueRef min_alpha = - bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0); - - /* Clamp. */ - if (bits != 16) { - for (int i = 0; i < 2; i++) { - bool alpha = hi && i == 1; - args[i] = ac_build_imin(ctx, args[i], - alpha ? max_alpha : max_rgb); - args[i] = ac_build_imax(ctx, args[i], - alpha ? min_alpha : min_rgb); - } - } - - LLVMValueRef res = - ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", - ctx->v2i16, args, 2, - AC_FUNC_ATTR_READNONE); - return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); +LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits, + bool hi) +{ + assert(bits == 8 || bits == 10 || bits == 16); + + LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0); + LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0); + LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1; + LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0); + + /* Clamp. */ + if (bits != 16) { + for (int i = 0; i < 2; i++) { + bool alpha = hi && i == 1; + args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb); + args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb); + } + } + + LLVMValueRef res = + ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); } /* The 8-bit and 10-bit clamping is for HW workarounds. */ -LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, - LLVMValueRef args[2], unsigned bits, bool hi) +LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits, + bool hi) { - assert(bits == 8 || bits == 10 || bits == 16); + assert(bits == 8 || bits == 10 || bits == 16); - LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, - bits == 8 ? 255 : bits == 10 ? 
1023 : 65535, 0); - LLVMValueRef max_alpha = - bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0); + LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0); + LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0); - /* Clamp. */ - if (bits != 16) { - for (int i = 0; i < 2; i++) { - bool alpha = hi && i == 1; - args[i] = ac_build_umin(ctx, args[i], - alpha ? max_alpha : max_rgb); - } - } + /* Clamp. */ + if (bits != 16) { + for (int i = 0; i < 2; i++) { + bool alpha = hi && i == 1; + args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb); + } + } - LLVMValueRef res = - ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", - ctx->v2i16, args, 2, - AC_FUNC_ATTR_READNONE); - return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); + LLVMValueRef res = + ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE); + return LLVMBuildBitCast(ctx->builder, res, ctx->i32, ""); } LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1) { - return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, - &i1, 1, AC_FUNC_ATTR_READNONE); + return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, AC_FUNC_ATTR_READNONE); } void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1) { - ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, - &i1, 1, 0); + ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0); } -LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, - LLVMValueRef offset, LLVMValueRef width, - bool is_signed) +LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset, + LLVMValueRef width, bool is_signed) { - LLVMValueRef args[] = { - input, - offset, - width, - }; - - return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : - "llvm.amdgcn.ubfe.i32", - ctx->i32, args, 3, AC_FUNC_ATTR_READNONE); + LLVMValueRef args[] = { + input, + offset, + width, + }; + return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32", + ctx->i32, args, 3, AC_FUNC_ATTR_READNONE); } -LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, - LLVMValueRef s1, LLVMValueRef s2) +LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1, + LLVMValueRef s2) { - return LLVMBuildAdd(ctx->builder, - LLVMBuildMul(ctx->builder, s0, s1, ""), s2, ""); + return LLVMBuildAdd(ctx->builder, LLVMBuildMul(ctx->builder, s0, s1, ""), s2, ""); } -LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, - LLVMValueRef s1, LLVMValueRef s2) +LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1, + LLVMValueRef s2) { - /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */ - if (ctx->chip_class >= GFX10) { - return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, - (LLVMValueRef []) {s0, s1, s2}, 3, - AC_FUNC_ATTR_READNONE); - } + /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. 
*/ + if (ctx->chip_class >= GFX10) { + return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, (LLVMValueRef[]){s0, s1, s2}, 3, + AC_FUNC_ATTR_READNONE); + } - return LLVMBuildFAdd(ctx->builder, - LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, ""); + return LLVMBuildFAdd(ctx->builder, LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, ""); } void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags) { - if (!wait_flags) - return; - - unsigned lgkmcnt = 63; - unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15; - unsigned vscnt = 63; - - if (wait_flags & AC_WAIT_LGKM) - lgkmcnt = 0; - if (wait_flags & AC_WAIT_VLOAD) - vmcnt = 0; - - if (wait_flags & AC_WAIT_VSTORE) { - if (ctx->chip_class >= GFX10) - vscnt = 0; - else - vmcnt = 0; - } - - /* There is no intrinsic for vscnt(0), so use a fence. */ - if ((wait_flags & AC_WAIT_LGKM && - wait_flags & AC_WAIT_VLOAD && - wait_flags & AC_WAIT_VSTORE) || - vscnt == 0) { - LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, ""); - return; - } - - unsigned simm16 = (lgkmcnt << 8) | - (7 << 4) | /* expcnt */ - (vmcnt & 0xf) | - ((vmcnt >> 4) << 14); - - LLVMValueRef args[1] = { - LLVMConstInt(ctx->i32, simm16, false), - }; - ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", - ctx->voidt, args, 1, 0); -} - -LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize) -{ - LLVMTypeRef type; - char *intr; - - if (bitsize == 16) { - intr = "llvm.amdgcn.fract.f16"; - type = ctx->f16; - } else if (bitsize == 32) { - intr = "llvm.amdgcn.fract.f32"; - type = ctx->f32; - } else { - intr = "llvm.amdgcn.fract.f64"; - type = ctx->f64; - } - - LLVMValueRef params[] = { - src0, - }; - return ac_build_intrinsic(ctx, intr, type, params, 1, - AC_FUNC_ATTR_READNONE); + if (!wait_flags) + return; + + unsigned lgkmcnt = 63; + unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15; + unsigned vscnt = 63; + + if (wait_flags & AC_WAIT_LGKM) + lgkmcnt = 0; + if (wait_flags & AC_WAIT_VLOAD) + vmcnt = 0; + + if (wait_flags & AC_WAIT_VSTORE) { + if (ctx->chip_class >= GFX10) + vscnt = 0; + else + vmcnt = 0; + } + + /* There is no intrinsic for vscnt(0), so use a fence. 
*/ + if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) || + vscnt == 0) { + LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, ""); + return; + } + + unsigned simm16 = (lgkmcnt << 8) | (7 << 4) | /* expcnt */ + (vmcnt & 0xf) | ((vmcnt >> 4) << 14); + + LLVMValueRef args[1] = { + LLVMConstInt(ctx->i32, simm16, false), + }; + ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0); +} + +LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.amdgcn.fract.f16"; + type = ctx->f16; + } else if (bitsize == 32) { + intr = "llvm.amdgcn.fract.f32"; + type = ctx->f32; + } else { + intr = "llvm.amdgcn.fract.f64"; + type = ctx->f64; + } + + LLVMValueRef params[] = { + src0, + }; + return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE); } LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value) { - if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { - LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0); - unsigned vec_size = LLVMGetVectorSize(type); - LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef*)); + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { + LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0); + unsigned vec_size = LLVMGetVectorSize(type); + LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef *)); - for (unsigned i = 0; i < vec_size; i++) - scalars[i] = scalar; - return LLVMConstVector(scalars, vec_size); - } - return LLVMConstInt(type, value, 0); + for (unsigned i = 0; i < vec_size; i++) + scalars[i] = scalar; + return LLVMConstVector(scalars, vec_size); + } + return LLVMConstInt(type, value, 0); } LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0) { - LLVMTypeRef type = LLVMTypeOf(src0); - LLVMValueRef val; + LLVMTypeRef type = LLVMTypeOf(src0); + LLVMValueRef val; - /* v_med3 is selected only when max is first. (LLVM bug?) */ - val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1)); - return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1)); + /* v_med3 is selected only when max is first. (LLVM bug?) */ + val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1)); + return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1)); } static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val) { - ac_enable_signed_zeros(ctx); - /* (val + 0) converts negative zero to positive zero. */ - val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), ""); - ac_disable_signed_zeros(ctx); - return val; + ac_enable_signed_zeros(ctx); + /* (val + 0) converts negative zero to positive zero. 
*/ + val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), ""); + ac_disable_signed_zeros(ctx); + return val; } LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src) { - LLVMTypeRef type = LLVMTypeOf(src); - LLVMValueRef pos, neg, dw[2], val; - unsigned bitsize = ac_get_elem_bits(ctx, type); - - /* The standard version leads to this: - * v_cmp_ngt_f32_e64 s[0:1], s4, 0 ; D40B0000 00010004 - * v_cndmask_b32_e64 v4, 1.0, s4, s[0:1] ; D5010004 000008F2 - * v_cmp_le_f32_e32 vcc, 0, v4 ; 7C060880 - * v_cndmask_b32_e32 v4, -1.0, v4, vcc ; 020808F3 - * - * The isign version: - * v_add_f32_e64 v4, s4, 0 ; D5030004 00010004 - * v_med3_i32 v4, v4, -1, 1 ; D5580004 02058304 - * v_cvt_f32_i32_e32 v4, v4 ; 7E080B04 - * - * (src0 + 0) converts negative zero to positive zero. - * After that, int(fsign(x)) == isign(floatBitsToInt(x)). - * - * For FP64, use the standard version, which doesn't suffer from the huge DP rate - * reduction. (FP64 comparisons are as fast as int64 comparisons) - */ - if (bitsize == 16 || bitsize == 32) { - val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src)); - val = ac_build_isign(ctx, val); - return LLVMBuildSIToFP(ctx->builder, val, type, ""); - } - - assert(bitsize == 64); - pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, ""); - neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, ""); - dw[0] = ctx->i32_0; - dw[1] = LLVMBuildSelect(ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0), - LLVMBuildSelect(ctx->builder, neg, - LLVMConstInt(ctx->i32, 0xBFF00000, 0), - ctx->i32_0, ""), ""); - return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, ""); + LLVMTypeRef type = LLVMTypeOf(src); + LLVMValueRef pos, neg, dw[2], val; + unsigned bitsize = ac_get_elem_bits(ctx, type); + + /* The standard version leads to this: + * v_cmp_ngt_f32_e64 s[0:1], s4, 0 ; D40B0000 00010004 + * v_cndmask_b32_e64 v4, 1.0, s4, s[0:1] ; D5010004 000008F2 + * v_cmp_le_f32_e32 vcc, 0, v4 ; 7C060880 + * v_cndmask_b32_e32 v4, -1.0, v4, vcc ; 020808F3 + * + * The isign version: + * v_add_f32_e64 v4, s4, 0 ; D5030004 00010004 + * v_med3_i32 v4, v4, -1, 1 ; D5580004 02058304 + * v_cvt_f32_i32_e32 v4, v4 ; 7E080B04 + * + * (src0 + 0) converts negative zero to positive zero. + * After that, int(fsign(x)) == isign(floatBitsToInt(x)). + * + * For FP64, use the standard version, which doesn't suffer from the huge DP rate + * reduction. 
(FP64 comparisons are as fast as int64 comparisons) + */ + if (bitsize == 16 || bitsize == 32) { + val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src)); + val = ac_build_isign(ctx, val); + return LLVMBuildSIToFP(ctx->builder, val, type, ""); + } + + assert(bitsize == 64); + pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, ""); + neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, ""); + dw[0] = ctx->i32_0; + dw[1] = LLVMBuildSelect( + ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0), + LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""), + ""); + return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, ""); } LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0) { - LLVMValueRef result; - unsigned bitsize; - - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - - switch (bitsize) { - case 128: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); - break; - case 64: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); - break; - case 32: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - case 16: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); - break; - case 8: - result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); - break; - default: - unreachable(!"invalid bitsize"); - break; - } - - return result; -} - -LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, - LLVMValueRef src0) -{ - LLVMValueRef result; - unsigned bitsize; - - bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - - switch (bitsize) { - case 64: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); - break; - case 32: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - break; - case 16: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); - break; - case 8: - result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, - (LLVMValueRef []) { src0 }, 1, - AC_FUNC_ATTR_READNONE); - - result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); - break; - default: - unreachable(!"invalid bitsize"); - break; - } - - return result; -} - -#define AC_EXP_TARGET 0 + LLVMValueRef result; + unsigned bitsize; + + bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + + switch (bitsize) { + case 128: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, (LLVMValueRef[]){src0}, 1, + AC_FUNC_ATTR_READNONE); + result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); + break; + case 64: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef[]){src0}, 1, + AC_FUNC_ATTR_READNONE); + + result = 
LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); + break; + case 32: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, (LLVMValueRef[]){src0}, 1, + AC_FUNC_ATTR_READNONE); + break; + case 16: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, (LLVMValueRef[]){src0}, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; + case 8: + result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, (LLVMValueRef[]){src0}, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; + default: + unreachable(!"invalid bitsize"); + break; + } + + return result; +} + +LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0) +{ + LLVMValueRef result; + unsigned bitsize; + + bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + + switch (bitsize) { + case 64: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, (LLVMValueRef[]){src0}, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, ""); + break; + case 32: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, (LLVMValueRef[]){src0}, 1, + AC_FUNC_ATTR_READNONE); + break; + case 16: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, (LLVMValueRef[]){src0}, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; + case 8: + result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, (LLVMValueRef[]){src0}, 1, + AC_FUNC_ATTR_READNONE); + + result = LLVMBuildZExt(ctx->builder, result, ctx->i32, ""); + break; + default: + unreachable(!"invalid bitsize"); + break; + } + + return result; +} + +#define AC_EXP_TARGET 0 #define AC_EXP_ENABLED_CHANNELS 1 -#define AC_EXP_OUT0 2 +#define AC_EXP_OUT0 2 -enum ac_ir_type { - AC_IR_UNDEF, - AC_IR_CONST, - AC_IR_VALUE, +enum ac_ir_type +{ + AC_IR_UNDEF, + AC_IR_CONST, + AC_IR_VALUE, }; -struct ac_vs_exp_chan -{ - LLVMValueRef value; - float const_float; - enum ac_ir_type type; +struct ac_vs_exp_chan { + LLVMValueRef value; + float const_float; + enum ac_ir_type type; }; struct ac_vs_exp_inst { - unsigned offset; - LLVMValueRef inst; - struct ac_vs_exp_chan chan[4]; + unsigned offset; + LLVMValueRef inst; + struct ac_vs_exp_chan chan[4]; }; struct ac_vs_exports { - unsigned num; - struct ac_vs_exp_inst exp[VARYING_SLOT_MAX]; + unsigned num; + struct ac_vs_exp_inst exp[VARYING_SLOT_MAX]; }; /* Return true if the PARAM export has been eliminated. */ -static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, - uint32_t num_outputs, - struct ac_vs_exp_inst *exp) -{ - unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */ - bool is_zero[4] = {}, is_one[4] = {}; - - for (i = 0; i < 4; i++) { - /* It's a constant expression. Undef outputs are eliminated too. */ - if (exp->chan[i].type == AC_IR_UNDEF) { - is_zero[i] = true; - is_one[i] = true; - } else if (exp->chan[i].type == AC_IR_CONST) { - if (exp->chan[i].const_float == 0) - is_zero[i] = true; - else if (exp->chan[i].const_float == 1) - is_one[i] = true; - else - return false; /* other constant */ - } else - return false; - } - - /* Only certain combinations of 0 and 1 can be eliminated. */ - if (is_zero[0] && is_zero[1] && is_zero[2]) - default_val = is_zero[3] ? 0 : 1; - else if (is_one[0] && is_one[1] && is_one[2]) - default_val = is_zero[3] ? 2 : 3; - else - return false; - - /* The PARAM export can be represented as DEFAULT_VAL. Kill it. 
*/
- LLVMInstructionEraseFromParent(exp->inst);
-
- /* Change OFFSET to DEFAULT_VAL. */
- for (i = 0; i < num_outputs; i++) {
- if (vs_output_param_offset[i] == exp->offset) {
- vs_output_param_offset[i] =
- AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
- break;
- }
- }
- return true;
+static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, uint32_t num_outputs,
+ struct ac_vs_exp_inst *exp)
+{
+ unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
+ bool is_zero[4] = {}, is_one[4] = {};
+
+ for (i = 0; i < 4; i++) {
+ /* It's a constant expression. Undef outputs are eliminated too. */
+ if (exp->chan[i].type == AC_IR_UNDEF) {
+ is_zero[i] = true;
+ is_one[i] = true;
+ } else if (exp->chan[i].type == AC_IR_CONST) {
+ if (exp->chan[i].const_float == 0)
+ is_zero[i] = true;
+ else if (exp->chan[i].const_float == 1)
+ is_one[i] = true;
+ else
+ return false; /* other constant */
+ } else
+ return false;
+ }
+
+ /* Only certain combinations of 0 and 1 can be eliminated. */
+ if (is_zero[0] && is_zero[1] && is_zero[2])
+ default_val = is_zero[3] ? 0 : 1;
+ else if (is_one[0] && is_one[1] && is_one[2])
+ default_val = is_zero[3] ? 2 : 3;
+ else
+ return false;
+
+ /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
+ LLVMInstructionEraseFromParent(exp->inst);
+
+ /* Change OFFSET to DEFAULT_VAL. */
+ for (i = 0; i < num_outputs; i++) {
+ if (vs_output_param_offset[i] == exp->offset) {
+ vs_output_param_offset[i] = AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
+ break;
+ }
+ }
+ return true;
 }
 
 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
- uint8_t *vs_output_param_offset,
- uint32_t num_outputs,
- struct ac_vs_exports *processed,
- struct ac_vs_exp_inst *exp)
-{
- unsigned p, copy_back_channels = 0;
-
- /* See if the output is already in the list of processed outputs.
- * The LLVMValueRef comparison relies on SSA.
- */
- for (p = 0; p < processed->num; p++) {
- bool different = false;
-
- for (unsigned j = 0; j < 4; j++) {
- struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
- struct ac_vs_exp_chan *c2 = &exp->chan[j];
-
- /* Treat undef as a match. */
- if (c2->type == AC_IR_UNDEF)
- continue;
-
- /* If c1 is undef but c2 isn't, we can copy c2 to c1
- * and consider the instruction duplicated.
- */
- if (c1->type == AC_IR_UNDEF) {
- copy_back_channels |= 1 << j;
- continue;
- }
-
- /* Test whether the channels are not equal. */
- if (c1->type != c2->type ||
- (c1->type == AC_IR_CONST &&
- c1->const_float != c2->const_float) ||
- (c1->type == AC_IR_VALUE &&
- c1->value != c2->value)) {
- different = true;
- break;
- }
- }
- if (!different)
- break;
-
- copy_back_channels = 0;
- }
- if (p == processed->num)
- return false;
-
- /* If a match was found, but the matching export has undef where the new
- * one has a normal value, copy the normal value to the undef channel.
- */
- struct ac_vs_exp_inst *match = &processed->exp[p];
-
- /* Get current enabled channels mask. */
- LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
- unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
-
- while (copy_back_channels) {
- unsigned chan = u_bit_scan(&copy_back_channels);
-
- assert(match->chan[chan].type == AC_IR_UNDEF);
- LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
- exp->chan[chan].value);
- match->chan[chan] = exp->chan[chan];
-
- /* Update number of enabled channels because the original mask
- * is not always 0xf. 
- */ - enabled_channels |= (1 << chan); - LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS, - LLVMConstInt(ctx->i32, enabled_channels, 0)); - } - - /* The PARAM export is duplicated. Kill it. */ - LLVMInstructionEraseFromParent(exp->inst); - - /* Change OFFSET to the matching export. */ - for (unsigned i = 0; i < num_outputs; i++) { - if (vs_output_param_offset[i] == exp->offset) { - vs_output_param_offset[i] = match->offset; - break; - } - } - return true; -} - -void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, - LLVMValueRef main_fn, - uint8_t *vs_output_param_offset, - uint32_t num_outputs, - uint32_t skip_output_mask, - uint8_t *num_param_exports) -{ - LLVMBasicBlockRef bb; - bool removed_any = false; - struct ac_vs_exports exports; - - exports.num = 0; - - /* Process all LLVM instructions. */ - bb = LLVMGetFirstBasicBlock(main_fn); - while (bb) { - LLVMValueRef inst = LLVMGetFirstInstruction(bb); - - while (inst) { - LLVMValueRef cur = inst; - inst = LLVMGetNextInstruction(inst); - struct ac_vs_exp_inst exp; - - if (LLVMGetInstructionOpcode(cur) != LLVMCall) - continue; - - LLVMValueRef callee = ac_llvm_get_called_value(cur); - - if (!ac_llvm_is_function(callee)) - continue; - - const char *name = LLVMGetValueName(callee); - unsigned num_args = LLVMCountParams(callee); - - /* Check if this is an export instruction. */ - if ((num_args != 9 && num_args != 8) || - (strcmp(name, "llvm.SI.export") && - strcmp(name, "llvm.amdgcn.exp.f32"))) - continue; - - LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET); - unsigned target = LLVMConstIntGetZExtValue(arg); - - if (target < V_008DFC_SQ_EXP_PARAM) - continue; - - target -= V_008DFC_SQ_EXP_PARAM; - - /* Parse the instruction. */ - memset(&exp, 0, sizeof(exp)); - exp.offset = target; - exp.inst = cur; - - for (unsigned i = 0; i < 4; i++) { - LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i); - - exp.chan[i].value = v; - - if (LLVMIsUndef(v)) { - exp.chan[i].type = AC_IR_UNDEF; - } else if (LLVMIsAConstantFP(v)) { - LLVMBool loses_info; - exp.chan[i].type = AC_IR_CONST; - exp.chan[i].const_float = - LLVMConstRealGetDouble(v, &loses_info); - } else { - exp.chan[i].type = AC_IR_VALUE; - } - } - - /* Eliminate constant and duplicated PARAM exports. */ - if (!((1u << target) & skip_output_mask) && - (ac_eliminate_const_output(vs_output_param_offset, - num_outputs, &exp) || - ac_eliminate_duplicated_output(ctx, - vs_output_param_offset, - num_outputs, &exports, - &exp))) { - removed_any = true; - } else { - exports.exp[exports.num++] = exp; - } - } - bb = LLVMGetNextBasicBlock(bb); - } - - /* Remove holes in export memory due to removed PARAM exports. - * This is done by renumbering all PARAM exports. - */ - if (removed_any) { - uint8_t old_offset[VARYING_SLOT_MAX]; - unsigned out, i; - - /* Make a copy of the offsets. We need the old version while - * we are modifying some of them. */ - memcpy(old_offset, vs_output_param_offset, - sizeof(old_offset)); - - for (i = 0; i < exports.num; i++) { - unsigned offset = exports.exp[i].offset; - - /* Update vs_output_param_offset. Multiple outputs can - * have the same offset. - */ - for (out = 0; out < num_outputs; out++) { - if (old_offset[out] == offset) - vs_output_param_offset[out] = i; - } - - /* Change the PARAM offset in the instruction. 
*/
- LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
- LLVMConstInt(ctx->i32,
- V_008DFC_SQ_EXP_PARAM + i, 0));
- }
- *num_param_exports = exports.num;
- }
+ uint8_t *vs_output_param_offset, uint32_t num_outputs,
+ struct ac_vs_exports *processed,
+ struct ac_vs_exp_inst *exp)
+{
+ unsigned p, copy_back_channels = 0;
+
+ /* See if the output is already in the list of processed outputs.
+ * The LLVMValueRef comparison relies on SSA.
+ */
+ for (p = 0; p < processed->num; p++) {
+ bool different = false;
+
+ for (unsigned j = 0; j < 4; j++) {
+ struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
+ struct ac_vs_exp_chan *c2 = &exp->chan[j];
+
+ /* Treat undef as a match. */
+ if (c2->type == AC_IR_UNDEF)
+ continue;
+
+ /* If c1 is undef but c2 isn't, we can copy c2 to c1
+ * and consider the instruction duplicated.
+ */
+ if (c1->type == AC_IR_UNDEF) {
+ copy_back_channels |= 1 << j;
+ continue;
+ }
+
+ /* Test whether the channels are not equal. */
+ if (c1->type != c2->type ||
+ (c1->type == AC_IR_CONST && c1->const_float != c2->const_float) ||
+ (c1->type == AC_IR_VALUE && c1->value != c2->value)) {
+ different = true;
+ break;
+ }
+ }
+ if (!different)
+ break;
+
+ copy_back_channels = 0;
+ }
+ if (p == processed->num)
+ return false;
+
+ /* If a match was found, but the matching export has undef where the new
+ * one has a normal value, copy the normal value to the undef channel.
+ */
+ struct ac_vs_exp_inst *match = &processed->exp[p];
+
+ /* Get current enabled channels mask. */
+ LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
+ unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
+
+ while (copy_back_channels) {
+ unsigned chan = u_bit_scan(&copy_back_channels);
+
+ assert(match->chan[chan].type == AC_IR_UNDEF);
+ LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, exp->chan[chan].value);
+ match->chan[chan] = exp->chan[chan];
+
+ /* Update number of enabled channels because the original mask
+ * is not always 0xf.
+ */
+ enabled_channels |= (1 << chan);
+ LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
+ LLVMConstInt(ctx->i32, enabled_channels, 0));
+ }
+
+ /* The PARAM export is duplicated. Kill it. */
+ LLVMInstructionEraseFromParent(exp->inst);
+
+ /* Change OFFSET to the matching export. */
+ for (unsigned i = 0; i < num_outputs; i++) {
+ if (vs_output_param_offset[i] == exp->offset) {
+ vs_output_param_offset[i] = match->offset;
+ break;
+ }
+ }
+ return true;
+}
+
+void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, LLVMValueRef main_fn,
+ uint8_t *vs_output_param_offset, uint32_t num_outputs,
+ uint32_t skip_output_mask, uint8_t *num_param_exports)
+{
+ LLVMBasicBlockRef bb;
+ bool removed_any = false;
+ struct ac_vs_exports exports;
+
+ exports.num = 0;
+
+ /* Process all LLVM instructions. */
+ bb = LLVMGetFirstBasicBlock(main_fn);
+ while (bb) {
+ LLVMValueRef inst = LLVMGetFirstInstruction(bb);
+
+ while (inst) {
+ LLVMValueRef cur = inst;
+ inst = LLVMGetNextInstruction(inst);
+ struct ac_vs_exp_inst exp;
+
+ if (LLVMGetInstructionOpcode(cur) != LLVMCall)
+ continue;
+
+ LLVMValueRef callee = ac_llvm_get_called_value(cur);
+
+ if (!ac_llvm_is_function(callee))
+ continue;
+
+ const char *name = LLVMGetValueName(callee);
+ unsigned num_args = LLVMCountParams(callee);
+
+ /* Check if this is an export instruction. 
*/ + if ((num_args != 9 && num_args != 8) || + (strcmp(name, "llvm.SI.export") && strcmp(name, "llvm.amdgcn.exp.f32"))) + continue; + + LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET); + unsigned target = LLVMConstIntGetZExtValue(arg); + + if (target < V_008DFC_SQ_EXP_PARAM) + continue; + + target -= V_008DFC_SQ_EXP_PARAM; + + /* Parse the instruction. */ + memset(&exp, 0, sizeof(exp)); + exp.offset = target; + exp.inst = cur; + + for (unsigned i = 0; i < 4; i++) { + LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i); + + exp.chan[i].value = v; + + if (LLVMIsUndef(v)) { + exp.chan[i].type = AC_IR_UNDEF; + } else if (LLVMIsAConstantFP(v)) { + LLVMBool loses_info; + exp.chan[i].type = AC_IR_CONST; + exp.chan[i].const_float = LLVMConstRealGetDouble(v, &loses_info); + } else { + exp.chan[i].type = AC_IR_VALUE; + } + } + + /* Eliminate constant and duplicated PARAM exports. */ + if (!((1u << target) & skip_output_mask) && + (ac_eliminate_const_output(vs_output_param_offset, num_outputs, &exp) || + ac_eliminate_duplicated_output(ctx, vs_output_param_offset, num_outputs, &exports, + &exp))) { + removed_any = true; + } else { + exports.exp[exports.num++] = exp; + } + } + bb = LLVMGetNextBasicBlock(bb); + } + + /* Remove holes in export memory due to removed PARAM exports. + * This is done by renumbering all PARAM exports. + */ + if (removed_any) { + uint8_t old_offset[VARYING_SLOT_MAX]; + unsigned out, i; + + /* Make a copy of the offsets. We need the old version while + * we are modifying some of them. */ + memcpy(old_offset, vs_output_param_offset, sizeof(old_offset)); + + for (i = 0; i < exports.num; i++) { + unsigned offset = exports.exp[i].offset; + + /* Update vs_output_param_offset. Multiple outputs can + * have the same offset. + */ + for (out = 0; out < num_outputs; out++) { + if (old_offset[out] == offset) + vs_output_param_offset[out] = i; + } + + /* Change the PARAM offset in the instruction. */ + LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET, + LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_PARAM + i, 0)); + } + *num_param_exports = exports.num; + } } void ac_init_exec_full_mask(struct ac_llvm_context *ctx) { - LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); - ac_build_intrinsic(ctx, - "llvm.amdgcn.init.exec", ctx->voidt, - &full_mask, 1, AC_FUNC_ATTR_CONVERGENT); + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); + ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1, + AC_FUNC_ATTR_CONVERGENT); } void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx) { - unsigned lds_size = ctx->chip_class >= GFX7 ? 
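
/* The renumbering block above, reduced to plain arrays: surviving exports
 * are packed to offsets 0..n-1 and every output that referenced an old
 * offset is redirected to the new one. A standalone sketch (the fixed
 * array size 64 is a hypothetical stand-in for VARYING_SLOT_MAX):
 */
static void renumber_param_exports(unsigned char *output_offset, unsigned num_outputs,
                                   const unsigned char *surviving, unsigned num_surviving)
{
   unsigned char old_offset[64];
   for (unsigned i = 0; i < num_outputs; i++)
      old_offset[i] = output_offset[i]; /* keep originals while rewriting */

   for (unsigned i = 0; i < num_surviving; i++) {
      /* Export i now lives at offset i; several outputs may share it. */
      for (unsigned out = 0; out < num_outputs; out++) {
         if (old_offset[out] == surviving[i])
            output_offset[out] = (unsigned char)i;
      }
   }
}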
65536 : 32768; - ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0, - LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), - "lds"); -} - -LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, - LLVMValueRef dw_addr) -{ - return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), ""); -} - -void ac_lds_store(struct ac_llvm_context *ctx, - LLVMValueRef dw_addr, - LLVMValueRef value) -{ - value = ac_to_integer(ctx, value); - ac_build_indexed_store(ctx, ctx->lds, - dw_addr, value); -} - -LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, - LLVMTypeRef dst_type, - LLVMValueRef src0) -{ - unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); - const char *intrin_name; - LLVMTypeRef type; - LLVMValueRef zero; - - switch (src0_bitsize) { - case 64: - intrin_name = "llvm.cttz.i64"; - type = ctx->i64; - zero = ctx->i64_0; - break; - case 32: - intrin_name = "llvm.cttz.i32"; - type = ctx->i32; - zero = ctx->i32_0; - break; - case 16: - intrin_name = "llvm.cttz.i16"; - type = ctx->i16; - zero = ctx->i16_0; - break; - case 8: - intrin_name = "llvm.cttz.i8"; - type = ctx->i8; - zero = ctx->i8_0; - break; - default: - unreachable(!"invalid bitsize"); - } - - LLVMValueRef params[2] = { - src0, - - /* The value of 1 means that ffs(x=0) = undef, so LLVM won't - * add special code to check for x=0. The reason is that - * the LLVM behavior for x=0 is different from what we - * need here. However, LLVM also assumes that ffs(x) is - * in [0, 31], but GLSL expects that ffs(0) = -1, so - * a conditional assignment to handle 0 is still required. - * - * The hardware already implements the correct behavior. - */ - ctx->i1true, - }; - - LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, - params, 2, - AC_FUNC_ATTR_READNONE); - - if (src0_bitsize == 64) { - lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, ""); - } else if (src0_bitsize < 32) { - lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, ""); - } - - /* TODO: We need an intrinsic to skip this conditional. */ - /* Check for zero: */ - return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, - LLVMIntEQ, src0, - zero, ""), - LLVMConstInt(ctx->i32, -1, 0), lsb, ""); + unsigned lds_size = ctx->chip_class >= GFX7 ? 
65536 : 32768; + ctx->lds = LLVMBuildIntToPtr( + ctx->builder, ctx->i32_0, + LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds"); +} + +LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr) +{ + return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), ""); +} + +void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value) +{ + value = ac_to_integer(ctx, value); + ac_build_indexed_store(ctx, ctx->lds, dw_addr, value); +} + +LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0) +{ + unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0)); + const char *intrin_name; + LLVMTypeRef type; + LLVMValueRef zero; + + switch (src0_bitsize) { + case 64: + intrin_name = "llvm.cttz.i64"; + type = ctx->i64; + zero = ctx->i64_0; + break; + case 32: + intrin_name = "llvm.cttz.i32"; + type = ctx->i32; + zero = ctx->i32_0; + break; + case 16: + intrin_name = "llvm.cttz.i16"; + type = ctx->i16; + zero = ctx->i16_0; + break; + case 8: + intrin_name = "llvm.cttz.i8"; + type = ctx->i8; + zero = ctx->i8_0; + break; + default: + unreachable(!"invalid bitsize"); + } + + LLVMValueRef params[2] = { + src0, + + /* The value of 1 means that ffs(x=0) = undef, so LLVM won't + * add special code to check for x=0. The reason is that + * the LLVM behavior for x=0 is different from what we + * need here. However, LLVM also assumes that ffs(x) is + * in [0, 31], but GLSL expects that ffs(0) = -1, so + * a conditional assignment to handle 0 is still required. + * + * The hardware already implements the correct behavior. + */ + ctx->i1true, + }; + + LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE); + + if (src0_bitsize == 64) { + lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, ""); + } else if (src0_bitsize < 32) { + lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, ""); + } + + /* TODO: We need an intrinsic to skip this conditional. 
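
/* What the llvm.cttz call plus the select below compute, as portable C:
 * GLSL findLSB(), i.e. the index of the lowest set bit, with the special
 * case findLSB(0) = -1 that cttz alone does not provide. A minimal sketch:
 */
static int find_lsb_ref(unsigned x)
{
   if (x == 0)
      return -1;            /* GLSL requires -1; cttz(0) would be undef here */
   return __builtin_ctz(x); /* index of the least significant 1 bit */
}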
*/ + /* Check for zero: */ + return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""), + LLVMConstInt(ctx->i32, -1, 0), lsb, ""); } LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type) { - return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST); + return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST); } LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type) { - return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT); + return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT); } -static struct ac_llvm_flow * -get_current_flow(struct ac_llvm_context *ctx) +static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx) { - if (ctx->flow->depth > 0) - return &ctx->flow->stack[ctx->flow->depth - 1]; - return NULL; + if (ctx->flow->depth > 0) + return &ctx->flow->stack[ctx->flow->depth - 1]; + return NULL; } -static struct ac_llvm_flow * -get_innermost_loop(struct ac_llvm_context *ctx) +static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx) { - for (unsigned i = ctx->flow->depth; i > 0; --i) { - if (ctx->flow->stack[i - 1].loop_entry_block) - return &ctx->flow->stack[i - 1]; - } - return NULL; + for (unsigned i = ctx->flow->depth; i > 0; --i) { + if (ctx->flow->stack[i - 1].loop_entry_block) + return &ctx->flow->stack[i - 1]; + } + return NULL; } -static struct ac_llvm_flow * -push_flow(struct ac_llvm_context *ctx) +static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx) { - struct ac_llvm_flow *flow; + struct ac_llvm_flow *flow; - if (ctx->flow->depth >= ctx->flow->depth_max) { - unsigned new_max = MAX2(ctx->flow->depth << 1, - AC_LLVM_INITIAL_CF_DEPTH); + if (ctx->flow->depth >= ctx->flow->depth_max) { + unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH); - ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack)); - ctx->flow->depth_max = new_max; - } + ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack)); + ctx->flow->depth_max = new_max; + } - flow = &ctx->flow->stack[ctx->flow->depth]; - ctx->flow->depth++; + flow = &ctx->flow->stack[ctx->flow->depth]; + ctx->flow->depth++; - flow->next_block = NULL; - flow->loop_entry_block = NULL; - return flow; + flow->next_block = NULL; + flow->loop_entry_block = NULL; + return flow; } -static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, - int label_id) +static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id) { - char buf[32]; - snprintf(buf, sizeof(buf), "%s%d", base, label_id); - LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf); + char buf[32]; + snprintf(buf, sizeof(buf), "%s%d", base, label_id); + LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf); } /* Append a basic block at the level of the parent flow. 
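
/* push_flow()'s growth policy in isolation: the stack doubles, never
 * dropping below AC_LLVM_INITIAL_CF_DEPTH (4). A minimal sketch with the
 * same realloc pattern (error handling elided, as in the original):
 */
#include <stdlib.h>

struct flow { void *next_block, *loop_entry_block; };
struct flow_stack { struct flow *stack; unsigned depth, depth_max; };

static struct flow *push(struct flow_stack *fs)
{
   if (fs->depth >= fs->depth_max) {
      unsigned doubled = fs->depth << 1;
      unsigned new_max = doubled > 4 ? doubled : 4; /* MAX2(depth * 2, 4) */
      fs->stack = realloc(fs->stack, new_max * sizeof(*fs->stack));
      fs->depth_max = new_max;
   }
   struct flow *flow = &fs->stack[fs->depth++];
   flow->next_block = flow->loop_entry_block = NULL;
   return flow;
}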
*/ -static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, - const char *name) +static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name) { - assert(ctx->flow->depth >= 1); + assert(ctx->flow->depth >= 1); - if (ctx->flow->depth >= 2) { - struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2]; + if (ctx->flow->depth >= 2) { + struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2]; - return LLVMInsertBasicBlockInContext(ctx->context, - flow->next_block, name); - } + return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name); + } - LLVMValueRef main_fn = - LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder)); - return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name); + LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder)); + return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name); } /* Emit a branch to the given default target for the current block if * applicable -- that is, if the current block does not already contain a * branch from a break or continue. */ -static void emit_default_branch(LLVMBuilderRef builder, - LLVMBasicBlockRef target) +static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target) { - if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder))) - LLVMBuildBr(builder, target); + if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder))) + LLVMBuildBr(builder, target); } void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id) { - struct ac_llvm_flow *flow = push_flow(ctx); - flow->loop_entry_block = append_basic_block(ctx, "LOOP"); - flow->next_block = append_basic_block(ctx, "ENDLOOP"); - set_basicblock_name(flow->loop_entry_block, "loop", label_id); - LLVMBuildBr(ctx->builder, flow->loop_entry_block); - LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block); + struct ac_llvm_flow *flow = push_flow(ctx); + flow->loop_entry_block = append_basic_block(ctx, "LOOP"); + flow->next_block = append_basic_block(ctx, "ENDLOOP"); + set_basicblock_name(flow->loop_entry_block, "loop", label_id); + LLVMBuildBr(ctx->builder, flow->loop_entry_block); + LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block); } void ac_build_break(struct ac_llvm_context *ctx) { - struct ac_llvm_flow *flow = get_innermost_loop(ctx); - LLVMBuildBr(ctx->builder, flow->next_block); + struct ac_llvm_flow *flow = get_innermost_loop(ctx); + LLVMBuildBr(ctx->builder, flow->next_block); } void ac_build_continue(struct ac_llvm_context *ctx) { - struct ac_llvm_flow *flow = get_innermost_loop(ctx); - LLVMBuildBr(ctx->builder, flow->loop_entry_block); + struct ac_llvm_flow *flow = get_innermost_loop(ctx); + LLVMBuildBr(ctx->builder, flow->loop_entry_block); } void ac_build_else(struct ac_llvm_context *ctx, int label_id) { - struct ac_llvm_flow *current_branch = get_current_flow(ctx); - LLVMBasicBlockRef endif_block; + struct ac_llvm_flow *current_branch = get_current_flow(ctx); + LLVMBasicBlockRef endif_block; - assert(!current_branch->loop_entry_block); + assert(!current_branch->loop_entry_block); - endif_block = append_basic_block(ctx, "ENDIF"); - emit_default_branch(ctx->builder, endif_block); + endif_block = append_basic_block(ctx, "ENDIF"); + emit_default_branch(ctx->builder, endif_block); - LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); - set_basicblock_name(current_branch->next_block, "else", label_id); + LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); + 
set_basicblock_name(current_branch->next_block, "else", label_id); - current_branch->next_block = endif_block; + current_branch->next_block = endif_block; } void ac_build_endif(struct ac_llvm_context *ctx, int label_id) { - struct ac_llvm_flow *current_branch = get_current_flow(ctx); + struct ac_llvm_flow *current_branch = get_current_flow(ctx); - assert(!current_branch->loop_entry_block); + assert(!current_branch->loop_entry_block); - emit_default_branch(ctx->builder, current_branch->next_block); - LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); - set_basicblock_name(current_branch->next_block, "endif", label_id); + emit_default_branch(ctx->builder, current_branch->next_block); + LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block); + set_basicblock_name(current_branch->next_block, "endif", label_id); - ctx->flow->depth--; + ctx->flow->depth--; } void ac_build_endloop(struct ac_llvm_context *ctx, int label_id) { - struct ac_llvm_flow *current_loop = get_current_flow(ctx); + struct ac_llvm_flow *current_loop = get_current_flow(ctx); - assert(current_loop->loop_entry_block); + assert(current_loop->loop_entry_block); - emit_default_branch(ctx->builder, current_loop->loop_entry_block); + emit_default_branch(ctx->builder, current_loop->loop_entry_block); - LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block); - set_basicblock_name(current_loop->next_block, "endloop", label_id); - ctx->flow->depth--; + LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block); + set_basicblock_name(current_loop->next_block, "endloop", label_id); + ctx->flow->depth--; } void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id) { - struct ac_llvm_flow *flow = push_flow(ctx); - LLVMBasicBlockRef if_block; + struct ac_llvm_flow *flow = push_flow(ctx); + LLVMBasicBlockRef if_block; - if_block = append_basic_block(ctx, "IF"); - flow->next_block = append_basic_block(ctx, "ELSE"); - set_basicblock_name(if_block, "if", label_id); - LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block); - LLVMPositionBuilderAtEnd(ctx->builder, if_block); + if_block = append_basic_block(ctx, "IF"); + flow->next_block = append_basic_block(ctx, "ELSE"); + set_basicblock_name(if_block, "if", label_id); + LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block); + LLVMPositionBuilderAtEnd(ctx->builder, if_block); } -void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, - int label_id) +void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, int label_id) { - LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE, - value, ctx->f32_0, ""); - ac_build_ifcc(ctx, cond, label_id); + LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE, value, ctx->f32_0, ""); + ac_build_ifcc(ctx, cond, label_id); } -void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, - int label_id) +void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, int label_id) { - LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE, - ac_to_integer(ctx, value), - ctx->i32_0, ""); - ac_build_ifcc(ctx, cond, label_id); + LLVMValueRef cond = + LLVMBuildICmp(ctx->builder, LLVMIntNE, ac_to_integer(ctx, value), ctx->i32_0, ""); + ac_build_ifcc(ctx, cond, label_id); } -LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, - const char *name) +LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name) { - LLVMBuilderRef builder = ac->builder; - LLVMBasicBlockRef 
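
/* How the builders above pair up in practice: ac_build_ifcc() opens the
 * "if" side, ac_build_else() switches sides, ac_build_endif() pops the
 * flow stack; label_id only names the blocks (6000 here is arbitrary).
 * A hedged usage sketch:
 */
static void emit_two_sided(struct ac_llvm_context *ctx, LLVMValueRef cond)
{
   ac_build_ifcc(ctx, cond, 6000);
   /* ... emit code for the taken side here ... */
   ac_build_else(ctx, 6000);
   /* ... emit code for the other side here ... */
   ac_build_endif(ctx, 6000);
}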
current_block = LLVMGetInsertBlock(builder); - LLVMValueRef function = LLVMGetBasicBlockParent(current_block); - LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function); - LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block); - LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context); - LLVMValueRef res; + LLVMBuilderRef builder = ac->builder; + LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder); + LLVMValueRef function = LLVMGetBasicBlockParent(current_block); + LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function); + LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block); + LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context); + LLVMValueRef res; - if (first_instr) { - LLVMPositionBuilderBefore(first_builder, first_instr); - } else { - LLVMPositionBuilderAtEnd(first_builder, first_block); - } + if (first_instr) { + LLVMPositionBuilderBefore(first_builder, first_instr); + } else { + LLVMPositionBuilderAtEnd(first_builder, first_block); + } - res = LLVMBuildAlloca(first_builder, type, name); - LLVMDisposeBuilder(first_builder); - return res; + res = LLVMBuildAlloca(first_builder, type, name); + LLVMDisposeBuilder(first_builder); + return res; } -LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, - LLVMTypeRef type, const char *name) +LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name) { - LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name); - LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr); - return ptr; + LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name); + LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr); + return ptr; } -LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMTypeRef type) +LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type) { - int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); - return LLVMBuildBitCast(ctx->builder, ptr, - LLVMPointerType(type, addr_space), ""); + int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + return LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), ""); } -LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, - unsigned count) +LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count) { - unsigned num_components = ac_get_llvm_num_components(value); - if (count == num_components) - return value; + unsigned num_components = ac_get_llvm_num_components(value); + if (count == num_components) + return value; - LLVMValueRef masks[MAX2(count, 2)]; - masks[0] = ctx->i32_0; - masks[1] = ctx->i32_1; - for (unsigned i = 2; i < count; i++) - masks[i] = LLVMConstInt(ctx->i32, i, false); + LLVMValueRef masks[MAX2(count, 2)]; + masks[0] = ctx->i32_0; + masks[1] = ctx->i32_1; + for (unsigned i = 2; i < count; i++) + masks[i] = LLVMConstInt(ctx->i32, i, false); - if (count == 1) - return LLVMBuildExtractElement(ctx->builder, value, masks[0], - ""); + if (count == 1) + return LLVMBuildExtractElement(ctx->builder, value, masks[0], ""); - LLVMValueRef swizzle = LLVMConstVector(masks, count); - return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, ""); + LLVMValueRef swizzle = LLVMConstVector(masks, count); + return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, ""); } -LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, - unsigned rshift, unsigned bitwidth) +LLVMValueRef ac_unpack_param(struct 
ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift, + unsigned bitwidth) { - LLVMValueRef value = param; - if (rshift) - value = LLVMBuildLShr(ctx->builder, value, - LLVMConstInt(ctx->i32, rshift, false), ""); + LLVMValueRef value = param; + if (rshift) + value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(ctx->i32, rshift, false), ""); - if (rshift + bitwidth < 32) { - unsigned mask = (1 << bitwidth) - 1; - value = LLVMBuildAnd(ctx->builder, value, - LLVMConstInt(ctx->i32, mask, false), ""); - } - return value; + if (rshift + bitwidth < 32) { + unsigned mask = (1 << bitwidth) - 1; + value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(ctx->i32, mask, false), ""); + } + return value; } /* Adjust the sample index according to FMASK. @@ -3561,108 +3184,96 @@ LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, * The sample index should be adjusted as follows: * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF; */ -void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, - LLVMValueRef *addr, bool is_array_tex) -{ - struct ac_image_args fmask_load = {}; - fmask_load.opcode = ac_image_load; - fmask_load.resource = fmask; - fmask_load.dmask = 0xf; - fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d; - fmask_load.attributes = AC_FUNC_ATTR_READNONE; - - fmask_load.coords[0] = addr[0]; - fmask_load.coords[1] = addr[1]; - if (is_array_tex) - fmask_load.coords[2] = addr[2]; - - LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load); - fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, - ac->i32_0, ""); - - /* Apply the formula. */ - unsigned sample_chan = is_array_tex ? 3 : 2; - LLVMValueRef final_sample; - final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], - LLVMConstInt(ac->i32, 4, 0), ""); - final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, ""); - /* Mask the sample index by 0x7, because 0x8 means an unknown value - * with EQAA, so those will map to 0. */ - final_sample = LLVMBuildAnd(ac->builder, final_sample, - LLVMConstInt(ac->i32, 0x7, 0), ""); - - /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK - * resource descriptor is 0 (invalid). - */ - LLVMValueRef tmp; - tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, ""); - tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, ""); - tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, ""); - - /* Replace the MSAA sample index. */ - addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, - addr[sample_chan], ""); -} - -static LLVMValueRef -_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, - LLVMValueRef lane, bool with_opt_barrier) -{ - LLVMTypeRef type = LLVMTypeOf(src); - LLVMValueRef result; - - if (with_opt_barrier) - ac_build_optimization_barrier(ctx, &src); - - src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); - if (lane) - lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, ""); - - result = ac_build_intrinsic(ctx, - lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane", - ctx->i32, (LLVMValueRef []) { src, lane }, - lane == NULL ? 
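
/* The FMASK remap above as scalar C: each 4-bit nibble of the fmask word
 * stores the physical sample backing one logical sample, so the lookup is
 * a nibble extract; the & 0x7 folds EQAA's "unknown" value 0x8 to 0.
 * Worked example: fmask = 0x3120 sends logical sample 2 to physical 1.
 */
static unsigned fmask_remap_sample(unsigned fmask, unsigned sample_index)
{
   return (fmask >> (sample_index * 4)) & 0x7;
}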
1 : 2, - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); - - return LLVMBuildTrunc(ctx->builder, result, type, ""); -} - -static LLVMValueRef -ac_build_readlane_common(struct ac_llvm_context *ctx, - LLVMValueRef src, LLVMValueRef lane, - bool with_opt_barrier) -{ - LLVMTypeRef src_type = LLVMTypeOf(src); - src = ac_to_integer(ctx, src); - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); - LLVMValueRef ret; - - if (bits > 32) { - assert(bits % 32 == 0); - LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); - LLVMValueRef src_vector = - LLVMBuildBitCast(ctx->builder, src, vec_type, ""); - ret = LLVMGetUndef(vec_type); - for (unsigned i = 0; i < bits / 32; i++) { - LLVMValueRef ret_comp; - - src = LLVMBuildExtractElement(ctx->builder, src_vector, - LLVMConstInt(ctx->i32, i, 0), ""); - - ret_comp = _ac_build_readlane(ctx, src, lane, - with_opt_barrier); - - ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp, - LLVMConstInt(ctx->i32, i, 0), ""); - } - } else { - ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier); - } - - if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind) - return LLVMBuildIntToPtr(ctx->builder, ret, src_type, ""); - return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr, + bool is_array_tex) +{ + struct ac_image_args fmask_load = {}; + fmask_load.opcode = ac_image_load; + fmask_load.resource = fmask; + fmask_load.dmask = 0xf; + fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d; + fmask_load.attributes = AC_FUNC_ATTR_READNONE; + + fmask_load.coords[0] = addr[0]; + fmask_load.coords[1] = addr[1]; + if (is_array_tex) + fmask_load.coords[2] = addr[2]; + + LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load); + fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, ""); + + /* Apply the formula. */ + unsigned sample_chan = is_array_tex ? 3 : 2; + LLVMValueRef final_sample; + final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], LLVMConstInt(ac->i32, 4, 0), ""); + final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, ""); + /* Mask the sample index by 0x7, because 0x8 means an unknown value + * with EQAA, so those will map to 0. */ + final_sample = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), ""); + + /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK + * resource descriptor is 0 (invalid). + */ + LLVMValueRef tmp; + tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, ""); + tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, ""); + tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, ""); + + /* Replace the MSAA sample index. */ + addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, addr[sample_chan], ""); +} + +static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef lane, bool with_opt_barrier) +{ + LLVMTypeRef type = LLVMTypeOf(src); + LLVMValueRef result; + + if (with_opt_barrier) + ac_build_optimization_barrier(ctx, &src); + + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + if (lane) + lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, ""); + + result = + ac_build_intrinsic(ctx, lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane", + ctx->i32, (LLVMValueRef[]){src, lane}, lane == NULL ? 
1 : 2, + AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + return LLVMBuildTrunc(ctx->builder, result, type, ""); +} + +static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef lane, bool with_opt_barrier) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + + if (bits > 32) { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + LLVMValueRef ret_comp; + + src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), ""); + + ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier); + + ret = + LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), ""); + } + } else { + ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier); + } + + if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind) + return LLVMBuildIntToPtr(ctx->builder, ret, src_type, ""); + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); } /** @@ -3676,429 +3287,433 @@ ac_build_readlane_common(struct ac_llvm_context *ctx, * @param lane - id of the lane or NULL for the first active lane * @return value of the lane */ -LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, - LLVMValueRef src, LLVMValueRef lane) -{ - return ac_build_readlane_common(ctx, src, lane, false); -} - - -LLVMValueRef -ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) -{ - return ac_build_readlane_common(ctx, src, lane, true); -} - -LLVMValueRef -ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane) -{ - return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, - (LLVMValueRef []) {value, lane, src}, 3, - AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); -} - -LLVMValueRef -ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask) -{ - if (ctx->wave_size == 32) { - return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, - (LLVMValueRef []) { mask, ctx->i32_0 }, - 2, AC_FUNC_ATTR_READNONE); - } - LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, ""); - LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, - ctx->i32_0, ""); - LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, - ctx->i32_1, ""); - LLVMValueRef val = - ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, - (LLVMValueRef []) { mask_lo, ctx->i32_0 }, - 2, AC_FUNC_ATTR_READNONE); - val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, - (LLVMValueRef []) { mask_hi, val }, - 2, AC_FUNC_ATTR_READNONE); - return val; -} - -enum dpp_ctrl { - _dpp_quad_perm = 0x000, - _dpp_row_sl = 0x100, - _dpp_row_sr = 0x110, - _dpp_row_rr = 0x120, - dpp_wf_sl1 = 0x130, - dpp_wf_rl1 = 0x134, - dpp_wf_sr1 = 0x138, - dpp_wf_rr1 = 0x13C, - dpp_row_mirror = 0x140, - dpp_row_half_mirror = 0x141, - dpp_row_bcast15 = 0x142, - dpp_row_bcast31 = 0x143 +LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef lane) +{ + return ac_build_readlane_common(ctx, src, lane, false); +} + +LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane) +{ + return ac_build_readlane_common(ctx, src, lane, true); +} + +LLVMValueRef 
ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, + LLVMValueRef lane) +{ + return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, + (LLVMValueRef[]){value, lane, src}, 3, + AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); +} + +LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask) +{ + if (ctx->wave_size == 32) { + return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, + (LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE); + } + LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, ""); + LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, ""); + LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, ""); + LLVMValueRef val = + ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, + (LLVMValueRef[]){mask_lo, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE); + val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val}, + 2, AC_FUNC_ATTR_READNONE); + return val; +} + +enum dpp_ctrl +{ + _dpp_quad_perm = 0x000, + _dpp_row_sl = 0x100, + _dpp_row_sr = 0x110, + _dpp_row_rr = 0x120, + dpp_wf_sl1 = 0x130, + dpp_wf_rl1 = 0x134, + dpp_wf_sr1 = 0x138, + dpp_wf_rr1 = 0x13C, + dpp_row_mirror = 0x140, + dpp_row_half_mirror = 0x141, + dpp_row_bcast15 = 0x142, + dpp_row_bcast31 = 0x143 }; -static inline enum dpp_ctrl -dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) -{ - assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); - return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6); -} - -static inline enum dpp_ctrl -dpp_row_sl(unsigned amount) -{ - assert(amount > 0 && amount < 16); - return _dpp_row_sl | amount; -} - -static inline enum dpp_ctrl -dpp_row_sr(unsigned amount) -{ - assert(amount > 0 && amount < 16); - return _dpp_row_sr | amount; -} - -static LLVMValueRef -_ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, - enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, - bool bound_ctrl) -{ - LLVMTypeRef type = LLVMTypeOf(src); - LLVMValueRef res; - - old = LLVMBuildZExt(ctx->builder, old, ctx->i32, ""); - src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); - - res = ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, - (LLVMValueRef[]) { - old, src, - LLVMConstInt(ctx->i32, dpp_ctrl, 0), - LLVMConstInt(ctx->i32, row_mask, 0), - LLVMConstInt(ctx->i32, bank_mask, 0), - LLVMConstInt(ctx->i1, bound_ctrl, 0) }, - 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); - - return LLVMBuildTrunc(ctx->builder, res, type, ""); -} - -static LLVMValueRef -ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, - enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, - bool bound_ctrl) -{ - LLVMTypeRef src_type = LLVMTypeOf(src); - src = ac_to_integer(ctx, src); - old = ac_to_integer(ctx, old); - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); - LLVMValueRef ret; - if (bits > 32) { - assert(bits % 32 == 0); - LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); - LLVMValueRef src_vector = - LLVMBuildBitCast(ctx->builder, src, vec_type, ""); - LLVMValueRef old_vector = - LLVMBuildBitCast(ctx->builder, old, vec_type, ""); - ret = LLVMGetUndef(vec_type); - for (unsigned i = 0; i < bits / 32; i++) { - src = LLVMBuildExtractElement(ctx->builder, src_vector, - LLVMConstInt(ctx->i32, i, - 0), ""); - old = LLVMBuildExtractElement(ctx->builder, old_vector, - LLVMConstInt(ctx->i32, 
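
/* What ac_build_mbcnt() above computes, as a scalar reference: the popcount
 * of the mask bits strictly below the calling lane, i.e. a per-lane prefix
 * count (wave32 needs only mbcnt.lo; wave64 chains lo then hi). A sketch,
 * with tid standing in for the lane id the intrinsics receive implicitly:
 */
static unsigned mbcnt_ref(unsigned long long mask, unsigned tid)
{
   unsigned long long below = mask & ((1ull << tid) - 1); /* lanes < tid */
   return (unsigned)__builtin_popcountll(below);
}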
i, - 0), ""); - LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src, - dpp_ctrl, - row_mask, - bank_mask, - bound_ctrl); - ret = LLVMBuildInsertElement(ctx->builder, ret, - ret_comp, - LLVMConstInt(ctx->i32, i, - 0), ""); - } - } else { - ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, - bank_mask, bound_ctrl); - } - return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); -} - -static LLVMValueRef -_ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, - bool exchange_rows, bool bound_ctrl) -{ - LLVMTypeRef type = LLVMTypeOf(src); - LLVMValueRef result; - - src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); - - LLVMValueRef args[6] = { - src, - src, - LLVMConstInt(ctx->i32, sel, false), - LLVMConstInt(ctx->i32, sel >> 32, false), - ctx->i1true, /* fi */ - bound_ctrl ? ctx->i1true : ctx->i1false, - }; - - result = ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" - : "llvm.amdgcn.permlane16", - ctx->i32, args, 6, - AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); - - return LLVMBuildTrunc(ctx->builder, result, type, ""); -} - -static LLVMValueRef -ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, - bool exchange_rows, bool bound_ctrl) -{ - LLVMTypeRef src_type = LLVMTypeOf(src); - src = ac_to_integer(ctx, src); - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); - LLVMValueRef ret; - if (bits > 32) { - assert(bits % 32 == 0); - LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); - LLVMValueRef src_vector = - LLVMBuildBitCast(ctx->builder, src, vec_type, ""); - ret = LLVMGetUndef(vec_type); - for (unsigned i = 0; i < bits / 32; i++) { - src = LLVMBuildExtractElement(ctx->builder, src_vector, - LLVMConstInt(ctx->i32, i, - 0), ""); - LLVMValueRef ret_comp = - _ac_build_permlane16(ctx, src, sel, - exchange_rows, - bound_ctrl); - ret = LLVMBuildInsertElement(ctx->builder, ret, - ret_comp, - LLVMConstInt(ctx->i32, i, - 0), ""); - } - } else { - ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, - bound_ctrl); - } - return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); -} - -static inline unsigned -ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) -{ - assert(and_mask < 32 && or_mask < 32 && xor_mask < 32); - return and_mask | (or_mask << 5) | (xor_mask << 10); -} - -static LLVMValueRef -_ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) -{ - LLVMTypeRef src_type = LLVMTypeOf(src); - LLVMValueRef ret; - - src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); - - ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, - (LLVMValueRef []) { - src, LLVMConstInt(ctx->i32, mask, 0) }, - 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); - - return LLVMBuildTrunc(ctx->builder, ret, src_type, ""); -} - -LLVMValueRef -ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) -{ - LLVMTypeRef src_type = LLVMTypeOf(src); - src = ac_to_integer(ctx, src); - unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); - LLVMValueRef ret; - if (bits > 32) { - assert(bits % 32 == 0); - LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); - LLVMValueRef src_vector = - LLVMBuildBitCast(ctx->builder, src, vec_type, ""); - ret = LLVMGetUndef(vec_type); - for (unsigned i = 0; i < bits / 32; i++) { - src = LLVMBuildExtractElement(ctx->builder, src_vector, - LLVMConstInt(ctx->i32, i, - 0), ""); - LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, - mask); - ret = LLVMBuildInsertElement(ctx->builder, ret, - ret_comp, 
- LLVMConstInt(ctx->i32, i, - 0), ""); - } - } else { - ret = _ac_build_ds_swizzle(ctx, src, mask); - } - return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); -} - -static LLVMValueRef -ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src) -{ - LLVMTypeRef src_type = LLVMTypeOf(src); - unsigned bitsize = ac_get_elem_bits(ctx, src_type); - char name[32], type[8]; - LLVMValueRef ret; - - src = ac_to_integer(ctx, src); - - if (bitsize < 32) - src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); - - ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); - snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type); - ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), - (LLVMValueRef []) { src }, 1, - AC_FUNC_ATTR_READNONE); - - if (bitsize < 32) - ret = LLVMBuildTrunc(ctx->builder, ret, - ac_to_integer_type(ctx, src_type), ""); - - return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); -} - -static LLVMValueRef -ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src, - LLVMValueRef inactive) -{ - char name[33], type[8]; - LLVMTypeRef src_type = LLVMTypeOf(src); - unsigned bitsize = ac_get_elem_bits(ctx, src_type); - src = ac_to_integer(ctx, src); - inactive = ac_to_integer(ctx, inactive); - - if (bitsize < 32) { - src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); - inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, ""); - } - - ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); - snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type); - LLVMValueRef ret = - ac_build_intrinsic(ctx, name, - LLVMTypeOf(src), (LLVMValueRef []) { - src, inactive }, 2, - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); - if (bitsize < 32) - ret = LLVMBuildTrunc(ctx->builder, ret, src_type, ""); - - return ret; -} - -static LLVMValueRef -get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size) -{ - if (type_size == 1) { - switch (op) { - case nir_op_iadd: return ctx->i8_0; - case nir_op_imul: return ctx->i8_1; - case nir_op_imin: return LLVMConstInt(ctx->i8, INT8_MAX, 0); - case nir_op_umin: return LLVMConstInt(ctx->i8, UINT8_MAX, 0); - case nir_op_imax: return LLVMConstInt(ctx->i8, INT8_MIN, 0); - case nir_op_umax: return ctx->i8_0; - case nir_op_iand: return LLVMConstInt(ctx->i8, -1, 0); - case nir_op_ior: return ctx->i8_0; - case nir_op_ixor: return ctx->i8_0; - default: - unreachable("bad reduction intrinsic"); - } - } else if (type_size == 2) { - switch (op) { - case nir_op_iadd: return ctx->i16_0; - case nir_op_fadd: return ctx->f16_0; - case nir_op_imul: return ctx->i16_1; - case nir_op_fmul: return ctx->f16_1; - case nir_op_imin: return LLVMConstInt(ctx->i16, INT16_MAX, 0); - case nir_op_umin: return LLVMConstInt(ctx->i16, UINT16_MAX, 0); - case nir_op_fmin: return LLVMConstReal(ctx->f16, INFINITY); - case nir_op_imax: return LLVMConstInt(ctx->i16, INT16_MIN, 0); - case nir_op_umax: return ctx->i16_0; - case nir_op_fmax: return LLVMConstReal(ctx->f16, -INFINITY); - case nir_op_iand: return LLVMConstInt(ctx->i16, -1, 0); - case nir_op_ior: return ctx->i16_0; - case nir_op_ixor: return ctx->i16_0; - default: - unreachable("bad reduction intrinsic"); - } - } else if (type_size == 4) { - switch (op) { - case nir_op_iadd: return ctx->i32_0; - case nir_op_fadd: return ctx->f32_0; - case nir_op_imul: return ctx->i32_1; - case nir_op_fmul: return ctx->f32_1; - case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0); - case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0); - case nir_op_fmin: return 
LLVMConstReal(ctx->f32, INFINITY); - case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0); - case nir_op_umax: return ctx->i32_0; - case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY); - case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0); - case nir_op_ior: return ctx->i32_0; - case nir_op_ixor: return ctx->i32_0; - default: - unreachable("bad reduction intrinsic"); - } - } else { /* type_size == 64bit */ - switch (op) { - case nir_op_iadd: return ctx->i64_0; - case nir_op_fadd: return ctx->f64_0; - case nir_op_imul: return ctx->i64_1; - case nir_op_fmul: return ctx->f64_1; - case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0); - case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0); - case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY); - case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0); - case nir_op_umax: return ctx->i64_0; - case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY); - case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0); - case nir_op_ior: return ctx->i64_0; - case nir_op_ixor: return ctx->i64_0; - default: - unreachable("bad reduction intrinsic"); - } - } -} - -static LLVMValueRef -ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op) -{ - bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8; - bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4; - switch (op) { - case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, ""); - case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, ""); - case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, ""); - case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, ""); - case nir_op_imin: return LLVMBuildSelect(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""), - lhs, rhs, ""); - case nir_op_umin: return LLVMBuildSelect(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""), - lhs, rhs, ""); - case nir_op_fmin: return ac_build_intrinsic(ctx, - _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16", - _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, - (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); - case nir_op_imax: return LLVMBuildSelect(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""), - lhs, rhs, ""); - case nir_op_umax: return LLVMBuildSelect(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""), - lhs, rhs, ""); - case nir_op_fmax: return ac_build_intrinsic(ctx, - _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16", - _64bit ? ctx->f64 : _32bit ? 
ctx->f32 : ctx->f16, - (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE); - case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, ""); - case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, ""); - case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, ""); - default: - unreachable("bad reduction intrinsic"); - } +static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, + unsigned lane3) +{ + assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); + return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6); +} + +static inline enum dpp_ctrl dpp_row_sl(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return _dpp_row_sl | amount; +} + +static inline enum dpp_ctrl dpp_row_sr(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return _dpp_row_sr | amount; +} + +static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, + enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, + bool bound_ctrl) +{ + LLVMTypeRef type = LLVMTypeOf(src); + LLVMValueRef res; + + old = LLVMBuildZExt(ctx->builder, old, ctx->i32, ""); + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + res = ac_build_intrinsic( + ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, + (LLVMValueRef[]){old, src, LLVMConstInt(ctx->i32, dpp_ctrl, 0), + LLVMConstInt(ctx->i32, row_mask, 0), LLVMConstInt(ctx->i32, bank_mask, 0), + LLVMConstInt(ctx->i1, bound_ctrl, 0)}, + 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + return LLVMBuildTrunc(ctx->builder, res, type, ""); +} + +static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, + enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, + bool bound_ctrl) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + old = ac_to_integer(ctx, old); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits > 32) { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), ""); + old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), ""); + LLVMValueRef ret_comp = + _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl); + ret = + LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), ""); + } + } else { + ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl); + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, + uint64_t sel, bool exchange_rows, bool bound_ctrl) +{ + LLVMTypeRef type = LLVMTypeOf(src); + LLVMValueRef result; + + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + LLVMValueRef args[6] = { + src, + src, + LLVMConstInt(ctx->i32, sel, false), + LLVMConstInt(ctx->i32, sel >> 32, false), + ctx->i1true, /* fi */ + bound_ctrl ? ctx->i1true : ctx->i1false, + }; + + result = + ac_build_intrinsic(ctx, exchange_rows ? 
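
/* How the _dpp_quad_perm encoding above packs: two bits per destination
 * lane select which of the four source lanes within the quad is read.
 * For example, reversing a quad:
 */
static unsigned dpp_quad_reverse(void)
{
   /* lane0<-3, lane1<-2, lane2<-1, lane3<-0, i.e. dpp_quad_perm(3, 2, 1, 0) */
   return 3 | (2 << 2) | (1 << 4) | (0 << 6); /* == 0x1b */
}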
"llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16", + ctx->i32, args, 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + return LLVMBuildTrunc(ctx->builder, result, type, ""); +} + +static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel, + bool exchange_rows, bool bound_ctrl) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits > 32) { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), ""); + LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl); + ret = + LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), ""); + } + } else { + ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl); + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) +{ + assert(and_mask < 32 && or_mask < 32 && xor_mask < 32); + return and_mask | (or_mask << 5) | (xor_mask << 10); +} + +static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned mask) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + LLVMValueRef ret; + + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, + (LLVMValueRef[]){src, LLVMConstInt(ctx->i32, mask, 0)}, 2, + AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + return LLVMBuildTrunc(ctx->builder, ret, src_type, ""); +} + +LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits > 32) { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), ""); + LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask); + ret = + LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), ""); + } + } else { + ret = _ac_build_ds_swizzle(ctx, src, mask); + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + unsigned bitsize = ac_get_elem_bits(ctx, src_type); + char name[32], type[8]; + LLVMValueRef ret; + + src = ac_to_integer(ctx, src); + + if (bitsize < 32) + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type); + ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1, + AC_FUNC_ATTR_READNONE); + + if (bitsize < 32) + ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), ""); + + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef 
ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef inactive) +{ + char name[33], type[8]; + LLVMTypeRef src_type = LLVMTypeOf(src); + unsigned bitsize = ac_get_elem_bits(ctx, src_type); + src = ac_to_integer(ctx, src); + inactive = ac_to_integer(ctx, inactive); + + if (bitsize < 32) { + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, ""); + } + + ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type); + LLVMValueRef ret = + ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2, + AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + if (bitsize < 32) + ret = LLVMBuildTrunc(ctx->builder, ret, src_type, ""); + + return ret; +} + +static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, + unsigned type_size) +{ + if (type_size == 1) { + switch (op) { + case nir_op_iadd: + return ctx->i8_0; + case nir_op_imul: + return ctx->i8_1; + case nir_op_imin: + return LLVMConstInt(ctx->i8, INT8_MAX, 0); + case nir_op_umin: + return LLVMConstInt(ctx->i8, UINT8_MAX, 0); + case nir_op_imax: + return LLVMConstInt(ctx->i8, INT8_MIN, 0); + case nir_op_umax: + return ctx->i8_0; + case nir_op_iand: + return LLVMConstInt(ctx->i8, -1, 0); + case nir_op_ior: + return ctx->i8_0; + case nir_op_ixor: + return ctx->i8_0; + default: + unreachable("bad reduction intrinsic"); + } + } else if (type_size == 2) { + switch (op) { + case nir_op_iadd: + return ctx->i16_0; + case nir_op_fadd: + return ctx->f16_0; + case nir_op_imul: + return ctx->i16_1; + case nir_op_fmul: + return ctx->f16_1; + case nir_op_imin: + return LLVMConstInt(ctx->i16, INT16_MAX, 0); + case nir_op_umin: + return LLVMConstInt(ctx->i16, UINT16_MAX, 0); + case nir_op_fmin: + return LLVMConstReal(ctx->f16, INFINITY); + case nir_op_imax: + return LLVMConstInt(ctx->i16, INT16_MIN, 0); + case nir_op_umax: + return ctx->i16_0; + case nir_op_fmax: + return LLVMConstReal(ctx->f16, -INFINITY); + case nir_op_iand: + return LLVMConstInt(ctx->i16, -1, 0); + case nir_op_ior: + return ctx->i16_0; + case nir_op_ixor: + return ctx->i16_0; + default: + unreachable("bad reduction intrinsic"); + } + } else if (type_size == 4) { + switch (op) { + case nir_op_iadd: + return ctx->i32_0; + case nir_op_fadd: + return ctx->f32_0; + case nir_op_imul: + return ctx->i32_1; + case nir_op_fmul: + return ctx->f32_1; + case nir_op_imin: + return LLVMConstInt(ctx->i32, INT32_MAX, 0); + case nir_op_umin: + return LLVMConstInt(ctx->i32, UINT32_MAX, 0); + case nir_op_fmin: + return LLVMConstReal(ctx->f32, INFINITY); + case nir_op_imax: + return LLVMConstInt(ctx->i32, INT32_MIN, 0); + case nir_op_umax: + return ctx->i32_0; + case nir_op_fmax: + return LLVMConstReal(ctx->f32, -INFINITY); + case nir_op_iand: + return LLVMConstInt(ctx->i32, -1, 0); + case nir_op_ior: + return ctx->i32_0; + case nir_op_ixor: + return ctx->i32_0; + default: + unreachable("bad reduction intrinsic"); + } + } else { /* type_size == 64bit */ + switch (op) { + case nir_op_iadd: + return ctx->i64_0; + case nir_op_fadd: + return ctx->f64_0; + case nir_op_imul: + return ctx->i64_1; + case nir_op_fmul: + return ctx->f64_1; + case nir_op_imin: + return LLVMConstInt(ctx->i64, INT64_MAX, 0); + case nir_op_umin: + return LLVMConstInt(ctx->i64, UINT64_MAX, 0); + case nir_op_fmin: + return LLVMConstReal(ctx->f64, INFINITY); + case nir_op_imax: + return LLVMConstInt(ctx->i64, INT64_MIN, 0); + case 
nir_op_umax: + return ctx->i64_0; + case nir_op_fmax: + return LLVMConstReal(ctx->f64, -INFINITY); + case nir_op_iand: + return LLVMConstInt(ctx->i64, -1, 0); + case nir_op_ior: + return ctx->i64_0; + case nir_op_ixor: + return ctx->i64_0; + default: + unreachable("bad reduction intrinsic"); + } + } +} + +static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, + nir_op op) +{ + bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8; + bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4; + switch (op) { + case nir_op_iadd: + return LLVMBuildAdd(ctx->builder, lhs, rhs, ""); + case nir_op_fadd: + return LLVMBuildFAdd(ctx->builder, lhs, rhs, ""); + case nir_op_imul: + return LLVMBuildMul(ctx->builder, lhs, rhs, ""); + case nir_op_fmul: + return LLVMBuildFMul(ctx->builder, lhs, rhs, ""); + case nir_op_imin: + return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_umin: + return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_fmin: + return ac_build_intrinsic( + ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16", + _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, + AC_FUNC_ATTR_READNONE); + case nir_op_imax: + return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_umax: + return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""), + lhs, rhs, ""); + case nir_op_fmax: + return ac_build_intrinsic( + ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16", + _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, + AC_FUNC_ATTR_READNONE); + case nir_op_iand: + return LLVMBuildAnd(ctx->builder, lhs, rhs, ""); + case nir_op_ior: + return LLVMBuildOr(ctx->builder, lhs, rhs, ""); + case nir_op_ixor: + return LLVMBuildXor(ctx->builder, lhs, rhs, ""); + default: + unreachable("bad reduction intrinsic"); + } } /** @@ -4108,297 +3723,292 @@ ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, * prefix of this many threads * \return src, shifted 1 lane up, and identity shifted into lane 0. 
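
/* Sanity check for the identities above: op(identity, x) == x must hold so
 * that shifted-in or inactive lanes drop out of a reduction. Scalar spot
 * checks for the 32-bit integer cases:
 */
#include <assert.h>
#include <limits.h>

static void check_reduction_identities(int x)
{
   assert((0 + x) == x);                     /* iadd: identity 0 */
   assert((1 * x) == x);                     /* imul: identity 1 */
   assert((x < INT_MAX ? x : INT_MAX) == x); /* imin: identity INT32_MAX */
   assert((x > INT_MIN ? x : INT_MIN) == x); /* imax: identity INT32_MIN */
   assert((~0 & x) == x);                    /* iand: identity ~0 (-1) */
   assert((0 | x) == x);                     /* ior / ixor: identity 0 */
}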
*/ -static LLVMValueRef -ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src, - LLVMValueRef identity, unsigned maxprefix) -{ - if (ctx->chip_class >= GFX10) { - /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */ - LLVMValueRef active, tmp1, tmp2; - LLVMValueRef tid = ac_get_thread_id(ctx); - - tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); - - tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false); - - if (maxprefix > 32) { - active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, - LLVMConstInt(ctx->i32, 32, false), ""); - - tmp2 = LLVMBuildSelect(ctx->builder, active, - ac_build_readlane(ctx, src, - LLVMConstInt(ctx->i32, 31, false)), - tmp2, ""); - - active = LLVMBuildOr(ctx->builder, active, - LLVMBuildICmp(ctx->builder, LLVMIntEQ, - LLVMBuildAnd(ctx->builder, tid, - LLVMConstInt(ctx->i32, 0x1f, false), ""), - LLVMConstInt(ctx->i32, 0x10, false), ""), ""); - return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); - } else if (maxprefix > 16) { - active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, - LLVMConstInt(ctx->i32, 16, false), ""); - - return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); - } - } else if (ctx->chip_class >= GFX8) { - return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false); - } - - /* wavefront shift_right by 1 on SI/CI */ - LLVMValueRef active, tmp1, tmp2; - LLVMValueRef tid = ac_get_thread_id(ctx); - tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2)); - tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00)); - active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, - LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""), - LLVMConstInt(ctx->i32, 0x4, 0), ""); - tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); - tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00)); - active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, - LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""), - LLVMConstInt(ctx->i32, 0x8, 0), ""); - tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); - tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00)); - active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, - LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""), - LLVMConstInt(ctx->i32, 0x10, 0), ""); - tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); - tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0)); - active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), ""); - tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); - active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), ""); - return LLVMBuildSelect(ctx->builder, active, identity, tmp1, ""); +static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef identity, unsigned maxprefix) +{ + if (ctx->chip_class >= GFX10) { + /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */ + LLVMValueRef active, tmp1, tmp2; + LLVMValueRef tid = ac_get_thread_id(ctx); + + tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); + + tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false); + + if (maxprefix > 32) { + active = + LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), ""); + + tmp2 = LLVMBuildSelect(ctx->builder, active, + ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)), + tmp2, ""); + + active = LLVMBuildOr( 
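
/* What every variant of the shift above computes, expressed on an array:
 * lane i ends up with lane i-1's value and lane 0 receives the identity.
 * The DPP/permlane/ds_swizzle gymnastics exist only because lanes cannot
 * index each other directly. Scalar reference version:
 */
static void wavefront_shift_right_1_ref(const int *src, int *dst,
                                        unsigned num_lanes, int identity)
{
   dst[0] = identity;
   for (unsigned i = 1; i < num_lanes; i++)
      dst[i] = src[i - 1];
}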
+ ctx->builder, active, + LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""), + LLVMConstInt(ctx->i32, 0x10, false), ""), + ""); + return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + } else if (maxprefix > 16) { + active = + LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), ""); + + return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + } + } else if (ctx->chip_class >= GFX8) { + return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false); + } + + /* wavefront shift_right by 1 on SI/CI */ + LLVMValueRef active, tmp1, tmp2; + LLVMValueRef tid = ac_get_thread_id(ctx); + tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2)); + tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""), + LLVMConstInt(ctx->i32, 0x4, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""), + LLVMConstInt(ctx->i32, 0x8, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""), + LLVMConstInt(ctx->i32, 0x10, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0)); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), ""); + tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, ""); + active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), ""); + return LLVMBuildSelect(ctx->builder, active, identity, tmp1, ""); } /** * \param maxprefix specifies that the result only needs to be correct for a * prefix of this many threads */ -static LLVMValueRef -ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity, - unsigned maxprefix, bool inclusive) -{ - LLVMValueRef result, tmp; - - if (!inclusive) - src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix); - - result = src; - - if (ctx->chip_class <= GFX7) { - assert(maxprefix == 64); - LLVMValueRef tid = ac_get_thread_id(ctx); - LLVMValueRef active; - tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00)); - active = LLVMBuildICmp(ctx->builder, LLVMIntNE, - LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), - ctx->i32_0, ""); - tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); - result = ac_build_alu_op(ctx, result, tmp, op); - tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00)); - active = LLVMBuildICmp(ctx->builder, LLVMIntNE, - LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""), - ctx->i32_0, ""); - tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); - result = ac_build_alu_op(ctx, result, tmp, op); - tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00)); - active = LLVMBuildICmp(ctx->builder, LLVMIntNE, - LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""), - ctx->i32_0, ""); - tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); - result = 
ac_build_alu_op(ctx, result, tmp, op); - tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00)); - active = LLVMBuildICmp(ctx->builder, LLVMIntNE, - LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""), - ctx->i32_0, ""); - tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); - result = ac_build_alu_op(ctx, result, tmp, op); - tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00)); - active = LLVMBuildICmp(ctx->builder, LLVMIntNE, - LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""), - ctx->i32_0, ""); - tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); - result = ac_build_alu_op(ctx, result, tmp, op); - tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0)); - active = LLVMBuildICmp(ctx->builder, LLVMIntNE, - LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""), - ctx->i32_0, ""); - tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); - result = ac_build_alu_op(ctx, result, tmp, op); - return result; - } - - if (maxprefix <= 1) - return result; - tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 2) - return result; - tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 3) - return result; - tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 4) - return result; - tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 8) - return result; - tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 16) - return result; - - if (ctx->chip_class >= GFX10) { - LLVMValueRef tid = ac_get_thread_id(ctx); - LLVMValueRef active; - - tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false); - - active = LLVMBuildICmp(ctx->builder, LLVMIntNE, - LLVMBuildAnd(ctx->builder, tid, - LLVMConstInt(ctx->i32, 16, false), ""), - ctx->i32_0, ""); - - tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); - - result = ac_build_alu_op(ctx, result, tmp, op); - - if (maxprefix <= 32) - return result; - - tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); - - active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, - LLVMConstInt(ctx->i32, 32, false), ""); - - tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); - - result = ac_build_alu_op(ctx, result, tmp, op); - return result; - } - - tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); - result = ac_build_alu_op(ctx, result, tmp, op); - if (maxprefix <= 32) - return result; - tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); - result = ac_build_alu_op(ctx, result, tmp, op); - return result; -} - -LLVMValueRef -ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) -{ - LLVMValueRef result; - - if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { - LLVMBuilderRef builder = ctx->builder; - src = LLVMBuildZExt(builder, src, ctx->i32, ""); - result = ac_build_ballot(ctx, src); - result = ac_build_mbcnt(ctx, result); - result = LLVMBuildAdd(builder, result, src, ""); - return result; - } - - ac_build_optimization_barrier(ctx, &src); - - LLVMValueRef identity = - 
get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); - result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), - LLVMTypeOf(identity), ""); - result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true); - - return ac_build_wwm(ctx, result); -} - -LLVMValueRef -ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) -{ - LLVMValueRef result; - - if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { - LLVMBuilderRef builder = ctx->builder; - src = LLVMBuildZExt(builder, src, ctx->i32, ""); - result = ac_build_ballot(ctx, src); - result = ac_build_mbcnt(ctx, result); - return result; - } - - ac_build_optimization_barrier(ctx, &src); - - LLVMValueRef identity = - get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); - result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), - LLVMTypeOf(identity), ""); - result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false); - - return ac_build_wwm(ctx, result); -} - -LLVMValueRef -ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size) -{ - if (cluster_size == 1) return src; - ac_build_optimization_barrier(ctx, &src); - LLVMValueRef result, swap; - LLVMValueRef identity = get_reduction_identity(ctx, op, - ac_get_type_size(LLVMTypeOf(src))); - result = LLVMBuildBitCast(ctx->builder, - ac_build_set_inactive(ctx, src, identity), - LLVMTypeOf(identity), ""); - swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2); - result = ac_build_alu_op(ctx, result, swap, op); - if (cluster_size == 2) return ac_build_wwm(ctx, result); - - swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1); - result = ac_build_alu_op(ctx, result, swap, op); - if (cluster_size == 4) return ac_build_wwm(ctx, result); - - if (ctx->chip_class >= GFX8) - swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false); - else - swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04)); - result = ac_build_alu_op(ctx, result, swap, op); - if (cluster_size == 8) return ac_build_wwm(ctx, result); - - if (ctx->chip_class >= GFX8) - swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false); - else - swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08)); - result = ac_build_alu_op(ctx, result, swap, op); - if (cluster_size == 16) return ac_build_wwm(ctx, result); - - if (ctx->chip_class >= GFX10) - swap = ac_build_permlane16(ctx, result, 0, true, false); - else if (ctx->chip_class >= GFX8 && cluster_size != 32) - swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); - else - swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10)); - result = ac_build_alu_op(ctx, result, swap, op); - if (cluster_size == 32) return ac_build_wwm(ctx, result); - - if (ctx->chip_class >= GFX8) { - if (ctx->wave_size == 64) { - if (ctx->chip_class >= GFX10) - swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); - else - swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); - result = ac_build_alu_op(ctx, result, swap, op); - result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0)); - } - - return ac_build_wwm(ctx, result); - } else { - swap = ac_build_readlane(ctx, result, ctx->i32_0); - result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0)); - result = ac_build_alu_op(ctx, result, swap, op); - return ac_build_wwm(ctx, result); - } +static LLVMValueRef 
ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, + LLVMValueRef identity, unsigned maxprefix, bool inclusive) +{ + LLVMValueRef result, tmp; + + if (!inclusive) + src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix); + + result = src; + + if (ctx->chip_class <= GFX7) { + assert(maxprefix == 64); + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMValueRef active; + tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0)); + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""), + ctx->i32_0, ""); + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + result = ac_build_alu_op(ctx, result, tmp, op); + return result; + } + + if (maxprefix <= 1) + return result; + tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 2) + return result; + tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 3) + return result; + tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 4) + return result; + tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 8) + return result; + tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 16) + return result; + + if (ctx->chip_class >= GFX10) { + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMValueRef active; + + tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false); + + active = LLVMBuildICmp(ctx->builder, LLVMIntNE, + LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), 
""), + ctx->i32_0, ""); + + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + + result = ac_build_alu_op(ctx, result, tmp, op); + + if (maxprefix <= 32) + return result; + + tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); + + active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), ""); + + tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, ""); + + result = ac_build_alu_op(ctx, result, tmp, op); + return result; + } + + tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + if (maxprefix <= 32) + return result; + tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); + result = ac_build_alu_op(ctx, result, tmp, op); + return result; +} + +LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) +{ + LLVMValueRef result; + + if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { + LLVMBuilderRef builder = ctx->builder; + src = LLVMBuildZExt(builder, src, ctx->i32, ""); + result = ac_build_ballot(ctx, src); + result = ac_build_mbcnt(ctx, result); + result = LLVMBuildAdd(builder, result, src, ""); + return result; + } + + ac_build_optimization_barrier(ctx, &src); + + LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); + result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true); + + return ac_build_wwm(ctx, result); +} + +LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op) +{ + LLVMValueRef result; + + if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) { + LLVMBuilderRef builder = ctx->builder; + src = LLVMBuildZExt(builder, src, ctx->i32, ""); + result = ac_build_ballot(ctx, src); + result = ac_build_mbcnt(ctx, result); + return result; + } + + ac_build_optimization_barrier(ctx, &src); + + LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); + result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false); + + return ac_build_wwm(ctx, result); +} + +LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, + unsigned cluster_size) +{ + if (cluster_size == 1) + return src; + ac_build_optimization_barrier(ctx, &src); + LLVMValueRef result, swap; + LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src))); + result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity), + LLVMTypeOf(identity), ""); + swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 2) + return ac_build_wwm(ctx, result); + + swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 4) + return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX8) + swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 8) + return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX8) + swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 
0xf, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 16) + return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX10) + swap = ac_build_permlane16(ctx, result, 0, true, false); + else if (ctx->chip_class >= GFX8 && cluster_size != 32) + swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false); + else + swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10)); + result = ac_build_alu_op(ctx, result, swap, op); + if (cluster_size == 32) + return ac_build_wwm(ctx, result); + + if (ctx->chip_class >= GFX8) { + if (ctx->wave_size == 64) { + if (ctx->chip_class >= GFX10) + swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false)); + else + swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false); + result = ac_build_alu_op(ctx, result, swap, op); + result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0)); + } + + return ac_build_wwm(ctx, result); + } else { + swap = ac_build_readlane(ctx, result, ctx->i32_0); + result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0)); + result = ac_build_alu_op(ctx, result, swap, op); + return ac_build_wwm(ctx, result); + } } /** @@ -4408,21 +4018,20 @@ ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsign * The source value must be present in the highest lane of the wave, and the * highest lane must be live. */ -void -ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) { - if (ws->maxwaves <= 1) - return; + if (ws->maxwaves <= 1) + return; - const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false); - LLVMBuilderRef builder = ctx->builder; - LLVMValueRef tid = ac_get_thread_id(ctx); - LLVMValueRef tmp; + const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false); + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMValueRef tmp; - tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, ""); - ac_build_ifcc(ctx, tmp, 1000); - LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, "")); - ac_build_endif(ctx, 1000); + tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, ""); + ac_build_ifcc(ctx, tmp, 1000); + LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, "")); + ac_build_endif(ctx, 1000); } /** @@ -4431,61 +4040,59 @@ ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) * * The caller must place a barrier between the top and bottom halves. 
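 *
 * A typical call sequence (hypothetical caller sketch; it mirrors what
 * ac_build_wg_wavescan() below does):
 *
 *    ac_build_wg_wavescan_top(ctx, ws);
 *    ac_build_s_barrier(ctx);
 *    ac_build_wg_wavescan_bottom(ctx, ws);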
*/ -void -ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) -{ - const LLVMTypeRef type = LLVMTypeOf(ws->src); - const LLVMValueRef identity = - get_reduction_identity(ctx, ws->op, ac_get_type_size(type)); - - if (ws->maxwaves <= 1) { - ws->result_reduce = ws->src; - ws->result_inclusive = ws->src; - ws->result_exclusive = identity; - return; - } - assert(ws->maxwaves <= 32); - - LLVMBuilderRef builder = ctx->builder; - LLVMValueRef tid = ac_get_thread_id(ctx); - LLVMBasicBlockRef bbs[2]; - LLVMValueRef phivalues_scan[2]; - LLVMValueRef tmp, tmp2; - - bbs[0] = LLVMGetInsertBlock(builder); - phivalues_scan[0] = LLVMGetUndef(type); - - if (ws->enable_reduce) - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, ""); - else if (ws->enable_inclusive) - tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, ""); - else - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, ""); - ac_build_ifcc(ctx, tmp, 1001); - { - tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), ""); - - ac_build_optimization_barrier(ctx, &tmp); - - bbs[1] = LLVMGetInsertBlock(builder); - phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true); - } - ac_build_endif(ctx, 1001); - - const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs); - - if (ws->enable_reduce) { - tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, ""); - ws->result_reduce = ac_build_readlane(ctx, scan, tmp); - } - if (ws->enable_inclusive) - ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx); - if (ws->enable_exclusive) { - tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, ""); - tmp = ac_build_readlane(ctx, scan, tmp); - tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, ""); - ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, ""); - } +void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + const LLVMTypeRef type = LLVMTypeOf(ws->src); + const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type)); + + if (ws->maxwaves <= 1) { + ws->result_reduce = ws->src; + ws->result_inclusive = ws->src; + ws->result_exclusive = identity; + return; + } + assert(ws->maxwaves <= 32); + + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tid = ac_get_thread_id(ctx); + LLVMBasicBlockRef bbs[2]; + LLVMValueRef phivalues_scan[2]; + LLVMValueRef tmp, tmp2; + + bbs[0] = LLVMGetInsertBlock(builder); + phivalues_scan[0] = LLVMGetUndef(type); + + if (ws->enable_reduce) + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, ""); + else if (ws->enable_inclusive) + tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, ""); + else + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, ""); + ac_build_ifcc(ctx, tmp, 1001); + { + tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), ""); + + ac_build_optimization_barrier(ctx, &tmp); + + bbs[1] = LLVMGetInsertBlock(builder); + phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true); + } + ac_build_endif(ctx, 1001); + + const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs); + + if (ws->enable_reduce) { + tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, ""); + ws->result_reduce = ac_build_readlane(ctx, scan, tmp); + } + if (ws->enable_inclusive) + ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx); + if (ws->enable_exclusive) { + tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, ""); + tmp = 
ac_build_readlane(ctx, scan, tmp); + tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, ""); + ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, ""); + } } /** @@ -4497,12 +4104,11 @@ ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) * of the workgroup are live. (This requirement cannot easily be relaxed in a * useful manner because of the barrier in the algorithm.) */ -void -ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) { - ac_build_wg_wavescan_top(ctx, ws); - ac_build_s_barrier(ctx); - ac_build_wg_wavescan_bottom(ctx, ws); + ac_build_wg_wavescan_top(ctx, ws); + ac_build_s_barrier(ctx); + ac_build_wg_wavescan_bottom(ctx, ws); } /** @@ -4511,25 +4117,24 @@ ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) * * All lanes must be active when this code runs. */ -void -ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) -{ - if (ws->enable_exclusive) { - ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op); - if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd) - ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, ""); - ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op); - } else { - ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op); - } - - bool enable_inclusive = ws->enable_inclusive; - bool enable_exclusive = ws->enable_exclusive; - ws->enable_inclusive = false; - ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; - ac_build_wg_wavescan_top(ctx, ws); - ws->enable_inclusive = enable_inclusive; - ws->enable_exclusive = enable_exclusive; +void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + if (ws->enable_exclusive) { + ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op); + if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd) + ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, ""); + ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op); + } else { + ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op); + } + + bool enable_inclusive = ws->enable_inclusive; + bool enable_exclusive = ws->enable_exclusive; + ws->enable_inclusive = false; + ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; + ac_build_wg_wavescan_top(ctx, ws); + ws->enable_inclusive = enable_inclusive; + ws->enable_exclusive = enable_exclusive; } /** @@ -4538,22 +4143,21 @@ ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) * * The caller must place a barrier between the top and bottom halves. 
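 *
 * As with the wavescan variant, the helper ac_build_wg_scan() below is the
 * canonical caller of the two halves:
 *
 *    ac_build_wg_scan_top(ctx, ws);
 *    ac_build_s_barrier(ctx);
 *    ac_build_wg_scan_bottom(ctx, ws);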
*/ -void -ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) { - bool enable_inclusive = ws->enable_inclusive; - bool enable_exclusive = ws->enable_exclusive; - ws->enable_inclusive = false; - ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; - ac_build_wg_wavescan_bottom(ctx, ws); - ws->enable_inclusive = enable_inclusive; - ws->enable_exclusive = enable_exclusive; + bool enable_inclusive = ws->enable_inclusive; + bool enable_exclusive = ws->enable_exclusive; + ws->enable_inclusive = false; + ws->enable_exclusive = ws->enable_exclusive || enable_inclusive; + ac_build_wg_wavescan_bottom(ctx, ws); + ws->enable_inclusive = enable_inclusive; + ws->enable_exclusive = enable_exclusive; - /* ws->result_reduce is already the correct value */ - if (ws->enable_inclusive) - ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op); - if (ws->enable_exclusive) - ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op); + /* ws->result_reduce is already the correct value */ + if (ws->enable_inclusive) + ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op); + if (ws->enable_exclusive) + ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op); } /** @@ -4562,114 +4166,101 @@ ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) * The caller must ensure that all lanes are active when this code runs * (WWM is insufficient!), because there is an implied barrier. */ -void -ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) -{ - ac_build_wg_scan_top(ctx, ws); - ac_build_s_barrier(ctx); - ac_build_wg_scan_bottom(ctx, ws); -} - -LLVMValueRef -ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, - unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) -{ - unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3); - if (ctx->chip_class >= GFX8) { - return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false); - } else { - return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask); - } -} - -LLVMValueRef -ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index) -{ - LLVMTypeRef type = LLVMTypeOf(src); - LLVMValueRef result; - - index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); - src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); - - result = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, - (LLVMValueRef []) {index, src}, 2, - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); - return LLVMBuildTrunc(ctx->builder, result, type, ""); -} - -LLVMValueRef -ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize) -{ - LLVMTypeRef type; - char *intr; - - if (bitsize == 16) { - intr = "llvm.amdgcn.frexp.exp.i16.f16"; - type = ctx->i16; - } else if (bitsize == 32) { - intr = "llvm.amdgcn.frexp.exp.i32.f32"; - type = ctx->i32; - } else { - intr = "llvm.amdgcn.frexp.exp.i32.f64"; - type = ctx->i32; - } - - LLVMValueRef params[] = { - src0, - }; - return ac_build_intrinsic(ctx, intr, type, params, 1, - AC_FUNC_ATTR_READNONE); -} -LLVMValueRef -ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize) -{ - LLVMTypeRef type; - char *intr; - - if (bitsize == 16) { - intr = "llvm.amdgcn.frexp.mant.f16"; - type = ctx->f16; - } else if (bitsize == 32) { - intr = "llvm.amdgcn.frexp.mant.f32"; - type = ctx->f32; - } 
else { - intr = "llvm.amdgcn.frexp.mant.f64"; - type = ctx->f64; - } - - LLVMValueRef params[] = { - src0, - }; - return ac_build_intrinsic(ctx, intr, type, params, 1, - AC_FUNC_ATTR_READNONE); -} - -LLVMValueRef -ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize) -{ - LLVMTypeRef type; - char *intr; - - if (bitsize == 16) { - intr = "llvm.canonicalize.f16"; - type = ctx->f16; - } else if (bitsize == 32) { - intr = "llvm.canonicalize.f32"; - type = ctx->f32; - } else { - intr = "llvm.canonicalize.f64"; - type = ctx->f64; - } - - LLVMValueRef params[] = { - src0, - }; - return ac_build_intrinsic(ctx, intr, type, params, 1, - AC_FUNC_ATTR_READNONE); +void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws) +{ + ac_build_wg_scan_top(ctx, ws); + ac_build_s_barrier(ctx); + ac_build_wg_scan_bottom(ctx, ws); +} + +LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0, + unsigned lane1, unsigned lane2, unsigned lane3) +{ + unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3); + if (ctx->chip_class >= GFX8) { + return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false); + } else { + return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask); + } +} + +LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index) +{ + LLVMTypeRef type = LLVMTypeOf(src); + LLVMValueRef result; + + index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); + src = LLVMBuildZExt(ctx->builder, src, ctx->i32, ""); + + result = + ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, (LLVMValueRef[]){index, src}, 2, + AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + return LLVMBuildTrunc(ctx->builder, result, type, ""); +} + +LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.amdgcn.frexp.exp.i16.f16"; + type = ctx->i16; + } else if (bitsize == 32) { + intr = "llvm.amdgcn.frexp.exp.i32.f32"; + type = ctx->i32; + } else { + intr = "llvm.amdgcn.frexp.exp.i32.f64"; + type = ctx->i32; + } + + LLVMValueRef params[] = { + src0, + }; + return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE); +} +LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.amdgcn.frexp.mant.f16"; + type = ctx->f16; + } else if (bitsize == 32) { + intr = "llvm.amdgcn.frexp.mant.f32"; + type = ctx->f32; + } else { + intr = "llvm.amdgcn.frexp.mant.f64"; + type = ctx->f64; + } + + LLVMValueRef params[] = { + src0, + }; + return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) +{ + LLVMTypeRef type; + char *intr; + + if (bitsize == 16) { + intr = "llvm.canonicalize.f16"; + type = ctx->f16; + } else if (bitsize == 32) { + intr = "llvm.canonicalize.f32"; + type = ctx->f32; + } else { + intr = "llvm.canonicalize.f64"; + type = ctx->f64; + } + + LLVMValueRef params[] = { + src0, + }; + return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE); } /* @@ -4677,124 +4268,111 @@ ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, * and works out the X and Y derivatives. * it returns DDX(I), DDX(J), DDY(I), DDY(J). 
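 * The four values are packed, in that order, into a single 4-component
 * vector (built with ac_build_gather_values).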
*/ -LLVMValueRef -ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij) -{ - LLVMValueRef result[4], a; - unsigned i; - - for (i = 0; i < 2; i++) { - a = LLVMBuildExtractElement(ctx->builder, interp_ij, - LLVMConstInt(ctx->i32, i, false), ""); - result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a); - result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a); - } - return ac_build_gather_values(ctx, result, 4); -} - -LLVMValueRef -ac_build_load_helper_invocation(struct ac_llvm_context *ctx) -{ - LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", - ctx->i1, NULL, 0, - AC_FUNC_ATTR_READNONE); - result = LLVMBuildNot(ctx->builder, result, ""); - return LLVMBuildSExt(ctx->builder, result, ctx->i32, ""); -} - -LLVMValueRef -ac_build_is_helper_invocation(struct ac_llvm_context *ctx) -{ - if (!ctx->postponed_kill) - return ac_build_load_helper_invocation(ctx); - - /* !(exact && postponed) */ - LLVMValueRef exact = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", - ctx->i1, NULL, 0, - AC_FUNC_ATTR_READNONE); - - LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, ""); - LLVMValueRef result = LLVMBuildAnd(ctx->builder, exact, postponed, ""); - - return LLVMBuildSelect(ctx->builder, result, ctx->i32_0, - LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), ""); -} - -LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, - LLVMValueRef *args, unsigned num_args) -{ - LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, ""); - LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func)); - return ret; -} - -void -ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, - LLVMValueRef stencil, LLVMValueRef samplemask, - struct ac_export_args *args) -{ - unsigned mask = 0; - unsigned format = ac_get_spi_shader_z_format(depth != NULL, - stencil != NULL, - samplemask != NULL); - - assert(depth || stencil || samplemask); - - memset(args, 0, sizeof(*args)); - - args->valid_mask = 1; /* whether the EXEC mask is valid */ - args->done = 1; /* DONE bit */ - - /* Specify the target we are exporting */ - args->target = V_008DFC_SQ_EXP_MRTZ; - - args->compr = 0; /* COMP flag */ - args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */ - args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */ - args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */ - args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */ - - if (format == V_028710_SPI_SHADER_UINT16_ABGR) { - assert(!depth); - args->compr = 1; /* COMPR flag */ - - if (stencil) { - /* Stencil should be in X[23:16]. */ - stencil = ac_to_integer(ctx, stencil); - stencil = LLVMBuildShl(ctx->builder, stencil, - LLVMConstInt(ctx->i32, 16, 0), ""); - args->out[0] = ac_to_float(ctx, stencil); - mask |= 0x3; - } - if (samplemask) { - /* SampleMask should be in Y[15:0]. */ - args->out[1] = samplemask; - mask |= 0xc; - } - } else { - if (depth) { - args->out[0] = depth; - mask |= 0x1; - } - if (stencil) { - args->out[1] = stencil; - mask |= 0x2; - } - if (samplemask) { - args->out[2] = samplemask; - mask |= 0x4; - } - } - - /* GFX6 (except OLAND and HAINAN) has a bug that it only looks - * at the X writemask component. 
*/ - if (ctx->chip_class == GFX6 && - ctx->family != CHIP_OLAND && - ctx->family != CHIP_HAINAN) - mask |= 0x1; - - /* Specify which components to enable */ - args->enabled_channels = mask; +LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij) +{ + LLVMValueRef result[4], a; + unsigned i; + + for (i = 0; i < 2; i++) { + a = LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, i, false), ""); + result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a); + result[2 + i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a); + } + return ac_build_gather_values(ctx, result, 4); +} + +LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx) +{ + LLVMValueRef result = + ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE); + result = LLVMBuildNot(ctx->builder, result, ""); + return LLVMBuildSExt(ctx->builder, result, ctx->i32, ""); +} + +LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx) +{ + if (!ctx->postponed_kill) + return ac_build_load_helper_invocation(ctx); + + /* !(exact && postponed) */ + LLVMValueRef exact = + ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE); + + LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, ""); + LLVMValueRef result = LLVMBuildAnd(ctx->builder, exact, postponed, ""); + + return LLVMBuildSelect(ctx->builder, result, ctx->i32_0, + LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), ""); +} + +LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args, + unsigned num_args) +{ + LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, ""); + LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func)); + return ret; +} + +void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil, + LLVMValueRef samplemask, struct ac_export_args *args) +{ + unsigned mask = 0; + unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL); + + assert(depth || stencil || samplemask); + + memset(args, 0, sizeof(*args)); + + args->valid_mask = 1; /* whether the EXEC mask is valid */ + args->done = 1; /* DONE bit */ + + /* Specify the target we are exporting */ + args->target = V_008DFC_SQ_EXP_MRTZ; + + args->compr = 0; /* COMP flag */ + args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */ + args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */ + args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */ + args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */ + + if (format == V_028710_SPI_SHADER_UINT16_ABGR) { + assert(!depth); + args->compr = 1; /* COMPR flag */ + + if (stencil) { + /* Stencil should be in X[23:16]. */ + stencil = ac_to_integer(ctx, stencil); + stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), ""); + args->out[0] = ac_to_float(ctx, stencil); + mask |= 0x3; + } + if (samplemask) { + /* SampleMask should be in Y[15:0]. */ + args->out[1] = samplemask; + mask |= 0xc; + } + } else { + if (depth) { + args->out[0] = depth; + mask |= 0x1; + } + if (stencil) { + args->out[1] = stencil; + mask |= 0x2; + } + if (samplemask) { + args->out[2] = samplemask; + mask |= 0x4; + } + } + + /* GFX6 (except OLAND and HAINAN) has a bug that it only looks + * at the X writemask component. 
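+    * Force-enabling the X channel below (mask |= 0x1) works around this;
+    * out[0] still holds the undef placeholder in that case.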
*/ + if (ctx->chip_class == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN) + mask |= 0x1; + + /* Specify which components to enable */ + args->enabled_channels = mask; } /* Send GS Alloc Req message from the first wave of the group to SPI. @@ -4803,217 +4381,204 @@ ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, * - bits 12..22: primitives in group */ void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id, - LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt) -{ - LLVMBuilderRef builder = ctx->builder; - LLVMValueRef tmp; - bool export_dummy_prim = false; - - /* HW workaround for a GPU hang with 100% culling. - * We always have to export at least 1 primitive. - * Export a degenerate triangle using vertex 0 for all 3 vertices. - */ - if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) { - assert(vtx_cnt == ctx->i32_0); - prim_cnt = ctx->i32_1; - vtx_cnt = ctx->i32_1; - export_dummy_prim = true; - } - - ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020); - - tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false),""); - tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); - ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp); - - if (export_dummy_prim) { - struct ac_ngg_prim prim = {}; - /* The vertex indices are 0,0,0. */ - prim.passthrough = ctx->i32_0; - - struct ac_export_args pos = {}; - pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = ctx->f32_0; - pos.target = V_008DFC_SQ_EXP_POS; - pos.enabled_channels = 0xf; - pos.done = true; - - ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), - ctx->i32_0, ""), 5021); - ac_build_export_prim(ctx, &prim); - ac_build_export(ctx, &pos); - ac_build_endif(ctx, 5021); - } - - ac_build_endif(ctx, 5020); -} - -LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, - const struct ac_ngg_prim *prim) -{ - /* The prim export format is: - * - bits 0..8: index 0 - * - bit 9: edge flag 0 - * - bits 10..18: index 1 - * - bit 19: edge flag 1 - * - bits 20..28: index 2 - * - bit 29: edge flag 2 - * - bit 31: null primitive (skip) - */ - LLVMBuilderRef builder = ctx->builder; - LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, ""); - LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), ""); - - for (unsigned i = 0; i < prim->num_vertices; ++i) { - tmp = LLVMBuildShl(builder, prim->index[i], - LLVMConstInt(ctx->i32, 10 * i, false), ""); - result = LLVMBuildOr(builder, result, tmp, ""); - tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, ""); - tmp = LLVMBuildShl(builder, tmp, - LLVMConstInt(ctx->i32, 10 * i + 9, false), ""); - result = LLVMBuildOr(builder, result, tmp, ""); - } - return result; -} - -void ac_build_export_prim(struct ac_llvm_context *ctx, - const struct ac_ngg_prim *prim) -{ - struct ac_export_args args; - - if (prim->passthrough) { - args.out[0] = prim->passthrough; - } else { - args.out[0] = ac_pack_prim_export(ctx, prim); - } - - args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, ""); - args.out[1] = LLVMGetUndef(ctx->f32); - args.out[2] = LLVMGetUndef(ctx->f32); - args.out[3] = LLVMGetUndef(ctx->f32); - - args.target = V_008DFC_SQ_EXP_PRIM; - args.enabled_channels = 1; - args.done = true; - args.valid_mask = false; - args.compr = false; - - ac_build_export(ctx, &args); -} - -static LLVMTypeRef -arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx) -{ - if (type == AC_ARG_FLOAT) { - return size == 1 ? 
ctx->f32 : LLVMVectorType(ctx->f32, size); - } else if (type == AC_ARG_INT) { - return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size); - } else { - LLVMTypeRef ptr_type; - switch (type) { - case AC_ARG_CONST_PTR: - ptr_type = ctx->i8; - break; - case AC_ARG_CONST_FLOAT_PTR: - ptr_type = ctx->f32; - break; - case AC_ARG_CONST_PTR_PTR: - ptr_type = ac_array_in_const32_addr_space(ctx->i8); - break; - case AC_ARG_CONST_DESC_PTR: - ptr_type = ctx->v4i32; - break; - case AC_ARG_CONST_IMAGE_PTR: - ptr_type = ctx->v8i32; - break; - default: - unreachable("unknown arg type"); - } - if (size == 1) { - return ac_array_in_const32_addr_space(ptr_type); - } else { - assert(size == 2); - return ac_array_in_const_addr_space(ptr_type); - } - } -} - -LLVMValueRef -ac_build_main(const struct ac_shader_args *args, - struct ac_llvm_context *ctx, - enum ac_llvm_calling_convention convention, - const char *name, LLVMTypeRef ret_type, - LLVMModuleRef module) -{ - LLVMTypeRef arg_types[AC_MAX_ARGS]; - - for (unsigned i = 0; i < args->arg_count; i++) { - arg_types[i] = arg_llvm_type(args->args[i].type, - args->args[i].size, ctx); - } - - LLVMTypeRef main_function_type = - LLVMFunctionType(ret_type, arg_types, args->arg_count, 0); - - LLVMValueRef main_function = - LLVMAddFunction(module, name, main_function_type); - LLVMBasicBlockRef main_function_body = - LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body"); - LLVMPositionBuilderAtEnd(ctx->builder, main_function_body); - - LLVMSetFunctionCallConv(main_function, convention); - for (unsigned i = 0; i < args->arg_count; ++i) { - LLVMValueRef P = LLVMGetParam(main_function, i); - - if (args->args[i].file != AC_ARG_SGPR) - continue; - - ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG); - - if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { - ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS); - ac_add_attr_dereferenceable(P, UINT64_MAX); - ac_add_attr_alignment(P, 32); - } - } - - ctx->main_function = main_function; - - if (LLVM_VERSION_MAJOR >= 11) { - /* Enable denormals for FP16 and FP64: */ - LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", - "ieee,ieee"); - /* Disable denormals for FP32: */ - LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32", - "preserve-sign,preserve-sign"); - } - return main_function; + LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tmp; + bool export_dummy_prim = false; + + /* HW workaround for a GPU hang with 100% culling. + * We always have to export at least 1 primitive. + * Export a degenerate triangle using vertex 0 for all 3 vertices. + */ + if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) { + assert(vtx_cnt == ctx->i32_0); + prim_cnt = ctx->i32_1; + vtx_cnt = ctx->i32_1; + export_dummy_prim = true; + } + + ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020); + + tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), ""); + tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); + ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp); + + if (export_dummy_prim) { + struct ac_ngg_prim prim = {}; + /* The vertex indices are 0,0,0. 
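+       * In the passthrough prim export format an all-zero dword encodes
+       * index 0 for all three vertices, with edge flags and the null bit
+       * cleared (see ac_pack_prim_export below).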
*/ + prim.passthrough = ctx->i32_0; + + struct ac_export_args pos = {}; + pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = ctx->f32_0; + pos.target = V_008DFC_SQ_EXP_POS; + pos.enabled_channels = 0xf; + pos.done = true; + + ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""), + 5021); + ac_build_export_prim(ctx, &prim); + ac_build_export(ctx, &pos); + ac_build_endif(ctx, 5021); + } + + ac_build_endif(ctx, 5020); +} + +LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim) +{ + /* The prim export format is: + * - bits 0..8: index 0 + * - bit 9: edge flag 0 + * - bits 10..18: index 1 + * - bit 19: edge flag 1 + * - bits 20..28: index 2 + * - bit 29: edge flag 2 + * - bit 31: null primitive (skip) + */ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, ""); + LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), ""); + + for (unsigned i = 0; i < prim->num_vertices; ++i) { + tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), ""); + result = LLVMBuildOr(builder, result, tmp, ""); + tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, ""); + tmp = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 10 * i + 9, false), ""); + result = LLVMBuildOr(builder, result, tmp, ""); + } + return result; +} + +void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim) +{ + struct ac_export_args args; + + if (prim->passthrough) { + args.out[0] = prim->passthrough; + } else { + args.out[0] = ac_pack_prim_export(ctx, prim); + } + + args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, ""); + args.out[1] = LLVMGetUndef(ctx->f32); + args.out[2] = LLVMGetUndef(ctx->f32); + args.out[3] = LLVMGetUndef(ctx->f32); + + args.target = V_008DFC_SQ_EXP_PRIM; + args.enabled_channels = 1; + args.done = true; + args.valid_mask = false; + args.compr = false; + + ac_build_export(ctx, &args); +} + +static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx) +{ + if (type == AC_ARG_FLOAT) { + return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size); + } else if (type == AC_ARG_INT) { + return size == 1 ? 
ctx->i32 : LLVMVectorType(ctx->i32, size); + } else { + LLVMTypeRef ptr_type; + switch (type) { + case AC_ARG_CONST_PTR: + ptr_type = ctx->i8; + break; + case AC_ARG_CONST_FLOAT_PTR: + ptr_type = ctx->f32; + break; + case AC_ARG_CONST_PTR_PTR: + ptr_type = ac_array_in_const32_addr_space(ctx->i8); + break; + case AC_ARG_CONST_DESC_PTR: + ptr_type = ctx->v4i32; + break; + case AC_ARG_CONST_IMAGE_PTR: + ptr_type = ctx->v8i32; + break; + default: + unreachable("unknown arg type"); + } + if (size == 1) { + return ac_array_in_const32_addr_space(ptr_type); + } else { + assert(size == 2); + return ac_array_in_const_addr_space(ptr_type); + } + } +} + +LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx, + enum ac_llvm_calling_convention convention, const char *name, + LLVMTypeRef ret_type, LLVMModuleRef module) +{ + LLVMTypeRef arg_types[AC_MAX_ARGS]; + + for (unsigned i = 0; i < args->arg_count; i++) { + arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx); + } + + LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0); + + LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type); + LLVMBasicBlockRef main_function_body = + LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body"); + LLVMPositionBuilderAtEnd(ctx->builder, main_function_body); + + LLVMSetFunctionCallConv(main_function, convention); + for (unsigned i = 0; i < args->arg_count; ++i) { + LLVMValueRef P = LLVMGetParam(main_function, i); + + if (args->args[i].file != AC_ARG_SGPR) + continue; + + ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG); + + if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { + ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS); + ac_add_attr_dereferenceable(P, UINT64_MAX); + ac_add_attr_alignment(P, 32); + } + } + + ctx->main_function = main_function; + + if (LLVM_VERSION_MAJOR >= 11) { + /* Enable denormals for FP16 and FP64: */ + LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee"); + /* Disable denormals for FP32: */ + LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32", + "preserve-sign,preserve-sign"); + } + return main_function; } void ac_build_s_endpgm(struct ac_llvm_context *ctx) { - LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false); - LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false); - LLVMBuildCall(ctx->builder, code, NULL, 0, ""); + LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false); + LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false); + LLVMBuildCall(ctx->builder, code, NULL, 0, ""); } -LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, - LLVMValueRef mask, LLVMValueRef index) +LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef index) { - LLVMBuilderRef builder = ctx->builder; - LLVMTypeRef type = LLVMTypeOf(mask); + LLVMBuilderRef builder = ctx->builder; + LLVMTypeRef type = LLVMTypeOf(mask); - LLVMValueRef bit = LLVMBuildShl(builder, LLVMConstInt(type, 1, 0), - LLVMBuildZExt(builder, index, type, ""), ""); - LLVMValueRef prefix_bits = LLVMBuildSub(builder, bit, LLVMConstInt(type, 1, 0), ""); - LLVMValueRef prefix_mask = LLVMBuildAnd(builder, mask, prefix_bits, ""); - return ac_build_bit_count(ctx, prefix_mask); + LLVMValueRef bit = + LLVMBuildShl(builder, LLVMConstInt(type, 1, 0), LLVMBuildZExt(builder, index, type, 
""), ""); + LLVMValueRef prefix_bits = LLVMBuildSub(builder, bit, LLVMConstInt(type, 1, 0), ""); + LLVMValueRef prefix_mask = LLVMBuildAnd(builder, mask, prefix_bits, ""); + return ac_build_bit_count(ctx, prefix_mask); } /* Compute the prefix sum of the "mask" bit array with 128 elements (bits). */ -LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, - LLVMValueRef mask[2], LLVMValueRef index) +LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, LLVMValueRef mask[2], + LLVMValueRef index) { - LLVMBuilderRef builder = ctx->builder; + LLVMBuilderRef builder = ctx->builder; #if 0 /* Reference version using i128. */ LLVMValueRef input_mask = @@ -5021,37 +4586,37 @@ LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, return ac_prefix_bitcount(ctx, input_mask, index); #else - /* Optimized version using 2 64-bit masks. */ - LLVMValueRef is_hi, is_0, c64, c128, all_bits; - LLVMValueRef prefix_mask[2], shift[2], mask_bcnt0, prefix_bcnt[2]; - - /* Compute the 128-bit prefix mask. */ - c64 = LLVMConstInt(ctx->i32, 64, 0); - c128 = LLVMConstInt(ctx->i32, 128, 0); - all_bits = LLVMConstInt(ctx->i64, UINT64_MAX, 0); - /* The first index that can have non-zero high bits in the prefix mask is 65. */ - is_hi = LLVMBuildICmp(builder, LLVMIntUGT, index, c64, ""); - is_0 = LLVMBuildICmp(builder, LLVMIntEQ, index, ctx->i32_0, ""); - mask_bcnt0 = ac_build_bit_count(ctx, mask[0]); - - for (unsigned i = 0; i < 2; i++) { - shift[i] = LLVMBuildSub(builder, i ? c128 : c64, index, ""); - /* For i==0, index==0, the right shift by 64 doesn't give the desired result, - * so we handle it by the is_0 select. - * For i==1, index==64, same story, so we handle it by the last is_hi select. - * For i==0, index==64, we shift by 0, which is what we want. - */ - prefix_mask[i] = LLVMBuildLShr(builder, all_bits, - LLVMBuildZExt(builder, shift[i], ctx->i64, ""), ""); - prefix_mask[i] = LLVMBuildAnd(builder, mask[i], prefix_mask[i], ""); - prefix_bcnt[i] = ac_build_bit_count(ctx, prefix_mask[i]); - } - - prefix_bcnt[0] = LLVMBuildSelect(builder, is_0, ctx->i32_0, prefix_bcnt[0], ""); - prefix_bcnt[0] = LLVMBuildSelect(builder, is_hi, mask_bcnt0, prefix_bcnt[0], ""); - prefix_bcnt[1] = LLVMBuildSelect(builder, is_hi, prefix_bcnt[1], ctx->i32_0, ""); - - return LLVMBuildAdd(builder, prefix_bcnt[0], prefix_bcnt[1], ""); + /* Optimized version using 2 64-bit masks. */ + LLVMValueRef is_hi, is_0, c64, c128, all_bits; + LLVMValueRef prefix_mask[2], shift[2], mask_bcnt0, prefix_bcnt[2]; + + /* Compute the 128-bit prefix mask. */ + c64 = LLVMConstInt(ctx->i32, 64, 0); + c128 = LLVMConstInt(ctx->i32, 128, 0); + all_bits = LLVMConstInt(ctx->i64, UINT64_MAX, 0); + /* The first index that can have non-zero high bits in the prefix mask is 65. */ + is_hi = LLVMBuildICmp(builder, LLVMIntUGT, index, c64, ""); + is_0 = LLVMBuildICmp(builder, LLVMIntEQ, index, ctx->i32_0, ""); + mask_bcnt0 = ac_build_bit_count(ctx, mask[0]); + + for (unsigned i = 0; i < 2; i++) { + shift[i] = LLVMBuildSub(builder, i ? c128 : c64, index, ""); + /* For i==0, index==0, the right shift by 64 doesn't give the desired result, + * so we handle it by the is_0 select. + * For i==1, index==64, same story, so we handle it by the last is_hi select. + * For i==0, index==64, we shift by 0, which is what we want. 
+ */ + prefix_mask[i] = + LLVMBuildLShr(builder, all_bits, LLVMBuildZExt(builder, shift[i], ctx->i64, ""), ""); + prefix_mask[i] = LLVMBuildAnd(builder, mask[i], prefix_mask[i], ""); + prefix_bcnt[i] = ac_build_bit_count(ctx, prefix_mask[i]); + } + + prefix_bcnt[0] = LLVMBuildSelect(builder, is_0, ctx->i32_0, prefix_bcnt[0], ""); + prefix_bcnt[0] = LLVMBuildSelect(builder, is_hi, mask_bcnt0, prefix_bcnt[0], ""); + prefix_bcnt[1] = LLVMBuildSelect(builder, is_hi, prefix_bcnt[1], ctx->i32_0, ""); + + return LLVMBuildAdd(builder, prefix_bcnt[0], prefix_bcnt[1], ""); #endif } @@ -5059,33 +4624,26 @@ LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, * Convert triangle strip indices to triangle indices. This is used to decompose * triangle strips into triangles. */ -void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, - LLVMValueRef is_odd, - LLVMValueRef flatshade_first, - LLVMValueRef index[3]) -{ - LLVMBuilderRef builder = ctx->builder; - LLVMValueRef out[3]; - - /* We need to change the vertex order for odd triangles to get correct - * front/back facing by swapping 2 vertex indices, but we also have to - * keep the provoking vertex in the same place. - * - * If the first vertex is provoking, swap index 1 and 2. - * If the last vertex is provoking, swap index 0 and 1. - */ - out[0] = LLVMBuildSelect(builder, flatshade_first, - index[0], - LLVMBuildSelect(builder, is_odd, - index[1], index[0], ""), ""); - out[1] = LLVMBuildSelect(builder, flatshade_first, - LLVMBuildSelect(builder, is_odd, - index[2], index[1], ""), - LLVMBuildSelect(builder, is_odd, - index[0], index[1], ""), ""); - out[2] = LLVMBuildSelect(builder, flatshade_first, - LLVMBuildSelect(builder, is_odd, - index[1], index[2], ""), - index[2], ""); - memcpy(index, out, sizeof(out)); +void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd, + LLVMValueRef flatshade_first, + LLVMValueRef index[3]) +{ + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef out[3]; + + /* We need to change the vertex order for odd triangles to get correct + * front/back facing by swapping 2 vertex indices, but we also have to + * keep the provoking vertex in the same place. + * + * If the first vertex is provoking, swap index 1 and 2. + * If the last vertex is provoking, swap index 0 and 1. + */ + out[0] = LLVMBuildSelect(builder, flatshade_first, index[0], + LLVMBuildSelect(builder, is_odd, index[1], index[0], ""), ""); + out[1] = LLVMBuildSelect(builder, flatshade_first, + LLVMBuildSelect(builder, is_odd, index[2], index[1], ""), + LLVMBuildSelect(builder, is_odd, index[0], index[1], ""), ""); + out[2] = LLVMBuildSelect(builder, flatshade_first, + LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], ""); + memcpy(index, out, sizeof(out)); } diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h index a009560841d..756bbebd8f5 100644 --- a/src/amd/llvm/ac_llvm_build.h +++ b/src/amd/llvm/ac_llvm_build.h @@ -25,140 +25,134 @@ #ifndef AC_LLVM_BUILD_H #define AC_LLVM_BUILD_H -#include -#include -#include "compiler/nir/nir.h" -#include "amd_family.h" -#include "ac_shader_util.h" -#include "ac_shader_args.h" #include "ac_shader_abi.h" +#include "ac_shader_args.h" +#include "ac_shader_util.h" +#include "amd_family.h" +#include "compiler/nir/nir.h" +#include + +#include #ifdef __cplusplus extern "C" { #endif -enum { - AC_ADDR_SPACE_FLAT = 0, /* Slower than global. 
*/ - AC_ADDR_SPACE_GLOBAL = 1, - AC_ADDR_SPACE_GDS = 2, - AC_ADDR_SPACE_LDS = 3, - AC_ADDR_SPACE_CONST = 4, /* Global allowing SMEM. */ - AC_ADDR_SPACE_CONST_32BIT = 6, /* same as CONST, but the pointer type has 32 bits */ +enum +{ + AC_ADDR_SPACE_FLAT = 0, /* Slower than global. */ + AC_ADDR_SPACE_GLOBAL = 1, + AC_ADDR_SPACE_GDS = 2, + AC_ADDR_SPACE_LDS = 3, + AC_ADDR_SPACE_CONST = 4, /* Global allowing SMEM. */ + AC_ADDR_SPACE_CONST_32BIT = 6, /* same as CONST, but the pointer type has 32 bits */ }; -#define AC_WAIT_LGKM (1 << 0) /* LDS, GDS, constant, message */ -#define AC_WAIT_VLOAD (1 << 1) /* VMEM load/sample instructions */ -#define AC_WAIT_VSTORE (1 << 2) /* VMEM store instructions */ +#define AC_WAIT_LGKM (1 << 0) /* LDS, GDS, constant, message */ +#define AC_WAIT_VLOAD (1 << 1) /* VMEM load/sample instructions */ +#define AC_WAIT_VSTORE (1 << 2) /* VMEM store instructions */ struct ac_llvm_flow; struct ac_llvm_compiler; enum ac_float_mode; struct ac_llvm_flow_state { - struct ac_llvm_flow *stack; - unsigned depth_max; - unsigned depth; + struct ac_llvm_flow *stack; + unsigned depth_max; + unsigned depth; }; struct ac_llvm_context { - LLVMContextRef context; - LLVMModuleRef module; - LLVMBuilderRef builder; - - LLVMValueRef main_function; - - LLVMTypeRef voidt; - LLVMTypeRef i1; - LLVMTypeRef i8; - LLVMTypeRef i16; - LLVMTypeRef i32; - LLVMTypeRef i64; - LLVMTypeRef i128; - LLVMTypeRef intptr; - LLVMTypeRef f16; - LLVMTypeRef f32; - LLVMTypeRef f64; - LLVMTypeRef v2i16; - LLVMTypeRef v4i16; - LLVMTypeRef v2f16; - LLVMTypeRef v4f16; - LLVMTypeRef v2i32; - LLVMTypeRef v3i32; - LLVMTypeRef v4i32; - LLVMTypeRef v2f32; - LLVMTypeRef v3f32; - LLVMTypeRef v4f32; - LLVMTypeRef v8i32; - LLVMTypeRef iN_wavemask; - LLVMTypeRef iN_ballotmask; - - LLVMValueRef i8_0; - LLVMValueRef i8_1; - LLVMValueRef i16_0; - LLVMValueRef i16_1; - LLVMValueRef i32_0; - LLVMValueRef i32_1; - LLVMValueRef i64_0; - LLVMValueRef i64_1; - LLVMValueRef i128_0; - LLVMValueRef i128_1; - LLVMValueRef f16_0; - LLVMValueRef f16_1; - LLVMValueRef f32_0; - LLVMValueRef f32_1; - LLVMValueRef f64_0; - LLVMValueRef f64_1; - LLVMValueRef i1true; - LLVMValueRef i1false; - - /* Temporary helper to implement demote_to_helper: - * True = live lanes - * False = demoted lanes - */ - LLVMValueRef postponed_kill; - - /* Since ac_nir_translate makes a local copy of ac_llvm_context, there - * are two ac_llvm_contexts. Declare a pointer here, so that the control - * flow stack is shared by both ac_llvm_contexts. 
- */ - struct ac_llvm_flow_state *flow; - - unsigned range_md_kind; - unsigned invariant_load_md_kind; - unsigned uniform_md_kind; - LLVMValueRef empty_md; - - enum chip_class chip_class; - enum radeon_family family; - - unsigned wave_size; - unsigned ballot_mask_bits; - - unsigned float_mode; - - LLVMValueRef lds; + LLVMContextRef context; + LLVMModuleRef module; + LLVMBuilderRef builder; + + LLVMValueRef main_function; + + LLVMTypeRef voidt; + LLVMTypeRef i1; + LLVMTypeRef i8; + LLVMTypeRef i16; + LLVMTypeRef i32; + LLVMTypeRef i64; + LLVMTypeRef i128; + LLVMTypeRef intptr; + LLVMTypeRef f16; + LLVMTypeRef f32; + LLVMTypeRef f64; + LLVMTypeRef v2i16; + LLVMTypeRef v4i16; + LLVMTypeRef v2f16; + LLVMTypeRef v4f16; + LLVMTypeRef v2i32; + LLVMTypeRef v3i32; + LLVMTypeRef v4i32; + LLVMTypeRef v2f32; + LLVMTypeRef v3f32; + LLVMTypeRef v4f32; + LLVMTypeRef v8i32; + LLVMTypeRef iN_wavemask; + LLVMTypeRef iN_ballotmask; + + LLVMValueRef i8_0; + LLVMValueRef i8_1; + LLVMValueRef i16_0; + LLVMValueRef i16_1; + LLVMValueRef i32_0; + LLVMValueRef i32_1; + LLVMValueRef i64_0; + LLVMValueRef i64_1; + LLVMValueRef i128_0; + LLVMValueRef i128_1; + LLVMValueRef f16_0; + LLVMValueRef f16_1; + LLVMValueRef f32_0; + LLVMValueRef f32_1; + LLVMValueRef f64_0; + LLVMValueRef f64_1; + LLVMValueRef i1true; + LLVMValueRef i1false; + + /* Temporary helper to implement demote_to_helper: + * True = live lanes + * False = demoted lanes + */ + LLVMValueRef postponed_kill; + + /* Since ac_nir_translate makes a local copy of ac_llvm_context, there + * are two ac_llvm_contexts. Declare a pointer here, so that the control + * flow stack is shared by both ac_llvm_contexts. + */ + struct ac_llvm_flow_state *flow; + + unsigned range_md_kind; + unsigned invariant_load_md_kind; + unsigned uniform_md_kind; + LLVMValueRef empty_md; + + enum chip_class chip_class; + enum radeon_family family; + + unsigned wave_size; + unsigned ballot_mask_bits; + + unsigned float_mode; + + LLVMValueRef lds; }; -void -ac_llvm_context_init(struct ac_llvm_context *ctx, - struct ac_llvm_compiler *compiler, - enum chip_class chip_class, enum radeon_family family, - enum ac_float_mode float_mode, unsigned wave_size, - unsigned ballot_mask_bits); +void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler, + enum chip_class chip_class, enum radeon_family family, + enum ac_float_mode float_mode, unsigned wave_size, + unsigned ballot_mask_bits); -void -ac_llvm_context_dispose(struct ac_llvm_context *ctx); +void ac_llvm_context_dispose(struct ac_llvm_context *ctx); -int -ac_get_llvm_num_components(LLVMValueRef value); +int ac_get_llvm_num_components(LLVMValueRef value); -int -ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type); +int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type); -LLVMValueRef -ac_llvm_extract_elem(struct ac_llvm_context *ac, - LLVMValueRef value, - int index); +LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index); unsigned ac_get_type_size(LLVMTypeRef type); @@ -168,28 +162,22 @@ LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t); LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v); -LLVMValueRef -ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name, - LLVMTypeRef return_type, LLVMValueRef *params, - unsigned param_count, unsigned attrib_mask); +LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const 
char *name, + LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count, + unsigned attrib_mask); void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize); -LLVMValueRef -ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, - unsigned count_incoming, LLVMValueRef *values, - LLVMBasicBlockRef *blocks); +LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming, + LLVMValueRef *values, LLVMBasicBlockRef *blocks); void ac_build_s_barrier(struct ac_llvm_context *ctx); -void ac_build_optimization_barrier(struct ac_llvm_context *ctx, - LLVMValueRef *pvgpr); +void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pvgpr); -LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, - nir_scope scope); +LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope); LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value); -LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, - LLVMValueRef value); +LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value); LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value); @@ -197,276 +185,153 @@ LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value); LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value); -LLVMValueRef -ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, - unsigned value_count, unsigned component); - -LLVMValueRef -ac_build_gather_values_extended(struct ac_llvm_context *ctx, - LLVMValueRef *values, - unsigned value_count, - unsigned value_stride, - bool load, - bool always_vector); -LLVMValueRef -ac_build_gather_values(struct ac_llvm_context *ctx, - LLVMValueRef *values, - unsigned value_count); - -LLVMValueRef -ac_extract_components(struct ac_llvm_context *ctx, - LLVMValueRef value, - unsigned start, - unsigned channels); - -LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, - LLVMValueRef value, - unsigned num_channels); +LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, + unsigned value_count, unsigned component); + +LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values, + unsigned value_count, unsigned value_stride, bool load, + bool always_vector); +LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, + unsigned value_count); + +LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start, + unsigned channels); + +LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value, + unsigned num_channels); LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value); -LLVMValueRef -ac_build_fdiv(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef den); - -LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef pre_shift, - LLVMValueRef post_shift, - LLVMValueRef increment); -LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef pre_shift, - LLVMValueRef post_shift, - LLVMValueRef increment); -LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, - LLVMValueRef num, - LLVMValueRef multiplier, - LLVMValueRef post_shift); - -void -ac_prepare_cube_coords(struct ac_llvm_context *ctx, - bool 
is_deriv, bool is_array, bool is_lod, - LLVMValueRef *coords_arg, - LLVMValueRef *derivs_arg); - - -LLVMValueRef -ac_build_fs_interp(struct ac_llvm_context *ctx, - LLVMValueRef llvm_chan, - LLVMValueRef attr_number, - LLVMValueRef params, - LLVMValueRef i, - LLVMValueRef j); - -LLVMValueRef -ac_build_fs_interp_f16(struct ac_llvm_context *ctx, - LLVMValueRef llvm_chan, - LLVMValueRef attr_number, - LLVMValueRef params, - LLVMValueRef i, - LLVMValueRef j); - -LLVMValueRef -ac_build_fs_interp_mov(struct ac_llvm_context *ctx, - LLVMValueRef parameter, - LLVMValueRef llvm_chan, - LLVMValueRef attr_number, - LLVMValueRef params); - -LLVMValueRef -ac_build_gep_ptr(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, - LLVMValueRef index); - -LLVMValueRef -ac_build_gep0(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, - LLVMValueRef index); +LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den); + +LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num, + LLVMValueRef multiplier, LLVMValueRef pre_shift, + LLVMValueRef post_shift, LLVMValueRef increment); +LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num, + LLVMValueRef multiplier, LLVMValueRef pre_shift, + LLVMValueRef post_shift, LLVMValueRef increment); +LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num, + LLVMValueRef multiplier, LLVMValueRef post_shift); + +void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod, + LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg); + +LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan, + LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, + LLVMValueRef j); + +LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan, + LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i, + LLVMValueRef j); + +LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter, + LLVMValueRef llvm_chan, LLVMValueRef attr_number, + LLVMValueRef params); + +LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index); + +LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index); LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMValueRef index); - -void -ac_build_indexed_store(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index, - LLVMValueRef value); - -LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, - LLVMValueRef index); -LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index); -LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index); + LLVMValueRef index); + +void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index, + LLVMValueRef value); + +LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index); +LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index); +LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, + LLVMValueRef index); LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index); - -void 
-ac_build_buffer_store_dword(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - unsigned num_channels, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned inst_offset, - unsigned cache_policy); - -void -ac_build_buffer_store_format(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef data, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned cache_policy); - -LLVMValueRef -ac_build_buffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - int num_channels, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned inst_offset, - unsigned cache_policy, - bool can_speculate, - bool allow_smem); - -LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - unsigned num_channels, - unsigned cache_policy, - bool can_speculate, - bool d16); - -LLVMValueRef -ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned cache_policy); - -LLVMValueRef -ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned cache_policy); - -LLVMValueRef -ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool can_speculate); - -LLVMValueRef -ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy, - bool can_speculate); + LLVMValueRef base_ptr, LLVMValueRef index); + +void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, + unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset, + unsigned inst_offset, unsigned cache_policy); + +void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data, + LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy); + +LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels, + LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset, + unsigned inst_offset, unsigned cache_policy, bool can_speculate, + bool allow_smem); + +LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef vindex, LLVMValueRef voffset, + unsigned num_channels, unsigned cache_policy, + bool can_speculate, bool d16); + +LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef voffset, LLVMValueRef soffset, + LLVMValueRef immoffset, unsigned cache_policy); + +LLVMValueRef ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef voffset, LLVMValueRef soffset, + LLVMValueRef immoffset, unsigned cache_policy); + +LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef vindex, LLVMValueRef voffset, + LLVMValueRef soffset, LLVMValueRef immoffset, + unsigned num_channels, unsigned dfmt, unsigned nfmt, + unsigned cache_policy, bool can_speculate); + +LLVMValueRef ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef 
voffset, LLVMValueRef soffset, + LLVMValueRef immoffset, unsigned num_channels, unsigned dfmt, + unsigned nfmt, unsigned cache_policy, bool can_speculate); /* For ac_build_fetch_format. * * Note: FLOAT must be 0 (used for convenience of encoding in radeonsi). */ -enum { - AC_FETCH_FORMAT_FLOAT = 0, - AC_FETCH_FORMAT_FIXED, - AC_FETCH_FORMAT_UNORM, - AC_FETCH_FORMAT_SNORM, - AC_FETCH_FORMAT_USCALED, - AC_FETCH_FORMAT_SSCALED, - AC_FETCH_FORMAT_UINT, - AC_FETCH_FORMAT_SINT, +enum +{ + AC_FETCH_FORMAT_FLOAT = 0, + AC_FETCH_FORMAT_FIXED, + AC_FETCH_FORMAT_UNORM, + AC_FETCH_FORMAT_SNORM, + AC_FETCH_FORMAT_USCALED, + AC_FETCH_FORMAT_SSCALED, + AC_FETCH_FORMAT_UINT, + AC_FETCH_FORMAT_SINT, }; -LLVMValueRef -ac_build_opencoded_load_format(struct ac_llvm_context *ctx, - unsigned log_size, - unsigned num_channels, - unsigned format, - bool reverse, - bool known_aligned, - LLVMValueRef rsrc, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy, - bool can_speculate); - -void -ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy); - -void -ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned cache_policy); - -void -ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy); - -void -ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - LLVMValueRef voffset, - LLVMValueRef soffset, - LLVMValueRef immoffset, - unsigned num_channels, - unsigned dfmt, - unsigned nfmt, - unsigned cache_policy); - -LLVMValueRef -ac_get_thread_id(struct ac_llvm_context *ctx); +LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size, + unsigned num_channels, unsigned format, bool reverse, + bool known_aligned, LLVMValueRef rsrc, + LLVMValueRef vindex, LLVMValueRef voffset, + LLVMValueRef soffset, unsigned cache_policy, + bool can_speculate); + +void ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset, + unsigned cache_policy); + +void ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, + LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy); + +void ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, + LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset, + LLVMValueRef soffset, LLVMValueRef immoffset, + unsigned num_channels, unsigned dfmt, unsigned nfmt, + unsigned cache_policy); + +void ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, + LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset, + unsigned num_channels, unsigned dfmt, unsigned nfmt, + unsigned cache_policy); + +LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx); #define AC_TID_MASK_TOP_LEFT 0xfffffffc #define AC_TID_MASK_TOP 0xfffffffd #define AC_TID_MASK_LEFT 0xfffffffe -LLVMValueRef -ac_build_ddxy(struct ac_llvm_context *ctx, - uint32_t mask, - int idx, - LLVMValueRef val); +LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int 
idx, LLVMValueRef val); -#define AC_SENDMSG_GS 2 -#define AC_SENDMSG_GS_DONE 3 +#define AC_SENDMSG_GS 2 +#define AC_SENDMSG_GS_DONE 3 #define AC_SENDMSG_GS_ALLOC_REQ 9 #define AC_SENDMSG_GS_OP_NOP (0 << 4) @@ -474,154 +339,132 @@ ac_build_ddxy(struct ac_llvm_context *ctx, #define AC_SENDMSG_GS_OP_EMIT (2 << 4) #define AC_SENDMSG_GS_OP_EMIT_CUT (3 << 4) -void ac_build_sendmsg(struct ac_llvm_context *ctx, - uint32_t msg, - LLVMValueRef wave_id); - -LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, - LLVMValueRef arg, - LLVMTypeRef dst_type); - -LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, - LLVMValueRef arg, - LLVMTypeRef dst_type); -LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b); -LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b); -LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b); -LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, - LLVMValueRef b); +void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id); + +LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type); + +LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type); +LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); +LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); +LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); +LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b); LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value); struct ac_export_args { - LLVMValueRef out[4]; - unsigned target; - unsigned enabled_channels; - bool compr; - bool done; - bool valid_mask; + LLVMValueRef out[4]; + unsigned target; + unsigned enabled_channels; + bool compr; + bool done; + bool valid_mask; }; void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a); void ac_build_export_null(struct ac_llvm_context *ctx); -enum ac_image_opcode { - ac_image_sample, - ac_image_gather4, - ac_image_load, - ac_image_load_mip, - ac_image_store, - ac_image_store_mip, - ac_image_get_lod, - ac_image_get_resinfo, - ac_image_atomic, - ac_image_atomic_cmpswap, +enum ac_image_opcode +{ + ac_image_sample, + ac_image_gather4, + ac_image_load, + ac_image_load_mip, + ac_image_store, + ac_image_store_mip, + ac_image_get_lod, + ac_image_get_resinfo, + ac_image_atomic, + ac_image_atomic_cmpswap, }; -enum ac_atomic_op { - ac_atomic_swap, - ac_atomic_add, - ac_atomic_sub, - ac_atomic_smin, - ac_atomic_umin, - ac_atomic_smax, - ac_atomic_umax, - ac_atomic_and, - ac_atomic_or, - ac_atomic_xor, - ac_atomic_inc_wrap, - ac_atomic_dec_wrap, +enum ac_atomic_op +{ + ac_atomic_swap, + ac_atomic_add, + ac_atomic_sub, + ac_atomic_smin, + ac_atomic_umin, + ac_atomic_smax, + ac_atomic_umax, + ac_atomic_and, + ac_atomic_or, + ac_atomic_xor, + ac_atomic_inc_wrap, + ac_atomic_dec_wrap, }; /* These cache policy bits match the definitions used by the LLVM intrinsics. 
*/ -enum ac_image_cache_policy { - ac_glc = 1 << 0, /* per-CU cache control */ - ac_slc = 1 << 1, /* global L2 cache control */ - ac_dlc = 1 << 2, /* per-shader-array cache control */ - ac_swizzled = 1 << 3, /* the access is swizzled, disabling load/store merging */ +enum ac_image_cache_policy +{ + ac_glc = 1 << 0, /* per-CU cache control */ + ac_slc = 1 << 1, /* global L2 cache control */ + ac_dlc = 1 << 2, /* per-shader-array cache control */ + ac_swizzled = 1 << 3, /* the access is swizzled, disabling load/store merging */ }; struct ac_image_args { - enum ac_image_opcode opcode : 4; - enum ac_atomic_op atomic : 4; /* for the ac_image_atomic opcode */ - enum ac_image_dim dim : 3; - unsigned dmask : 4; - unsigned cache_policy : 3; - bool unorm : 1; - bool level_zero : 1; - bool d16 : 1; /* data and return values are 16-bit, requires GFX8+ */ - unsigned attributes; /* additional call-site specific AC_FUNC_ATTRs */ - - LLVMValueRef resource; - LLVMValueRef sampler; - LLVMValueRef data[2]; /* data[0] is source data (vector); data[1] is cmp for cmpswap */ - LLVMValueRef offset; - LLVMValueRef bias; - LLVMValueRef compare; - LLVMValueRef derivs[6]; - LLVMValueRef coords[4]; - LLVMValueRef lod; // also used by ac_image_get_resinfo - LLVMValueRef min_lod; + enum ac_image_opcode opcode : 4; + enum ac_atomic_op atomic : 4; /* for the ac_image_atomic opcode */ + enum ac_image_dim dim : 3; + unsigned dmask : 4; + unsigned cache_policy : 3; + bool unorm : 1; + bool level_zero : 1; + bool d16 : 1; /* data and return values are 16-bit, requires GFX8+ */ + unsigned attributes; /* additional call-site specific AC_FUNC_ATTRs */ + + LLVMValueRef resource; + LLVMValueRef sampler; + LLVMValueRef data[2]; /* data[0] is source data (vector); data[1] is cmp for cmpswap */ + LLVMValueRef offset; + LLVMValueRef bias; + LLVMValueRef compare; + LLVMValueRef derivs[6]; + LLVMValueRef coords[4]; + LLVMValueRef lod; // also used by ac_image_get_resinfo + LLVMValueRef min_lod; }; -LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, - struct ac_image_args *a); -LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, - LLVMValueRef rsrc); -LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]); -LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]); -LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, - LLVMValueRef args[2]); -LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, - LLVMValueRef args[2], unsigned bits, bool hi); -LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, - LLVMValueRef args[2], unsigned bits, bool hi); +LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a); +LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc); +LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2]); +LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2]); +LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2]); +LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits, + bool hi); +LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits, + bool hi); LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1); void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1); -LLVMValueRef ac_build_bfe(struct 
ac_llvm_context *ctx, LLVMValueRef input, - LLVMValueRef offset, LLVMValueRef width, - bool is_signed); -LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, - LLVMValueRef s1, LLVMValueRef s2); -LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, - LLVMValueRef s1, LLVMValueRef s2); +LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset, + LLVMValueRef width, bool is_signed); +LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1, + LLVMValueRef s2); +LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1, + LLVMValueRef s2); void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags); -LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize); +LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize); LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value); LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0); LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src); LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0); -LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, - LLVMValueRef src0); +LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0); -void ac_optimize_vs_outputs(struct ac_llvm_context *ac, - LLVMValueRef main_fn, - uint8_t *vs_output_param_offset, - uint32_t num_outputs, - uint32_t skip_output_mask, - uint8_t *num_param_exports); +void ac_optimize_vs_outputs(struct ac_llvm_context *ac, LLVMValueRef main_fn, + uint8_t *vs_output_param_offset, uint32_t num_outputs, + uint32_t skip_output_mask, uint8_t *num_param_exports); void ac_init_exec_full_mask(struct ac_llvm_context *ctx); void ac_declare_lds_as_pointer(struct ac_llvm_context *ac); -LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, - LLVMValueRef dw_addr); -void ac_lds_store(struct ac_llvm_context *ctx, - LLVMValueRef dw_addr, LLVMValueRef value); +LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr); +void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value); -LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, - LLVMTypeRef dst_type, - LLVMValueRef src0); +LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0); LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type); LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type); @@ -633,178 +476,141 @@ void ac_build_else(struct ac_llvm_context *ctx, int lable_id); void ac_build_endif(struct ac_llvm_context *ctx, int lable_id); void ac_build_endloop(struct ac_llvm_context *ctx, int lable_id); void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id); -void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, - int lable_id); -void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, - int lable_id); +void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, int lable_id); +void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, int lable_id); -LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, - const char *name); -LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, - const char *name); +LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, 
const char *name); +LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name); -LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMTypeRef type); +LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type); -LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, - unsigned count); +LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count); -LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, - unsigned rshift, unsigned bitwidth); +LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift, + unsigned bitwidth); -void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, - LLVMValueRef *addr, bool is_array_tex); +void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr, + bool is_array_tex); -LLVMValueRef -ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask); +LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask); -LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, - LLVMValueRef src, LLVMValueRef lane); +LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef lane); -LLVMValueRef -ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane); +LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane); -LLVMValueRef -ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane); +LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, + LLVMValueRef lane); -LLVMValueRef -ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask); +LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask); -LLVMValueRef -ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op); +LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op); -LLVMValueRef -ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op); +LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op); -LLVMValueRef -ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size); +LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, + unsigned cluster_size); /** * Common arguments for a scan/reduce operation that accumulates per-wave * values across an entire workgroup, while respecting the order of waves. */ struct ac_wg_scan { - bool enable_reduce; - bool enable_exclusive; - bool enable_inclusive; - nir_op op; - LLVMValueRef src; /* clobbered! */ - LLVMValueRef result_reduce; - LLVMValueRef result_exclusive; - LLVMValueRef result_inclusive; - LLVMValueRef extra; - LLVMValueRef waveidx; - LLVMValueRef numwaves; /* only needed for "reduce" operations */ - - /* T addrspace(LDS) pointer to the same type as value, at least maxwaves entries */ - LLVMValueRef scratch; - unsigned maxwaves; + bool enable_reduce; + bool enable_exclusive; + bool enable_inclusive; + nir_op op; + LLVMValueRef src; /* clobbered! 
*/ + LLVMValueRef result_reduce; + LLVMValueRef result_exclusive; + LLVMValueRef result_inclusive; + LLVMValueRef extra; + LLVMValueRef waveidx; + LLVMValueRef numwaves; /* only needed for "reduce" operations */ + + /* T addrspace(LDS) pointer to the same type as value, at least maxwaves entries */ + LLVMValueRef scratch; + unsigned maxwaves; }; -void -ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); -void -ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); -void -ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); -void -ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); -void -ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); -void -ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); +void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws); -LLVMValueRef -ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, - unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3); +LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0, + unsigned lane1, unsigned lane2, unsigned lane3); -LLVMValueRef -ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index); +LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index); -LLVMValueRef -ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize); +LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize); -LLVMValueRef -ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize); +LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize); -LLVMValueRef -ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, - unsigned bitsize); +LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, + unsigned bitsize); -LLVMValueRef -ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij); +LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij); -LLVMValueRef -ac_build_load_helper_invocation(struct ac_llvm_context *ctx); +LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx); -LLVMValueRef -ac_build_is_helper_invocation(struct ac_llvm_context *ctx); +LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx); -LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, - LLVMValueRef *args, unsigned num_args); +LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args, + unsigned num_args); LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, - LLVMValueRef ptr, LLVMValueRef val, - const char *sync_scope); + LLVMValueRef ptr, LLVMValueRef val, const char *sync_scope); LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMValueRef cmp, LLVMValueRef val, - const char *sync_scope); + LLVMValueRef cmp, 
LLVMValueRef val, const char *sync_scope); -void -ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, - LLVMValueRef stencil, LLVMValueRef samplemask, - struct ac_export_args *args); +void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil, + LLVMValueRef samplemask, struct ac_export_args *args); void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id, - LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt); + LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt); struct ac_ngg_prim { - unsigned num_vertices; - LLVMValueRef isnull; - LLVMValueRef index[3]; - LLVMValueRef edgeflag[3]; - LLVMValueRef passthrough; + unsigned num_vertices; + LLVMValueRef isnull; + LLVMValueRef index[3]; + LLVMValueRef edgeflag[3]; + LLVMValueRef passthrough; }; -LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, - const struct ac_ngg_prim *prim); -void ac_build_export_prim(struct ac_llvm_context *ctx, - const struct ac_ngg_prim *prim); +LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim); +void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim); -static inline LLVMValueRef -ac_get_arg(struct ac_llvm_context *ctx, struct ac_arg arg) +static inline LLVMValueRef ac_get_arg(struct ac_llvm_context *ctx, struct ac_arg arg) { - assert(arg.used); - return LLVMGetParam(ctx->main_function, arg.arg_index); + assert(arg.used); + return LLVMGetParam(ctx->main_function, arg.arg_index); } -enum ac_llvm_calling_convention { - AC_LLVM_AMDGPU_VS = 87, - AC_LLVM_AMDGPU_GS = 88, - AC_LLVM_AMDGPU_PS = 89, - AC_LLVM_AMDGPU_CS = 90, - AC_LLVM_AMDGPU_HS = 93, +enum ac_llvm_calling_convention +{ + AC_LLVM_AMDGPU_VS = 87, + AC_LLVM_AMDGPU_GS = 88, + AC_LLVM_AMDGPU_PS = 89, + AC_LLVM_AMDGPU_CS = 90, + AC_LLVM_AMDGPU_HS = 93, }; -LLVMValueRef ac_build_main(const struct ac_shader_args *args, - struct ac_llvm_context *ctx, - enum ac_llvm_calling_convention convention, - const char *name, LLVMTypeRef ret_type, - LLVMModuleRef module); +LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx, + enum ac_llvm_calling_convention convention, const char *name, + LLVMTypeRef ret_type, LLVMModuleRef module); void ac_build_s_endpgm(struct ac_llvm_context *ctx); -LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, - LLVMValueRef mask, LLVMValueRef index); -LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, - LLVMValueRef mask[2], LLVMValueRef index); -void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, - LLVMValueRef is_odd, - LLVMValueRef flatshade_first, - LLVMValueRef index[3]); +LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef index); +LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, LLVMValueRef mask[2], + LLVMValueRef index); +void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd, + LLVMValueRef flatshade_first, + LLVMValueRef index[3]); #ifdef __cplusplus } diff --git a/src/amd/llvm/ac_llvm_cull.c b/src/amd/llvm/ac_llvm_cull.c index c76d4e1f937..a6ed2680927 100644 --- a/src/amd/llvm/ac_llvm_cull.c +++ b/src/amd/llvm/ac_llvm_cull.c @@ -24,205 +24,188 @@ */ #include "ac_llvm_cull.h" + #include struct ac_position_w_info { - /* If a primitive intersects the W=0 plane, it causes a reflection - * of the determinant used for face culling. 
Every vertex behind - * the W=0 plane negates the determinant, so having 2 vertices behind - * the plane has no effect. This is i1 true if the determinant should be - * negated. - */ - LLVMValueRef w_reflection; - - /* If we simplify the "-w <= p <= w" view culling equation, we get - * "-w <= w", which can't be satisfied when w is negative. - * In perspective projection, a negative W means that the primitive - * is behind the viewer, but the equation is independent of the type - * of projection. - * - * w_accepted is false when all W are negative and therefore - * the primitive is invisible. - */ - LLVMValueRef w_accepted; - - LLVMValueRef all_w_positive; - LLVMValueRef any_w_negative; + /* If a primitive intersects the W=0 plane, it causes a reflection + * of the determinant used for face culling. Every vertex behind + * the W=0 plane negates the determinant, so having 2 vertices behind + * the plane has no effect. This is i1 true if the determinant should be + * negated. + */ + LLVMValueRef w_reflection; + + /* If we simplify the "-w <= p <= w" view culling equation, we get + * "-w <= w", which can't be satisfied when w is negative. + * In perspective projection, a negative W means that the primitive + * is behind the viewer, but the equation is independent of the type + * of projection. + * + * w_accepted is false when all W are negative and therefore + * the primitive is invisible. + */ + LLVMValueRef w_accepted; + + LLVMValueRef all_w_positive; + LLVMValueRef any_w_negative; }; -static void ac_analyze_position_w(struct ac_llvm_context *ctx, - LLVMValueRef pos[3][4], - struct ac_position_w_info *w) +static void ac_analyze_position_w(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], + struct ac_position_w_info *w) { - LLVMBuilderRef builder = ctx->builder; - LLVMValueRef all_w_negative = ctx->i1true; - - w->w_reflection = ctx->i1false; - w->any_w_negative = ctx->i1false; - - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef neg_w; - - neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, ""); - /* If neg_w is true, negate w_reflection. */ - w->w_reflection = LLVMBuildXor(builder, w->w_reflection, neg_w, ""); - w->any_w_negative = LLVMBuildOr(builder, w->any_w_negative, neg_w, ""); - all_w_negative = LLVMBuildAnd(builder, all_w_negative, neg_w, ""); - } - w->all_w_positive = LLVMBuildNot(builder, w->any_w_negative, ""); - w->w_accepted = LLVMBuildNot(builder, all_w_negative, ""); + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef all_w_negative = ctx->i1true; + + w->w_reflection = ctx->i1false; + w->any_w_negative = ctx->i1false; + + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef neg_w; + + neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, ""); + /* If neg_w is true, negate w_reflection. */ + w->w_reflection = LLVMBuildXor(builder, w->w_reflection, neg_w, ""); + w->any_w_negative = LLVMBuildOr(builder, w->any_w_negative, neg_w, ""); + all_w_negative = LLVMBuildAnd(builder, all_w_negative, neg_w, ""); + } + w->all_w_positive = LLVMBuildNot(builder, w->any_w_negative, ""); + w->w_accepted = LLVMBuildNot(builder, all_w_negative, ""); } /* Perform front/back face culling and return true if the primitive is accepted. 
*/ -static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, - LLVMValueRef pos[3][4], - struct ac_position_w_info *w, - bool cull_front, - bool cull_back, - bool cull_zero_area) +static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], + struct ac_position_w_info *w, bool cull_front, bool cull_back, + bool cull_zero_area) { - LLVMBuilderRef builder = ctx->builder; - - if (cull_front && cull_back) - return ctx->i1false; - - if (!cull_front && !cull_back && !cull_zero_area) - return ctx->i1true; - - /* Front/back face culling. Also if the determinant == 0, the triangle - * area is 0. - */ - LLVMValueRef det_t0 = LLVMBuildFSub(builder, pos[2][0], pos[0][0], ""); - LLVMValueRef det_t1 = LLVMBuildFSub(builder, pos[1][1], pos[0][1], ""); - LLVMValueRef det_t2 = LLVMBuildFSub(builder, pos[0][0], pos[1][0], ""); - LLVMValueRef det_t3 = LLVMBuildFSub(builder, pos[0][1], pos[2][1], ""); - LLVMValueRef det_p0 = LLVMBuildFMul(builder, det_t0, det_t1, ""); - LLVMValueRef det_p1 = LLVMBuildFMul(builder, det_t2, det_t3, ""); - LLVMValueRef det = LLVMBuildFSub(builder, det_p0, det_p1, ""); - - /* Negative W negates the determinant. */ - det = LLVMBuildSelect(builder, w->w_reflection, - LLVMBuildFNeg(builder, det, ""), - det, ""); - - LLVMValueRef accepted = NULL; - if (cull_front) { - LLVMRealPredicate cond = cull_zero_area ? LLVMRealOGT : LLVMRealOGE; - accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); - } else if (cull_back) { - LLVMRealPredicate cond = cull_zero_area ? LLVMRealOLT : LLVMRealOLE; - accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); - } else if (cull_zero_area) { - accepted = LLVMBuildFCmp(builder, LLVMRealONE, det, ctx->f32_0, ""); - } - return accepted; + LLVMBuilderRef builder = ctx->builder; + + if (cull_front && cull_back) + return ctx->i1false; + + if (!cull_front && !cull_back && !cull_zero_area) + return ctx->i1true; + + /* Front/back face culling. Also if the determinant == 0, the triangle + * area is 0. + */ + LLVMValueRef det_t0 = LLVMBuildFSub(builder, pos[2][0], pos[0][0], ""); + LLVMValueRef det_t1 = LLVMBuildFSub(builder, pos[1][1], pos[0][1], ""); + LLVMValueRef det_t2 = LLVMBuildFSub(builder, pos[0][0], pos[1][0], ""); + LLVMValueRef det_t3 = LLVMBuildFSub(builder, pos[0][1], pos[2][1], ""); + LLVMValueRef det_p0 = LLVMBuildFMul(builder, det_t0, det_t1, ""); + LLVMValueRef det_p1 = LLVMBuildFMul(builder, det_t2, det_t3, ""); + LLVMValueRef det = LLVMBuildFSub(builder, det_p0, det_p1, ""); + + /* Negative W negates the determinant. */ + det = LLVMBuildSelect(builder, w->w_reflection, LLVMBuildFNeg(builder, det, ""), det, ""); + + LLVMValueRef accepted = NULL; + if (cull_front) { + LLVMRealPredicate cond = cull_zero_area ? LLVMRealOGT : LLVMRealOGE; + accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); + } else if (cull_back) { + LLVMRealPredicate cond = cull_zero_area ? LLVMRealOLT : LLVMRealOLE; + accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); + } else if (cull_zero_area) { + accepted = LLVMBuildFCmp(builder, LLVMRealONE, det, ctx->f32_0, ""); + } + return accepted; } /* Perform view culling and small primitive elimination and return true * if the primitive is accepted and initially_accepted == true. 
*/ -static LLVMValueRef cull_bbox(struct ac_llvm_context *ctx, - LLVMValueRef pos[3][4], - LLVMValueRef initially_accepted, - struct ac_position_w_info *w, - LLVMValueRef vp_scale[2], - LLVMValueRef vp_translate[2], - LLVMValueRef small_prim_precision, - bool cull_view_xy, - bool cull_view_near_z, - bool cull_view_far_z, - bool cull_small_prims, - bool use_halfz_clip_space) +static LLVMValueRef cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, struct ac_position_w_info *w, + LLVMValueRef vp_scale[2], LLVMValueRef vp_translate[2], + LLVMValueRef small_prim_precision, bool cull_view_xy, + bool cull_view_near_z, bool cull_view_far_z, bool cull_small_prims, + bool use_halfz_clip_space) { - LLVMBuilderRef builder = ctx->builder; - - if (!cull_view_xy && !cull_view_near_z && !cull_view_far_z && !cull_small_prims) - return initially_accepted; - - /* Skip the culling if the primitive has already been rejected or - * if any W is negative. The bounding box culling doesn't work when - * W is negative. - */ - LLVMValueRef cond = LLVMBuildAnd(builder, initially_accepted, - w->all_w_positive, ""); - LLVMValueRef accepted_var = ac_build_alloca_undef(ctx, ctx->i1, ""); - LLVMBuildStore(builder, initially_accepted, accepted_var); - - ac_build_ifcc(ctx, cond, 10000000 /* does this matter? */); - { - LLVMValueRef bbox_min[3], bbox_max[3]; - LLVMValueRef accepted = initially_accepted; - - /* Compute the primitive bounding box for easy culling. */ - for (unsigned chan = 0; chan < (cull_view_near_z || cull_view_far_z ? 3 : 2); chan++) { - bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]); - bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]); - - bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]); - bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]); - } - - /* View culling. */ - if (cull_view_xy || cull_view_near_z || cull_view_far_z) { - for (unsigned chan = 0; chan < 3; chan++) { - LLVMValueRef visible; - - if ((cull_view_xy && chan <= 1) || - (cull_view_near_z && chan == 2)) { - float t = chan == 2 && use_halfz_clip_space ? 0 : -1; - visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan], - LLVMConstReal(ctx->f32, t), ""); - accepted = LLVMBuildAnd(builder, accepted, visible, ""); - } - - if ((cull_view_xy && chan <= 1) || - (cull_view_far_z && chan == 2)) { - visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan], - ctx->f32_1, ""); - accepted = LLVMBuildAnd(builder, accepted, visible, ""); - } - } - } - - /* Small primitive elimination. */ - if (cull_small_prims) { - /* Assuming a sample position at (0.5, 0.5), if we round - * the bounding box min/max extents and the results of - * the rounding are equal in either the X or Y direction, - * the bounding box does not intersect the sample. - * - * See these GDC slides for pictures: - * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf - */ - LLVMValueRef min, max, not_equal[2], visible; - - for (unsigned chan = 0; chan < 2; chan++) { - /* Convert the position to screen-space coordinates. */ - min = ac_build_fmad(ctx, bbox_min[chan], - vp_scale[chan], vp_translate[chan]); - max = ac_build_fmad(ctx, bbox_max[chan], - vp_scale[chan], vp_translate[chan]); - /* Scale the bounding box according to the precision of - * the rasterizer and the number of MSAA samples. 
*/ - min = LLVMBuildFSub(builder, min, small_prim_precision, ""); - max = LLVMBuildFAdd(builder, max, small_prim_precision, ""); - - /* Determine if the bbox intersects the sample point. - * It also works for MSAA, but vp_scale, vp_translate, - * and small_prim_precision are computed differently. - */ - min = ac_build_round(ctx, min); - max = ac_build_round(ctx, max); - not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, ""); - } - visible = LLVMBuildAnd(builder, not_equal[0], not_equal[1], ""); - accepted = LLVMBuildAnd(builder, accepted, visible, ""); - } - - LLVMBuildStore(builder, accepted, accepted_var); - } - ac_build_endif(ctx, 10000000); - - return LLVMBuildLoad(builder, accepted_var, ""); + LLVMBuilderRef builder = ctx->builder; + + if (!cull_view_xy && !cull_view_near_z && !cull_view_far_z && !cull_small_prims) + return initially_accepted; + + /* Skip the culling if the primitive has already been rejected or + * if any W is negative. The bounding box culling doesn't work when + * W is negative. + */ + LLVMValueRef cond = LLVMBuildAnd(builder, initially_accepted, w->all_w_positive, ""); + LLVMValueRef accepted_var = ac_build_alloca_undef(ctx, ctx->i1, ""); + LLVMBuildStore(builder, initially_accepted, accepted_var); + + ac_build_ifcc(ctx, cond, 10000000 /* does this matter? */); + { + LLVMValueRef bbox_min[3], bbox_max[3]; + LLVMValueRef accepted = initially_accepted; + + /* Compute the primitive bounding box for easy culling. */ + for (unsigned chan = 0; chan < (cull_view_near_z || cull_view_far_z ? 3 : 2); chan++) { + bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]); + bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]); + + bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]); + bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]); + } + + /* View culling. */ + if (cull_view_xy || cull_view_near_z || cull_view_far_z) { + for (unsigned chan = 0; chan < 3; chan++) { + LLVMValueRef visible; + + if ((cull_view_xy && chan <= 1) || (cull_view_near_z && chan == 2)) { + float t = chan == 2 && use_halfz_clip_space ? 0 : -1; + visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan], + LLVMConstReal(ctx->f32, t), ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + + if ((cull_view_xy && chan <= 1) || (cull_view_far_z && chan == 2)) { + visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan], ctx->f32_1, ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + } + } + + /* Small primitive elimination. */ + if (cull_small_prims) { + /* Assuming a sample position at (0.5, 0.5), if we round + * the bounding box min/max extents and the results of + * the rounding are equal in either the X or Y direction, + * the bounding box does not intersect the sample. + * + * See these GDC slides for pictures: + * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf + */ + LLVMValueRef min, max, not_equal[2], visible; + + for (unsigned chan = 0; chan < 2; chan++) { + /* Convert the position to screen-space coordinates. */ + min = ac_build_fmad(ctx, bbox_min[chan], vp_scale[chan], vp_translate[chan]); + max = ac_build_fmad(ctx, bbox_max[chan], vp_scale[chan], vp_translate[chan]); + /* Scale the bounding box according to the precision of + * the rasterizer and the number of MSAA samples. 
*/ + min = LLVMBuildFSub(builder, min, small_prim_precision, ""); + max = LLVMBuildFAdd(builder, max, small_prim_precision, ""); + + /* Determine if the bbox intersects the sample point. + * It also works for MSAA, but vp_scale, vp_translate, + * and small_prim_precision are computed differently. + */ + min = ac_build_round(ctx, min); + max = ac_build_round(ctx, max); + not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, ""); + } + visible = LLVMBuildAnd(builder, not_equal[0], not_equal[1], ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + + LLVMBuildStore(builder, accepted, accepted_var); + } + ac_build_endif(ctx, 10000000); + + return LLVMBuildLoad(builder, accepted_var, ""); } /** @@ -241,35 +224,27 @@ static LLVMValueRef cull_bbox(struct ac_llvm_context *ctx, * subpixel_bits are defined by the quantization mode. * \param options See ac_cull_options. */ -LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, - LLVMValueRef pos[3][4], - LLVMValueRef initially_accepted, - LLVMValueRef vp_scale[2], - LLVMValueRef vp_translate[2], - LLVMValueRef small_prim_precision, - struct ac_cull_options *options) +LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision, + struct ac_cull_options *options) { - struct ac_position_w_info w; - ac_analyze_position_w(ctx, pos, &w); - - /* W culling. */ - LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true; - accepted = LLVMBuildAnd(ctx->builder, accepted, initially_accepted, ""); - - /* Face culling. */ - accepted = LLVMBuildAnd(ctx->builder, accepted, - ac_cull_face(ctx, pos, &w, - options->cull_front, - options->cull_back, - options->cull_zero_area), ""); - - /* View culling and small primitive elimination. */ - accepted = cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate, - small_prim_precision, - options->cull_view_xy, - options->cull_view_near_z, - options->cull_view_far_z, - options->cull_small_prims, - options->use_halfz_clip_space); - return accepted; + struct ac_position_w_info w; + ac_analyze_position_w(ctx, pos, &w); + + /* W culling. */ + LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true; + accepted = LLVMBuildAnd(ctx->builder, accepted, initially_accepted, ""); + + /* Face culling. */ + accepted = LLVMBuildAnd( + ctx->builder, accepted, + ac_cull_face(ctx, pos, &w, options->cull_front, options->cull_back, options->cull_zero_area), + ""); + + /* View culling and small primitive elimination. */ + accepted = cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate, small_prim_precision, + options->cull_view_xy, options->cull_view_near_z, options->cull_view_far_z, + options->cull_small_prims, options->use_halfz_clip_space); + return accepted; } diff --git a/src/amd/llvm/ac_llvm_cull.h b/src/amd/llvm/ac_llvm_cull.h index 0aa6c902a68..2c4b7f7da05 100644 --- a/src/amd/llvm/ac_llvm_cull.h +++ b/src/amd/llvm/ac_llvm_cull.h @@ -29,31 +29,28 @@ #include "ac_llvm_build.h" struct ac_cull_options { - /* In general, I recommend setting all to true except view Z culling, - * which isn't so effective because W culling is cheaper and partially - * replaces near Z culling, and you don't need to set Position.z - * if Z culling is disabled. - * - * If something doesn't work, turn some of these off to find out what. 
- */ - bool cull_front; - bool cull_back; - bool cull_view_xy; - bool cull_view_near_z; - bool cull_view_far_z; - bool cull_small_prims; - bool cull_zero_area; - bool cull_w; /* cull primitives with all W < 0 */ + /* In general, I recommend setting all to true except view Z culling, + * which isn't so effective because W culling is cheaper and partially + * replaces near Z culling, and you don't need to set Position.z + * if Z culling is disabled. + * + * If something doesn't work, turn some of these off to find out what. + */ + bool cull_front; + bool cull_back; + bool cull_view_xy; + bool cull_view_near_z; + bool cull_view_far_z; + bool cull_small_prims; + bool cull_zero_area; + bool cull_w; /* cull primitives with all W < 0 */ - bool use_halfz_clip_space; + bool use_halfz_clip_space; }; -LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, - LLVMValueRef pos[3][4], - LLVMValueRef initially_accepted, - LLVMValueRef vp_scale[2], - LLVMValueRef vp_translate[2], - LLVMValueRef small_prim_precision, - struct ac_cull_options *options); +LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision, + struct ac_cull_options *options); #endif diff --git a/src/amd/llvm/ac_llvm_helper.cpp b/src/amd/llvm/ac_llvm_helper.cpp index 184f76a7968..f9b70e3f38c 100644 --- a/src/amd/llvm/ac_llvm_helper.cpp +++ b/src/amd/llvm/ac_llvm_helper.cpp @@ -23,15 +23,14 @@ * */ -#include - #include -#include -#include #include +#include +#include +#include #include -#include +#include /* DO NOT REORDER THE HEADERS * The LLVM headers need to all be included before any Mesa header, @@ -42,7 +41,6 @@ #include "ac_binary.h" #include "ac_llvm_util.h" #include "ac_llvm_build.h" - #include "util/macros.h" void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes) @@ -54,36 +52,36 @@ void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes) void ac_add_attr_alignment(LLVMValueRef val, uint64_t bytes) { #if LLVM_VERSION_MAJOR >= 10 - llvm::Argument *A = llvm::unwrap(val); - A->addAttr(llvm::Attribute::getWithAlignment(A->getContext(), llvm::Align(bytes))); + llvm::Argument *A = llvm::unwrap(val); + A->addAttr(llvm::Attribute::getWithAlignment(A->getContext(), llvm::Align(bytes))); #else - /* Avoid unused parameter warnings. */ - (void)val; - (void)bytes; + /* Avoid unused parameter warnings. 
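+    * ((void)expr has no effect at run time; it only keeps
+    * -Wunused-parameter quiet when the LLVM >= 10 branch above
+    * is compiled out.)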
*/ + (void)val; + (void)bytes; #endif } bool ac_is_sgpr_param(LLVMValueRef arg) { - llvm::Argument *A = llvm::unwrap(arg); - llvm::AttributeList AS = A->getParent()->getAttributes(); - unsigned ArgNo = A->getArgNo(); - return AS.hasAttribute(ArgNo + 1, llvm::Attribute::InReg); + llvm::Argument *A = llvm::unwrap(arg); + llvm::AttributeList AS = A->getParent()->getAttributes(); + unsigned ArgNo = A->getArgNo(); + return AS.hasAttribute(ArgNo + 1, llvm::Attribute::InReg); } LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call) { - return LLVMGetCalledValue(call); + return LLVMGetCalledValue(call); } bool ac_llvm_is_function(LLVMValueRef v) { - return LLVMGetValueKind(v) == LLVMFunctionValueKind; + return LLVMGetValueKind(v) == LLVMFunctionValueKind; } LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx) { - llvm::TargetMachine *TM = reinterpret_cast(tm); + llvm::TargetMachine *TM = reinterpret_cast(tm); LLVMModuleRef module = LLVMModuleCreateWithNameInContext("mesa-shader", ctx); llvm::unwrap(module)->setTargetTriple(TM->getTargetTriple().getTriple()); @@ -91,246 +89,243 @@ LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx) return module; } -LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, - enum ac_float_mode float_mode) +LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, enum ac_float_mode float_mode) { - LLVMBuilderRef builder = LLVMCreateBuilderInContext(ctx); + LLVMBuilderRef builder = LLVMCreateBuilderInContext(ctx); - llvm::FastMathFlags flags; + llvm::FastMathFlags flags; - switch (float_mode) { - case AC_FLOAT_MODE_DEFAULT: - case AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO: - break; + switch (float_mode) { + case AC_FLOAT_MODE_DEFAULT: + case AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO: + break; - case AC_FLOAT_MODE_DEFAULT_OPENGL: - /* Allow optimizations to treat the sign of a zero argument or - * result as insignificant. - */ - flags.setNoSignedZeros(); /* nsz */ + case AC_FLOAT_MODE_DEFAULT_OPENGL: + /* Allow optimizations to treat the sign of a zero argument or + * result as insignificant. + */ + flags.setNoSignedZeros(); /* nsz */ - /* Allow optimizations to use the reciprocal of an argument - * rather than perform division. - */ - flags.setAllowReciprocal(); /* arcp */ + /* Allow optimizations to use the reciprocal of an argument + * rather than perform division. + */ + flags.setAllowReciprocal(); /* arcp */ - llvm::unwrap(builder)->setFastMathFlags(flags); - break; - } + llvm::unwrap(builder)->setFastMathFlags(flags); + break; + } - return builder; + return builder; } void ac_enable_signed_zeros(struct ac_llvm_context *ctx) { - if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) { - auto *b = llvm::unwrap(ctx->builder); - llvm::FastMathFlags flags = b->getFastMathFlags(); - - /* This disables the optimization of (x + 0), which is used - * to convert negative zero to positive zero. - */ - flags.setNoSignedZeros(false); - b->setFastMathFlags(flags); - } + if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) { + auto *b = llvm::unwrap(ctx->builder); + llvm::FastMathFlags flags = b->getFastMathFlags(); + + /* This disables the optimization of (x + 0), which is used + * to convert negative zero to positive zero. 
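+       * (With default rounding, IEEE addition gives -0.0 + 0.0 = +0.0,
+       * but the nsz flag lets LLVM fold x + 0 to x and keep the negative
+       * zero, so the flag must be cleared while that add is emitted.)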
+ */ + flags.setNoSignedZeros(false); + b->setFastMathFlags(flags); + } } void ac_disable_signed_zeros(struct ac_llvm_context *ctx) { - if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) { - auto *b = llvm::unwrap(ctx->builder); - llvm::FastMathFlags flags = b->getFastMathFlags(); + if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) { + auto *b = llvm::unwrap(ctx->builder); + llvm::FastMathFlags flags = b->getFastMathFlags(); - flags.setNoSignedZeros(); - b->setFastMathFlags(flags); - } + flags.setNoSignedZeros(); + b->setFastMathFlags(flags); + } } -LLVMTargetLibraryInfoRef -ac_create_target_library_info(const char *triple) +LLVMTargetLibraryInfoRef ac_create_target_library_info(const char *triple) { - return reinterpret_cast(new llvm::TargetLibraryInfoImpl(llvm::Triple(triple))); + return reinterpret_cast( + new llvm::TargetLibraryInfoImpl(llvm::Triple(triple))); } -void -ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info) +void ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info) { - delete reinterpret_cast(library_info); + delete reinterpret_cast(library_info); } /* Implementation of raw_pwrite_stream that works on malloc()ed memory for * better compatibility with C code. */ struct raw_memory_ostream : public llvm::raw_pwrite_stream { - char *buffer; - size_t written; - size_t bufsize; - - raw_memory_ostream() - { - buffer = NULL; - written = 0; - bufsize = 0; - SetUnbuffered(); - } - - ~raw_memory_ostream() - { - free(buffer); - } - - void clear() - { - written = 0; - } - - void take(char *&out_buffer, size_t &out_size) - { - out_buffer = buffer; - out_size = written; - buffer = NULL; - written = 0; - bufsize = 0; - } - - void flush() = delete; - - void write_impl(const char *ptr, size_t size) override - { - if (unlikely(written + size < written)) - abort(); - if (written + size > bufsize) { - bufsize = MAX3(1024, written + size, bufsize / 3 * 4); - buffer = (char *)realloc(buffer, bufsize); - if (!buffer) { - fprintf(stderr, "amd: out of memory allocating ELF buffer\n"); - abort(); - } - } - memcpy(buffer + written, ptr, size); - written += size; - } - - void pwrite_impl(const char *ptr, size_t size, uint64_t offset) override - { - assert(offset == (size_t)offset && - offset + size >= offset && offset + size <= written); - memcpy(buffer + offset, ptr, size); - } - - uint64_t current_pos() const override - { - return written; - } + char *buffer; + size_t written; + size_t bufsize; + + raw_memory_ostream() + { + buffer = NULL; + written = 0; + bufsize = 0; + SetUnbuffered(); + } + + ~raw_memory_ostream() + { + free(buffer); + } + + void clear() + { + written = 0; + } + + void take(char *&out_buffer, size_t &out_size) + { + out_buffer = buffer; + out_size = written; + buffer = NULL; + written = 0; + bufsize = 0; + } + + void flush() = delete; + + void write_impl(const char *ptr, size_t size) override + { + if (unlikely(written + size < written)) + abort(); + if (written + size > bufsize) { + bufsize = MAX3(1024, written + size, bufsize / 3 * 4); + buffer = (char *)realloc(buffer, bufsize); + if (!buffer) { + fprintf(stderr, "amd: out of memory allocating ELF buffer\n"); + abort(); + } + } + memcpy(buffer + written, ptr, size); + written += size; + } + + void pwrite_impl(const char *ptr, size_t size, uint64_t offset) override + { + assert(offset == (size_t)offset && offset + size >= offset && offset + size <= written); + memcpy(buffer + offset, ptr, size); + } + + uint64_t current_pos() const override + { + return written; + } }; /* The LLVM 
compiler is represented as a pass manager containing passes for * optimizations, instruction selection, and code generation. */ struct ac_compiler_passes { - raw_memory_ostream ostream; /* ELF shader binary stream */ - llvm::legacy::PassManager passmgr; /* list of passes */ + raw_memory_ostream ostream; /* ELF shader binary stream */ + llvm::legacy::PassManager passmgr; /* list of passes */ }; struct ac_compiler_passes *ac_create_llvm_passes(LLVMTargetMachineRef tm) { - struct ac_compiler_passes *p = new ac_compiler_passes(); - if (!p) - return NULL; + struct ac_compiler_passes *p = new ac_compiler_passes(); + if (!p) + return NULL; - llvm::TargetMachine *TM = reinterpret_cast(tm); + llvm::TargetMachine *TM = reinterpret_cast(tm); - if (TM->addPassesToEmitFile(p->passmgr, p->ostream, - nullptr, + if (TM->addPassesToEmitFile(p->passmgr, p->ostream, nullptr, #if LLVM_VERSION_MAJOR >= 10 - llvm::CGFT_ObjectFile)) { + llvm::CGFT_ObjectFile)) { #else - llvm::TargetMachine::CGFT_ObjectFile)) { + llvm::TargetMachine::CGFT_ObjectFile)) { #endif - fprintf(stderr, "amd: TargetMachine can't emit a file of this type!\n"); - delete p; - return NULL; - } - return p; + fprintf(stderr, "amd: TargetMachine can't emit a file of this type!\n"); + delete p; + return NULL; + } + return p; } void ac_destroy_llvm_passes(struct ac_compiler_passes *p) { - delete p; + delete p; } /* This returns false on failure. */ bool ac_compile_module_to_elf(struct ac_compiler_passes *p, LLVMModuleRef module, - char **pelf_buffer, size_t *pelf_size) + char **pelf_buffer, size_t *pelf_size) { - p->passmgr.run(*llvm::unwrap(module)); - p->ostream.take(*pelf_buffer, *pelf_size); - return true; + p->passmgr.run(*llvm::unwrap(module)); + p->ostream.take(*pelf_buffer, *pelf_size); + return true; } void ac_llvm_add_barrier_noop_pass(LLVMPassManagerRef passmgr) { - llvm::unwrap(passmgr)->add(llvm::createBarrierNoopPass()); + llvm::unwrap(passmgr)->add(llvm::createBarrierNoopPass()); } void ac_enable_global_isel(LLVMTargetMachineRef tm) { - reinterpret_cast(tm)->setGlobalISel(true); + reinterpret_cast(tm)->setGlobalISel(true); } LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, - LLVMValueRef ptr, LLVMValueRef val, - const char *sync_scope) { - llvm::AtomicRMWInst::BinOp binop; - switch (op) { - case LLVMAtomicRMWBinOpXchg: - binop = llvm::AtomicRMWInst::Xchg; - break; - case LLVMAtomicRMWBinOpAdd: - binop = llvm::AtomicRMWInst::Add; - break; - case LLVMAtomicRMWBinOpSub: - binop = llvm::AtomicRMWInst::Sub; - break; - case LLVMAtomicRMWBinOpAnd: - binop = llvm::AtomicRMWInst::And; - break; - case LLVMAtomicRMWBinOpNand: - binop = llvm::AtomicRMWInst::Nand; - break; - case LLVMAtomicRMWBinOpOr: - binop = llvm::AtomicRMWInst::Or; - break; - case LLVMAtomicRMWBinOpXor: - binop = llvm::AtomicRMWInst::Xor; - break; - case LLVMAtomicRMWBinOpMax: - binop = llvm::AtomicRMWInst::Max; - break; - case LLVMAtomicRMWBinOpMin: - binop = llvm::AtomicRMWInst::Min; - break; - case LLVMAtomicRMWBinOpUMax: - binop = llvm::AtomicRMWInst::UMax; - break; - case LLVMAtomicRMWBinOpUMin: - binop = llvm::AtomicRMWInst::UMin; - break; - default: - unreachable(!"invalid LLVMAtomicRMWBinOp"); - break; - } - unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); - return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicRMW( - binop, llvm::unwrap(ptr), llvm::unwrap(val), - llvm::AtomicOrdering::SequentiallyConsistent, SSID)); + LLVMValueRef ptr, LLVMValueRef val, const char *sync_scope) +{ + 
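+   /* The switch below translates the LLVM-C binop enum to the C++
+    * AtomicRMWInst opcode, then a sequentially consistent atomicrmw is
+    * emitted in the named sync scope (e.g. "workgroup" restricts the
+    * ordering to the local work-group). */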
llvm::AtomicRMWInst::BinOp binop; + switch (op) { + case LLVMAtomicRMWBinOpXchg: + binop = llvm::AtomicRMWInst::Xchg; + break; + case LLVMAtomicRMWBinOpAdd: + binop = llvm::AtomicRMWInst::Add; + break; + case LLVMAtomicRMWBinOpSub: + binop = llvm::AtomicRMWInst::Sub; + break; + case LLVMAtomicRMWBinOpAnd: + binop = llvm::AtomicRMWInst::And; + break; + case LLVMAtomicRMWBinOpNand: + binop = llvm::AtomicRMWInst::Nand; + break; + case LLVMAtomicRMWBinOpOr: + binop = llvm::AtomicRMWInst::Or; + break; + case LLVMAtomicRMWBinOpXor: + binop = llvm::AtomicRMWInst::Xor; + break; + case LLVMAtomicRMWBinOpMax: + binop = llvm::AtomicRMWInst::Max; + break; + case LLVMAtomicRMWBinOpMin: + binop = llvm::AtomicRMWInst::Min; + break; + case LLVMAtomicRMWBinOpUMax: + binop = llvm::AtomicRMWInst::UMax; + break; + case LLVMAtomicRMWBinOpUMin: + binop = llvm::AtomicRMWInst::UMin; + break; + default: + unreachable(!"invalid LLVMAtomicRMWBinOp"); + break; + } + unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); + return llvm::wrap(llvm::unwrap(ctx->builder) + ->CreateAtomicRMW(binop, llvm::unwrap(ptr), llvm::unwrap(val), + llvm::AtomicOrdering::SequentiallyConsistent, SSID)); } LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, - LLVMValueRef cmp, LLVMValueRef val, - const char *sync_scope) { - unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); - return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicCmpXchg( - llvm::unwrap(ptr), llvm::unwrap(cmp), llvm::unwrap(val), - llvm::AtomicOrdering::SequentiallyConsistent, - llvm::AtomicOrdering::SequentiallyConsistent, SSID)); + LLVMValueRef cmp, LLVMValueRef val, const char *sync_scope) +{ + unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); + return llvm::wrap(llvm::unwrap(ctx->builder) + ->CreateAtomicCmpXchg(llvm::unwrap(ptr), llvm::unwrap(cmp), + llvm::unwrap(val), + llvm::AtomicOrdering::SequentiallyConsistent, + llvm::AtomicOrdering::SequentiallyConsistent, SSID)); } diff --git a/src/amd/llvm/ac_llvm_util.c b/src/amd/llvm/ac_llvm_util.c index c7c8b991756..8e220ba6853 100644 --- a/src/amd/llvm/ac_llvm_util.c +++ b/src/amd/llvm/ac_llvm_util.c @@ -24,16 +24,17 @@ */ /* based on pieces from si_pipe.c and radeon_llvm_emit.c */ #include "ac_llvm_util.h" + #include "ac_llvm_build.h" +#include "c11/threads.h" +#include "gallivm/lp_bld_misc.h" #include "util/bitscan.h" +#include "util/u_math.h" #include #include #include #include #include -#include "c11/threads.h" -#include "gallivm/lp_bld_misc.h" -#include "util/u_math.h" #include #include @@ -41,239 +42,240 @@ static void ac_init_llvm_target() { - LLVMInitializeAMDGPUTargetInfo(); - LLVMInitializeAMDGPUTarget(); - LLVMInitializeAMDGPUTargetMC(); - LLVMInitializeAMDGPUAsmPrinter(); - - /* For inline assembly. */ - LLVMInitializeAMDGPUAsmParser(); - - /* For ACO disassembly. */ - LLVMInitializeAMDGPUDisassembler(); - - /* Workaround for bug in llvm 4.0 that causes image intrinsics - * to disappear. - * https://reviews.llvm.org/D26348 - * - * "mesa" is the prefix for error messages. - * - * -global-isel-abort=2 is a no-op unless global isel has been enabled. - * This option tells the backend to fall-back to SelectionDAG and print - * a diagnostic message if global isel fails. 
- */ - const char *argv[] = { - "mesa", - "-simplifycfg-sink-common=false", - "-global-isel-abort=2", + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); + + /* For inline assembly. */ + LLVMInitializeAMDGPUAsmParser(); + + /* For ACO disassembly. */ + LLVMInitializeAMDGPUDisassembler(); + + /* Workaround for bug in llvm 4.0 that causes image intrinsics + * to disappear. + * https://reviews.llvm.org/D26348 + * + * "mesa" is the prefix for error messages. + * + * -global-isel-abort=2 is a no-op unless global isel has been enabled. + * This option tells the backend to fall-back to SelectionDAG and print + * a diagnostic message if global isel fails. + */ + const char *argv[] = { + "mesa", + "-simplifycfg-sink-common=false", + "-global-isel-abort=2", #if LLVM_VERSION_MAJOR >= 10 - /* Atomic optimizations require LLVM 10.0 for gfx10 support. */ - "-amdgpu-atomic-optimizations=true", + /* Atomic optimizations require LLVM 10.0 for gfx10 support. */ + "-amdgpu-atomic-optimizations=true", #endif #if LLVM_VERSION_MAJOR >= 11 - /* This was disabled by default in: https://reviews.llvm.org/D77228 */ - "-structurizecfg-skip-uniform-regions", + /* This was disabled by default in: https://reviews.llvm.org/D77228 */ + "-structurizecfg-skip-uniform-regions", #endif - }; - LLVMParseCommandLineOptions(ARRAY_SIZE(argv), argv, NULL); + }; + LLVMParseCommandLineOptions(ARRAY_SIZE(argv), argv, NULL); } PUBLIC void ac_init_shared_llvm_once(void) { - static once_flag ac_init_llvm_target_once_flag = ONCE_FLAG_INIT; - call_once(&ac_init_llvm_target_once_flag, ac_init_llvm_target); + static once_flag ac_init_llvm_target_once_flag = ONCE_FLAG_INIT; + call_once(&ac_init_llvm_target_once_flag, ac_init_llvm_target); } #if !LLVM_IS_SHARED static once_flag ac_init_static_llvm_target_once_flag = ONCE_FLAG_INIT; static void ac_init_static_llvm_once(void) { - call_once(&ac_init_static_llvm_target_once_flag, ac_init_llvm_target); + call_once(&ac_init_static_llvm_target_once_flag, ac_init_llvm_target); } #endif void ac_init_llvm_once(void) { #if LLVM_IS_SHARED - ac_init_shared_llvm_once(); + ac_init_shared_llvm_once(); #else - ac_init_static_llvm_once(); + ac_init_static_llvm_once(); #endif } static LLVMTargetRef ac_get_llvm_target(const char *triple) { - LLVMTargetRef target = NULL; - char *err_message = NULL; - - if (LLVMGetTargetFromTriple(triple, &target, &err_message)) { - fprintf(stderr, "Cannot find target for triple %s ", triple); - if (err_message) { - fprintf(stderr, "%s\n", err_message); - } - LLVMDisposeMessage(err_message); - return NULL; - } - return target; + LLVMTargetRef target = NULL; + char *err_message = NULL; + + if (LLVMGetTargetFromTriple(triple, &target, &err_message)) { + fprintf(stderr, "Cannot find target for triple %s ", triple); + if (err_message) { + fprintf(stderr, "%s\n", err_message); + } + LLVMDisposeMessage(err_message); + return NULL; + } + return target; } const char *ac_get_llvm_processor_name(enum radeon_family family) { - switch (family) { - case CHIP_TAHITI: - return "tahiti"; - case CHIP_PITCAIRN: - return "pitcairn"; - case CHIP_VERDE: - return "verde"; - case CHIP_OLAND: - return "oland"; - case CHIP_HAINAN: - return "hainan"; - case CHIP_BONAIRE: - return "bonaire"; - case CHIP_KABINI: - return "kabini"; - case CHIP_KAVERI: - return "kaveri"; - case CHIP_HAWAII: - return "hawaii"; - case CHIP_TONGA: - return "tonga"; - case CHIP_ICELAND: - return "iceland"; - case CHIP_CARRIZO: - return "carrizo"; 
- case CHIP_FIJI: - return "fiji"; - case CHIP_STONEY: - return "stoney"; - case CHIP_POLARIS10: - return "polaris10"; - case CHIP_POLARIS11: - case CHIP_POLARIS12: - case CHIP_VEGAM: - return "polaris11"; - case CHIP_VEGA10: - return "gfx900"; - case CHIP_RAVEN: - return "gfx902"; - case CHIP_VEGA12: - return "gfx904"; - case CHIP_VEGA20: - return "gfx906"; - case CHIP_RAVEN2: - case CHIP_RENOIR: - return "gfx909"; - case CHIP_ARCTURUS: - return "gfx908"; - case CHIP_NAVI10: - return "gfx1010"; - case CHIP_NAVI12: - return "gfx1011"; - case CHIP_NAVI14: - return "gfx1012"; - case CHIP_SIENNA_CICHLID: - case CHIP_NAVY_FLOUNDER: - return "gfx1030"; - default: - return ""; - } + switch (family) { + case CHIP_TAHITI: + return "tahiti"; + case CHIP_PITCAIRN: + return "pitcairn"; + case CHIP_VERDE: + return "verde"; + case CHIP_OLAND: + return "oland"; + case CHIP_HAINAN: + return "hainan"; + case CHIP_BONAIRE: + return "bonaire"; + case CHIP_KABINI: + return "kabini"; + case CHIP_KAVERI: + return "kaveri"; + case CHIP_HAWAII: + return "hawaii"; + case CHIP_TONGA: + return "tonga"; + case CHIP_ICELAND: + return "iceland"; + case CHIP_CARRIZO: + return "carrizo"; + case CHIP_FIJI: + return "fiji"; + case CHIP_STONEY: + return "stoney"; + case CHIP_POLARIS10: + return "polaris10"; + case CHIP_POLARIS11: + case CHIP_POLARIS12: + case CHIP_VEGAM: + return "polaris11"; + case CHIP_VEGA10: + return "gfx900"; + case CHIP_RAVEN: + return "gfx902"; + case CHIP_VEGA12: + return "gfx904"; + case CHIP_VEGA20: + return "gfx906"; + case CHIP_RAVEN2: + case CHIP_RENOIR: + return "gfx909"; + case CHIP_ARCTURUS: + return "gfx908"; + case CHIP_NAVI10: + return "gfx1010"; + case CHIP_NAVI12: + return "gfx1011"; + case CHIP_NAVI14: + return "gfx1012"; + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: + return "gfx1030"; + default: + return ""; + } } static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, - enum ac_target_machine_options tm_options, - LLVMCodeGenOptLevel level, - const char **out_triple) + enum ac_target_machine_options tm_options, + LLVMCodeGenOptLevel level, + const char **out_triple) { - assert(family >= CHIP_TAHITI); - char features[256]; - const char *triple = (tm_options & AC_TM_SUPPORTS_SPILL) ? "amdgcn-mesa-mesa3d" : "amdgcn--"; - LLVMTargetRef target = ac_get_llvm_target(triple); - - snprintf(features, sizeof(features), - "+DumpCode%s%s%s%s%s", - LLVM_VERSION_MAJOR >= 11 ? "" : ",-fp32-denormals,+fp64-denormals", - family >= CHIP_NAVI10 && !(tm_options & AC_TM_WAVE32) ? - ",+wavefrontsize64,-wavefrontsize32" : "", - family <= CHIP_NAVI14 && tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "", - family <= CHIP_NAVI14 && tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "", - tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : ""); - - LLVMTargetMachineRef tm = LLVMCreateTargetMachine( - target, - triple, - ac_get_llvm_processor_name(family), - features, - level, - LLVMRelocDefault, - LLVMCodeModelDefault); - - if (out_triple) - *out_triple = triple; - if (tm_options & AC_TM_ENABLE_GLOBAL_ISEL) - ac_enable_global_isel(tm); - return tm; + assert(family >= CHIP_TAHITI); + char features[256]; + const char *triple = (tm_options & AC_TM_SUPPORTS_SPILL) ? "amdgcn-mesa-mesa3d" : "amdgcn--"; + LLVMTargetRef target = ac_get_llvm_target(triple); + + snprintf(features, sizeof(features), "+DumpCode%s%s%s%s%s", + LLVM_VERSION_MAJOR >= 11 ? "" : ",-fp32-denormals,+fp64-denormals", + family >= CHIP_NAVI10 && !(tm_options & AC_TM_WAVE32) + ? 
",+wavefrontsize64,-wavefrontsize32" + : "", + family <= CHIP_NAVI14 && tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "", + family <= CHIP_NAVI14 && tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "", + tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : ""); + + LLVMTargetMachineRef tm = + LLVMCreateTargetMachine(target, triple, ac_get_llvm_processor_name(family), features, level, + LLVMRelocDefault, LLVMCodeModelDefault); + + if (out_triple) + *out_triple = triple; + if (tm_options & AC_TM_ENABLE_GLOBAL_ISEL) + ac_enable_global_isel(tm); + return tm; } static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef target_library_info, - bool check_ir) + bool check_ir) { - LLVMPassManagerRef passmgr = LLVMCreatePassManager(); - if (!passmgr) - return NULL; - - if (target_library_info) - LLVMAddTargetLibraryInfo(target_library_info, - passmgr); - - if (check_ir) - LLVMAddVerifierPass(passmgr); - LLVMAddAlwaysInlinerPass(passmgr); - /* Normally, the pass manager runs all passes on one function before - * moving onto another. Adding a barrier no-op pass forces the pass - * manager to run the inliner on all functions first, which makes sure - * that the following passes are only run on the remaining non-inline - * function, so it removes useless work done on dead inline functions. - */ - ac_llvm_add_barrier_noop_pass(passmgr); - /* This pass should eliminate all the load and store instructions. */ - LLVMAddPromoteMemoryToRegisterPass(passmgr); - LLVMAddScalarReplAggregatesPass(passmgr); - LLVMAddLICMPass(passmgr); - LLVMAddAggressiveDCEPass(passmgr); - LLVMAddCFGSimplificationPass(passmgr); - /* This is recommended by the instruction combining pass. */ - LLVMAddEarlyCSEMemSSAPass(passmgr); - LLVMAddInstructionCombiningPass(passmgr); - return passmgr; + LLVMPassManagerRef passmgr = LLVMCreatePassManager(); + if (!passmgr) + return NULL; + + if (target_library_info) + LLVMAddTargetLibraryInfo(target_library_info, passmgr); + + if (check_ir) + LLVMAddVerifierPass(passmgr); + LLVMAddAlwaysInlinerPass(passmgr); + /* Normally, the pass manager runs all passes on one function before + * moving onto another. Adding a barrier no-op pass forces the pass + * manager to run the inliner on all functions first, which makes sure + * that the following passes are only run on the remaining non-inline + * function, so it removes useless work done on dead inline functions. + */ + ac_llvm_add_barrier_noop_pass(passmgr); + /* This pass should eliminate all the load and store instructions. */ + LLVMAddPromoteMemoryToRegisterPass(passmgr); + LLVMAddScalarReplAggregatesPass(passmgr); + LLVMAddLICMPass(passmgr); + LLVMAddAggressiveDCEPass(passmgr); + LLVMAddCFGSimplificationPass(passmgr); + /* This is recommended by the instruction combining pass. 
*/ + LLVMAddEarlyCSEMemSSAPass(passmgr); + LLVMAddInstructionCombiningPass(passmgr); + return passmgr; } static const char *attr_to_str(enum ac_func_attr attr) { switch (attr) { - case AC_FUNC_ATTR_ALWAYSINLINE: return "alwaysinline"; - case AC_FUNC_ATTR_INREG: return "inreg"; - case AC_FUNC_ATTR_NOALIAS: return "noalias"; - case AC_FUNC_ATTR_NOUNWIND: return "nounwind"; - case AC_FUNC_ATTR_READNONE: return "readnone"; - case AC_FUNC_ATTR_READONLY: return "readonly"; - case AC_FUNC_ATTR_WRITEONLY: return "writeonly"; - case AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY: return "inaccessiblememonly"; - case AC_FUNC_ATTR_CONVERGENT: return "convergent"; + case AC_FUNC_ATTR_ALWAYSINLINE: + return "alwaysinline"; + case AC_FUNC_ATTR_INREG: + return "inreg"; + case AC_FUNC_ATTR_NOALIAS: + return "noalias"; + case AC_FUNC_ATTR_NOUNWIND: + return "nounwind"; + case AC_FUNC_ATTR_READNONE: + return "readnone"; + case AC_FUNC_ATTR_READONLY: + return "readonly"; + case AC_FUNC_ATTR_WRITEONLY: + return "writeonly"; + case AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY: + return "inaccessiblememonly"; + case AC_FUNC_ATTR_CONVERGENT: + return "convergent"; default: - fprintf(stderr, "Unhandled function attribute: %x\n", attr); - return 0; + fprintf(stderr, "Unhandled function attribute: %x\n", attr); + return 0; } } -void -ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function, - int attr_idx, enum ac_func_attr attr) +void ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function, int attr_idx, + enum ac_func_attr attr) { const char *attr_name = attr_to_str(attr); - unsigned kind_id = LLVMGetEnumAttributeKindForName(attr_name, - strlen(attr_name)); + unsigned kind_id = LLVMGetEnumAttributeKindForName(attr_name, strlen(attr_name)); LLVMAttributeRef llvm_attr = LLVMCreateEnumAttribute(ctx, kind_id, 0); if (LLVMIsAFunction(function)) @@ -282,138 +284,124 @@ ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function, LLVMAddCallSiteAttribute(function, attr_idx, llvm_attr); } -void ac_add_func_attributes(LLVMContextRef ctx, LLVMValueRef function, - unsigned attrib_mask) +void ac_add_func_attributes(LLVMContextRef ctx, LLVMValueRef function, unsigned attrib_mask) { - attrib_mask |= AC_FUNC_ATTR_NOUNWIND; - attrib_mask &= ~AC_FUNC_ATTR_LEGACY; + attrib_mask |= AC_FUNC_ATTR_NOUNWIND; + attrib_mask &= ~AC_FUNC_ATTR_LEGACY; - while (attrib_mask) { - enum ac_func_attr attr = 1u << u_bit_scan(&attrib_mask); - ac_add_function_attr(ctx, function, -1, attr); - } + while (attrib_mask) { + enum ac_func_attr attr = 1u << u_bit_scan(&attrib_mask); + ac_add_function_attr(ctx, function, -1, attr); + } } -void -ac_dump_module(LLVMModuleRef module) +void ac_dump_module(LLVMModuleRef module) { - char *str = LLVMPrintModuleToString(module); - fprintf(stderr, "%s", str); - LLVMDisposeMessage(str); + char *str = LLVMPrintModuleToString(module); + fprintf(stderr, "%s", str); + LLVMDisposeMessage(str); } -void -ac_llvm_add_target_dep_function_attr(LLVMValueRef F, - const char *name, unsigned value) +void ac_llvm_add_target_dep_function_attr(LLVMValueRef F, const char *name, unsigned value) { - char str[16]; + char str[16]; - snprintf(str, sizeof(str), "0x%x", value); - LLVMAddTargetDependentFunctionAttr(F, name, str); + snprintf(str, sizeof(str), "0x%x", value); + LLVMAddTargetDependentFunctionAttr(F, name, str); } void ac_llvm_set_workgroup_size(LLVMValueRef F, unsigned size) { - if (!size) - return; + if (!size) + return; - char str[32]; - snprintf(str, sizeof(str), "%u,%u", size, size); - LLVMAddTargetDependentFunctionAttr(F, 
"amdgpu-flat-work-group-size", str); + char str[32]; + snprintf(str, sizeof(str), "%u,%u", size, size); + LLVMAddTargetDependentFunctionAttr(F, "amdgpu-flat-work-group-size", str); } -unsigned -ac_count_scratch_private_memory(LLVMValueRef function) +unsigned ac_count_scratch_private_memory(LLVMValueRef function) { - unsigned private_mem_vgprs = 0; - - /* Process all LLVM instructions. */ - LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(function); - while (bb) { - LLVMValueRef next = LLVMGetFirstInstruction(bb); - - while (next) { - LLVMValueRef inst = next; - next = LLVMGetNextInstruction(next); - - if (LLVMGetInstructionOpcode(inst) != LLVMAlloca) - continue; - - LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); - /* No idea why LLVM aligns allocas to 4 elements. */ - unsigned alignment = LLVMGetAlignment(inst); - unsigned dw_size = align(ac_get_type_size(type) / 4, alignment); - private_mem_vgprs += dw_size; - } - bb = LLVMGetNextBasicBlock(bb); - } - - return private_mem_vgprs; + unsigned private_mem_vgprs = 0; + + /* Process all LLVM instructions. */ + LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(function); + while (bb) { + LLVMValueRef next = LLVMGetFirstInstruction(bb); + + while (next) { + LLVMValueRef inst = next; + next = LLVMGetNextInstruction(next); + + if (LLVMGetInstructionOpcode(inst) != LLVMAlloca) + continue; + + LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); + /* No idea why LLVM aligns allocas to 4 elements. */ + unsigned alignment = LLVMGetAlignment(inst); + unsigned dw_size = align(ac_get_type_size(type) / 4, alignment); + private_mem_vgprs += dw_size; + } + bb = LLVMGetNextBasicBlock(bb); + } + + return private_mem_vgprs; } -bool -ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, - enum radeon_family family, - enum ac_target_machine_options tm_options) +bool ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, enum radeon_family family, + enum ac_target_machine_options tm_options) { - const char *triple; - memset(compiler, 0, sizeof(*compiler)); - - compiler->tm = ac_create_target_machine(family, tm_options, - LLVMCodeGenLevelDefault, - &triple); - if (!compiler->tm) - return false; - - if (tm_options & AC_TM_CREATE_LOW_OPT) { - compiler->low_opt_tm = - ac_create_target_machine(family, tm_options, - LLVMCodeGenLevelLess, NULL); - if (!compiler->low_opt_tm) - goto fail; - } - - if (family >= CHIP_NAVI10) { - assert(!(tm_options & AC_TM_CREATE_LOW_OPT)); - compiler->tm_wave32 = ac_create_target_machine(family, - tm_options | AC_TM_WAVE32, - LLVMCodeGenLevelDefault, - NULL); - if (!compiler->tm_wave32) - goto fail; - } - - compiler->target_library_info = - ac_create_target_library_info(triple); - if (!compiler->target_library_info) - goto fail; - - compiler->passmgr = ac_create_passmgr(compiler->target_library_info, - tm_options & AC_TM_CHECK_IR); - if (!compiler->passmgr) - goto fail; - - return true; + const char *triple; + memset(compiler, 0, sizeof(*compiler)); + + compiler->tm = ac_create_target_machine(family, tm_options, LLVMCodeGenLevelDefault, &triple); + if (!compiler->tm) + return false; + + if (tm_options & AC_TM_CREATE_LOW_OPT) { + compiler->low_opt_tm = + ac_create_target_machine(family, tm_options, LLVMCodeGenLevelLess, NULL); + if (!compiler->low_opt_tm) + goto fail; + } + + if (family >= CHIP_NAVI10) { + assert(!(tm_options & AC_TM_CREATE_LOW_OPT)); + compiler->tm_wave32 = + ac_create_target_machine(family, tm_options | AC_TM_WAVE32, LLVMCodeGenLevelDefault, NULL); + if (!compiler->tm_wave32) + goto fail; + } + + 
compiler->target_library_info = ac_create_target_library_info(triple); + if (!compiler->target_library_info) + goto fail; + + compiler->passmgr = + ac_create_passmgr(compiler->target_library_info, tm_options & AC_TM_CHECK_IR); + if (!compiler->passmgr) + goto fail; + + return true; fail: - ac_destroy_llvm_compiler(compiler); - return false; + ac_destroy_llvm_compiler(compiler); + return false; } -void -ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler) +void ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler) { - ac_destroy_llvm_passes(compiler->passes); - ac_destroy_llvm_passes(compiler->passes_wave32); - ac_destroy_llvm_passes(compiler->low_opt_passes); - - if (compiler->passmgr) - LLVMDisposePassManager(compiler->passmgr); - if (compiler->target_library_info) - ac_dispose_target_library_info(compiler->target_library_info); - if (compiler->low_opt_tm) - LLVMDisposeTargetMachine(compiler->low_opt_tm); - if (compiler->tm) - LLVMDisposeTargetMachine(compiler->tm); - if (compiler->tm_wave32) - LLVMDisposeTargetMachine(compiler->tm_wave32); + ac_destroy_llvm_passes(compiler->passes); + ac_destroy_llvm_passes(compiler->passes_wave32); + ac_destroy_llvm_passes(compiler->low_opt_passes); + + if (compiler->passmgr) + LLVMDisposePassManager(compiler->passmgr); + if (compiler->target_library_info) + ac_dispose_target_library_info(compiler->target_library_info); + if (compiler->low_opt_tm) + LLVMDisposeTargetMachine(compiler->low_opt_tm); + if (compiler->tm) + LLVMDisposeTargetMachine(compiler->tm); + if (compiler->tm_wave32) + LLVMDisposeTargetMachine(compiler->tm_wave32); } diff --git a/src/amd/llvm/ac_llvm_util.h b/src/amd/llvm/ac_llvm_util.h index d44d4deab87..4b1754ea3b8 100644 --- a/src/amd/llvm/ac_llvm_util.h +++ b/src/amd/llvm/ac_llvm_util.h @@ -26,11 +26,11 @@ #ifndef AC_LLVM_UTIL_H #define AC_LLVM_UTIL_H -#include +#include "amd_family.h" #include #include -#include "amd_family.h" +#include #ifdef __cplusplus extern "C" { @@ -39,124 +39,117 @@ extern "C" { struct ac_compiler_passes; struct ac_llvm_context; -enum ac_func_attr { - AC_FUNC_ATTR_ALWAYSINLINE = (1 << 0), - AC_FUNC_ATTR_INREG = (1 << 2), - AC_FUNC_ATTR_NOALIAS = (1 << 3), - AC_FUNC_ATTR_NOUNWIND = (1 << 4), - AC_FUNC_ATTR_READNONE = (1 << 5), - AC_FUNC_ATTR_READONLY = (1 << 6), - AC_FUNC_ATTR_WRITEONLY = (1 << 7), - AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY = (1 << 8), - AC_FUNC_ATTR_CONVERGENT = (1 << 9), - - /* Legacy intrinsic that needs attributes on function declarations - * and they must match the internal LLVM definition exactly, otherwise - * intrinsic selection fails. - */ - AC_FUNC_ATTR_LEGACY = (1u << 31), +enum ac_func_attr +{ + AC_FUNC_ATTR_ALWAYSINLINE = (1 << 0), + AC_FUNC_ATTR_INREG = (1 << 2), + AC_FUNC_ATTR_NOALIAS = (1 << 3), + AC_FUNC_ATTR_NOUNWIND = (1 << 4), + AC_FUNC_ATTR_READNONE = (1 << 5), + AC_FUNC_ATTR_READONLY = (1 << 6), + AC_FUNC_ATTR_WRITEONLY = (1 << 7), + AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY = (1 << 8), + AC_FUNC_ATTR_CONVERGENT = (1 << 9), + + /* Legacy intrinsic that needs attributes on function declarations + * and they must match the internal LLVM definition exactly, otherwise + * intrinsic selection fails. 
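+    * (This bit is never passed on to LLVM: ac_add_func_attributes()
+    * masks it out before translating the remaining bits.)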
+ */
+   AC_FUNC_ATTR_LEGACY = (1u << 31),
 };

-enum ac_target_machine_options {
-   AC_TM_SUPPORTS_SPILL = (1 << 0),
-   AC_TM_FORCE_ENABLE_XNACK = (1 << 1),
-   AC_TM_FORCE_DISABLE_XNACK = (1 << 2),
-   AC_TM_PROMOTE_ALLOCA_TO_SCRATCH = (1 << 3),
-   AC_TM_CHECK_IR = (1 << 4),
-   AC_TM_ENABLE_GLOBAL_ISEL = (1 << 5),
-   AC_TM_CREATE_LOW_OPT = (1 << 6),
-   AC_TM_WAVE32 = (1 << 7),
+enum ac_target_machine_options
+{
+   AC_TM_SUPPORTS_SPILL = (1 << 0),
+   AC_TM_FORCE_ENABLE_XNACK = (1 << 1),
+   AC_TM_FORCE_DISABLE_XNACK = (1 << 2),
+   AC_TM_PROMOTE_ALLOCA_TO_SCRATCH = (1 << 3),
+   AC_TM_CHECK_IR = (1 << 4),
+   AC_TM_ENABLE_GLOBAL_ISEL = (1 << 5),
+   AC_TM_CREATE_LOW_OPT = (1 << 6),
+   AC_TM_WAVE32 = (1 << 7),
 };

-enum ac_float_mode {
-   AC_FLOAT_MODE_DEFAULT,
-   AC_FLOAT_MODE_DEFAULT_OPENGL,
-   AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO,
+enum ac_float_mode
+{
+   AC_FLOAT_MODE_DEFAULT,
+   AC_FLOAT_MODE_DEFAULT_OPENGL,
+   AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO,
 };

 /* Per-thread persistent LLVM objects. */
 struct ac_llvm_compiler {
-   LLVMTargetLibraryInfoRef target_library_info;
-   LLVMPassManagerRef passmgr;
-
-   /* Default compiler. */
-   LLVMTargetMachineRef tm;
-   struct ac_compiler_passes *passes;
-
-   /* Wave32 compiler for GFX10. */
-   LLVMTargetMachineRef tm_wave32;
-   struct ac_compiler_passes *passes_wave32;
-
-   /* Optional compiler for faster compilation with fewer optimizations.
-    * LLVM modules can be created with "tm" too. There is no difference.
-    */
-   LLVMTargetMachineRef low_opt_tm; /* uses -O1 instead of -O2 */
-   struct ac_compiler_passes *low_opt_passes;
+   LLVMTargetLibraryInfoRef target_library_info;
+   LLVMPassManagerRef passmgr;
+
+   /* Default compiler. */
+   LLVMTargetMachineRef tm;
+   struct ac_compiler_passes *passes;
+
+   /* Wave32 compiler for GFX10. */
+   LLVMTargetMachineRef tm_wave32;
+   struct ac_compiler_passes *passes_wave32;
+
+   /* Optional compiler for faster compilation with fewer optimizations.
+    * LLVM modules can be created with "tm" too. There is no difference.
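+    * (Only created when AC_TM_CREATE_LOW_OPT is set; ac_init_llvm_compiler()
+    * builds it with LLVMCodeGenLevelLess, i.e. -O1.)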
+ */ + LLVMTargetMachineRef low_opt_tm; /* uses -O1 instead of -O2 */ + struct ac_compiler_passes *low_opt_passes; }; const char *ac_get_llvm_processor_name(enum radeon_family family); void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes); void ac_add_attr_alignment(LLVMValueRef val, uint64_t bytes); bool ac_is_sgpr_param(LLVMValueRef param); -void ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function, - int attr_idx, enum ac_func_attr attr); -void ac_add_func_attributes(LLVMContextRef ctx, LLVMValueRef function, - unsigned attrib_mask); +void ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function, int attr_idx, + enum ac_func_attr attr); +void ac_add_func_attributes(LLVMContextRef ctx, LLVMValueRef function, unsigned attrib_mask); void ac_dump_module(LLVMModuleRef module); LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call); bool ac_llvm_is_function(LLVMValueRef v); LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx); -LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, - enum ac_float_mode float_mode); +LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, enum ac_float_mode float_mode); void ac_enable_signed_zeros(struct ac_llvm_context *ctx); void ac_disable_signed_zeros(struct ac_llvm_context *ctx); -void -ac_llvm_add_target_dep_function_attr(LLVMValueRef F, - const char *name, unsigned value); +void ac_llvm_add_target_dep_function_attr(LLVMValueRef F, const char *name, unsigned value); void ac_llvm_set_workgroup_size(LLVMValueRef F, unsigned size); -static inline unsigned -ac_get_load_intr_attribs(bool can_speculate) +static inline unsigned ac_get_load_intr_attribs(bool can_speculate) { - /* READNONE means writes can't affect it, while READONLY means that - * writes can affect it. */ - return can_speculate ? AC_FUNC_ATTR_READNONE : - AC_FUNC_ATTR_READONLY; + /* READNONE means writes can't affect it, while READONLY means that + * writes can affect it. */ + return can_speculate ? AC_FUNC_ATTR_READNONE : AC_FUNC_ATTR_READONLY; } -unsigned -ac_count_scratch_private_memory(LLVMValueRef function); +unsigned ac_count_scratch_private_memory(LLVMValueRef function); LLVMTargetLibraryInfoRef ac_create_target_library_info(const char *triple); void ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info); void ac_init_shared_llvm_once(void); /* Do not use directly, use ac_init_llvm_once */ void ac_init_llvm_once(void); - -bool ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, - enum radeon_family family, - enum ac_target_machine_options tm_options); +bool ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, enum radeon_family family, + enum ac_target_machine_options tm_options); void ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler); struct ac_compiler_passes *ac_create_llvm_passes(LLVMTargetMachineRef tm); void ac_destroy_llvm_passes(struct ac_compiler_passes *p); bool ac_compile_module_to_elf(struct ac_compiler_passes *p, LLVMModuleRef module, - char **pelf_buffer, size_t *pelf_size); + char **pelf_buffer, size_t *pelf_size); void ac_llvm_add_barrier_noop_pass(LLVMPassManagerRef passmgr); void ac_enable_global_isel(LLVMTargetMachineRef tm); -static inline bool -ac_has_vec3_support(enum chip_class chip, bool use_format) +static inline bool ac_has_vec3_support(enum chip_class chip, bool use_format) { - if (chip == GFX6 && !use_format) { - /* GFX6 only supports vec3 with load/store format. */ - return false; - } + if (chip == GFX6 && !use_format) { + /* GFX6 only supports vec3 with load/store format. 
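+       * (Raw 3-dword buffer loads and stores only appeared on GFX7, so
+       * untyped vec3 access on GFX6 has no matching instruction.)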
*/ + return false; + } - return LLVM_VERSION_MAJOR >= 9; + return LLVM_VERSION_MAJOR >= 9; } #ifdef __cplusplus diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 9ea32c70b66..85a3858f439 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -21,526 +21,475 @@ * IN THE SOFTWARE. */ -#include - #include "ac_nir_to_llvm.h" + +#include "ac_binary.h" #include "ac_llvm_build.h" #include "ac_llvm_util.h" -#include "ac_binary.h" -#include "sid.h" +#include "ac_shader_abi.h" +#include "ac_shader_util.h" #include "nir/nir.h" #include "nir/nir_deref.h" +#include "sid.h" #include "util/bitscan.h" #include "util/u_math.h" -#include "ac_shader_abi.h" -#include "ac_shader_util.h" +#include struct ac_nir_context { - struct ac_llvm_context ac; - struct ac_shader_abi *abi; - const struct ac_shader_args *args; + struct ac_llvm_context ac; + struct ac_shader_abi *abi; + const struct ac_shader_args *args; - gl_shader_stage stage; - shader_info *info; + gl_shader_stage stage; + shader_info *info; - LLVMValueRef *ssa_defs; + LLVMValueRef *ssa_defs; - LLVMValueRef scratch; - LLVMValueRef constant_data; + LLVMValueRef scratch; + LLVMValueRef constant_data; - struct hash_table *defs; - struct hash_table *phis; - struct hash_table *vars; - struct hash_table *verified_interp; + struct hash_table *defs; + struct hash_table *phis; + struct hash_table *vars; + struct hash_table *verified_interp; - LLVMValueRef main_function; - LLVMBasicBlockRef continue_block; - LLVMBasicBlockRef break_block; + LLVMValueRef main_function; + LLVMBasicBlockRef continue_block; + LLVMBasicBlockRef break_block; - int num_locals; - LLVMValueRef *locals; + int num_locals; + LLVMValueRef *locals; }; -static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, - nir_deref_instr *deref_instr, - const nir_instr *instr, - bool image); - -static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, - nir_deref_instr *deref_instr, - enum ac_descriptor_type desc_type, - const nir_instr *instr, - LLVMValueRef index, - bool image, bool write); - -static void -build_store_values_extended(struct ac_llvm_context *ac, - LLVMValueRef *values, - unsigned value_count, - unsigned value_stride, - LLVMValueRef vec) +static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, nir_deref_instr *deref_instr, + const nir_instr *instr, bool image); + +static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, nir_deref_instr *deref_instr, + enum ac_descriptor_type desc_type, const nir_instr *instr, + LLVMValueRef index, bool image, bool write); + +static void build_store_values_extended(struct ac_llvm_context *ac, LLVMValueRef *values, + unsigned value_count, unsigned value_stride, + LLVMValueRef vec) { - LLVMBuilderRef builder = ac->builder; - unsigned i; - - for (i = 0; i < value_count; i++) { - LLVMValueRef ptr = values[i * value_stride]; - LLVMValueRef index = LLVMConstInt(ac->i32, i, false); - LLVMValueRef value = LLVMBuildExtractElement(builder, vec, index, ""); - LLVMBuildStore(builder, value, ptr); - } + LLVMBuilderRef builder = ac->builder; + unsigned i; + + for (i = 0; i < value_count; i++) { + LLVMValueRef ptr = values[i * value_stride]; + LLVMValueRef index = LLVMConstInt(ac->i32, i, false); + LLVMValueRef value = LLVMBuildExtractElement(builder, vec, index, ""); + LLVMBuildStore(builder, value, ptr); + } } -static LLVMTypeRef get_def_type(struct ac_nir_context *ctx, - const nir_ssa_def *def) +static LLVMTypeRef get_def_type(struct ac_nir_context *ctx, const 
nir_ssa_def *def) { - LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, def->bit_size); - if (def->num_components > 1) { - type = LLVMVectorType(type, def->num_components); - } - return type; + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, def->bit_size); + if (def->num_components > 1) { + type = LLVMVectorType(type, def->num_components); + } + return type; } static LLVMValueRef get_src(struct ac_nir_context *nir, nir_src src) { - assert(src.is_ssa); - return nir->ssa_defs[src.ssa->index]; + assert(src.is_ssa); + return nir->ssa_defs[src.ssa->index]; } -static LLVMValueRef -get_memory_ptr(struct ac_nir_context *ctx, nir_src src, unsigned bit_size) +static LLVMValueRef get_memory_ptr(struct ac_nir_context *ctx, nir_src src, unsigned bit_size) { - LLVMValueRef ptr = get_src(ctx, src); - ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ac.lds, &ptr, 1, ""); - int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + LLVMValueRef ptr = get_src(ctx, src); + ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ac.lds, &ptr, 1, ""); + int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); - LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, bit_size); + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, bit_size); - return LLVMBuildBitCast(ctx->ac.builder, ptr, - LLVMPointerType(type, addr_space), ""); + return LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(type, addr_space), ""); } -static LLVMBasicBlockRef get_block(struct ac_nir_context *nir, - const struct nir_block *b) +static LLVMBasicBlockRef get_block(struct ac_nir_context *nir, const struct nir_block *b) { - struct hash_entry *entry = _mesa_hash_table_search(nir->defs, b); - return (LLVMBasicBlockRef)entry->data; + struct hash_entry *entry = _mesa_hash_table_search(nir->defs, b); + return (LLVMBasicBlockRef)entry->data; } -static LLVMValueRef get_alu_src(struct ac_nir_context *ctx, - nir_alu_src src, +static LLVMValueRef get_alu_src(struct ac_nir_context *ctx, nir_alu_src src, unsigned num_components) { - LLVMValueRef value = get_src(ctx, src.src); - bool need_swizzle = false; - - assert(value); - unsigned src_components = ac_get_llvm_num_components(value); - for (unsigned i = 0; i < num_components; ++i) { - assert(src.swizzle[i] < src_components); - if (src.swizzle[i] != i) - need_swizzle = true; - } - - if (need_swizzle || num_components != src_components) { - LLVMValueRef masks[] = { - LLVMConstInt(ctx->ac.i32, src.swizzle[0], false), - LLVMConstInt(ctx->ac.i32, src.swizzle[1], false), - LLVMConstInt(ctx->ac.i32, src.swizzle[2], false), - LLVMConstInt(ctx->ac.i32, src.swizzle[3], false)}; - - if (src_components > 1 && num_components == 1) { - value = LLVMBuildExtractElement(ctx->ac.builder, value, - masks[0], ""); - } else if (src_components == 1 && num_components > 1) { - LLVMValueRef values[] = {value, value, value, value}; - value = ac_build_gather_values(&ctx->ac, values, num_components); - } else { - LLVMValueRef swizzle = LLVMConstVector(masks, num_components); - value = LLVMBuildShuffleVector(ctx->ac.builder, value, value, - swizzle, ""); - } - } - assert(!src.negate); - assert(!src.abs); - return value; + LLVMValueRef value = get_src(ctx, src.src); + bool need_swizzle = false; + + assert(value); + unsigned src_components = ac_get_llvm_num_components(value); + for (unsigned i = 0; i < num_components; ++i) { + assert(src.swizzle[i] < src_components); + if (src.swizzle[i] != i) + need_swizzle = true; + } + + if (need_swizzle || num_components != src_components) { + LLVMValueRef masks[] = 
{LLVMConstInt(ctx->ac.i32, src.swizzle[0], false), + LLVMConstInt(ctx->ac.i32, src.swizzle[1], false), + LLVMConstInt(ctx->ac.i32, src.swizzle[2], false), + LLVMConstInt(ctx->ac.i32, src.swizzle[3], false)}; + + if (src_components > 1 && num_components == 1) { + value = LLVMBuildExtractElement(ctx->ac.builder, value, masks[0], ""); + } else if (src_components == 1 && num_components > 1) { + LLVMValueRef values[] = {value, value, value, value}; + value = ac_build_gather_values(&ctx->ac, values, num_components); + } else { + LLVMValueRef swizzle = LLVMConstVector(masks, num_components); + value = LLVMBuildShuffleVector(ctx->ac.builder, value, value, swizzle, ""); + } + } + assert(!src.negate); + assert(!src.abs); + return value; } -static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx, - LLVMIntPredicate pred, LLVMValueRef src0, - LLVMValueRef src1) +static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx, LLVMIntPredicate pred, + LLVMValueRef src0, LLVMValueRef src1) { - LLVMTypeRef src0_type = LLVMTypeOf(src0); - LLVMTypeRef src1_type = LLVMTypeOf(src1); - - if (LLVMGetTypeKind(src0_type) == LLVMPointerTypeKind && - LLVMGetTypeKind(src1_type) != LLVMPointerTypeKind) { - src1 = LLVMBuildIntToPtr(ctx->builder, src1, src0_type, ""); - } else if (LLVMGetTypeKind(src1_type) == LLVMPointerTypeKind && - LLVMGetTypeKind(src0_type) != LLVMPointerTypeKind) { - src0 = LLVMBuildIntToPtr(ctx->builder, src0, src1_type, ""); - } - - LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, ""); - return LLVMBuildSelect(ctx->builder, result, - LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), - ctx->i32_0, ""); + LLVMTypeRef src0_type = LLVMTypeOf(src0); + LLVMTypeRef src1_type = LLVMTypeOf(src1); + + if (LLVMGetTypeKind(src0_type) == LLVMPointerTypeKind && + LLVMGetTypeKind(src1_type) != LLVMPointerTypeKind) { + src1 = LLVMBuildIntToPtr(ctx->builder, src1, src0_type, ""); + } else if (LLVMGetTypeKind(src1_type) == LLVMPointerTypeKind && + LLVMGetTypeKind(src0_type) != LLVMPointerTypeKind) { + src0 = LLVMBuildIntToPtr(ctx->builder, src0, src1_type, ""); + } + + LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, ""); + return LLVMBuildSelect(ctx->builder, result, LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), + ctx->i32_0, ""); } -static LLVMValueRef emit_float_cmp(struct ac_llvm_context *ctx, - LLVMRealPredicate pred, LLVMValueRef src0, - LLVMValueRef src1) +static LLVMValueRef emit_float_cmp(struct ac_llvm_context *ctx, LLVMRealPredicate pred, + LLVMValueRef src0, LLVMValueRef src1) { - LLVMValueRef result; - src0 = ac_to_float(ctx, src0); - src1 = ac_to_float(ctx, src1); - result = LLVMBuildFCmp(ctx->builder, pred, src0, src1, ""); - return LLVMBuildSelect(ctx->builder, result, - LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), - ctx->i32_0, ""); + LLVMValueRef result; + src0 = ac_to_float(ctx, src0); + src1 = ac_to_float(ctx, src1); + result = LLVMBuildFCmp(ctx->builder, pred, src0, src1, ""); + return LLVMBuildSelect(ctx->builder, result, LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), + ctx->i32_0, ""); } -static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx, - const char *intrin, - LLVMTypeRef result_type, - LLVMValueRef src0) +static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx, const char *intrin, + LLVMTypeRef result_type, LLVMValueRef src0) { - char name[64], type[64]; - LLVMValueRef params[] = { - ac_to_float(ctx, src0), - }; - - ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type)); - ASSERTED const int length = snprintf(name, 
sizeof(name), "%s.%s", intrin, type); - assert(length < sizeof(name)); - return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE); + char name[64], type[64]; + LLVMValueRef params[] = { + ac_to_float(ctx, src0), + }; + + ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type)); + ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type); + assert(length < sizeof(name)); + return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE); } -static LLVMValueRef emit_intrin_1f_param_scalar(struct ac_llvm_context *ctx, - const char *intrin, - LLVMTypeRef result_type, - LLVMValueRef src0) +static LLVMValueRef emit_intrin_1f_param_scalar(struct ac_llvm_context *ctx, const char *intrin, + LLVMTypeRef result_type, LLVMValueRef src0) { - if (LLVMGetTypeKind(result_type) != LLVMVectorTypeKind) - return emit_intrin_1f_param(ctx, intrin, result_type, src0); - - LLVMTypeRef elem_type = LLVMGetElementType(result_type); - LLVMValueRef ret = LLVMGetUndef(result_type); - - /* Scalarize the intrinsic, because vectors are not supported. */ - for (unsigned i = 0; i < LLVMGetVectorSize(result_type); i++) { - char name[64], type[64]; - LLVMValueRef params[] = { - ac_to_float(ctx, ac_llvm_extract_elem(ctx, src0, i)), - }; - - ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type)); - ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type); - assert(length < sizeof(name)); - ret = LLVMBuildInsertElement(ctx->builder, ret, - ac_build_intrinsic(ctx, name, elem_type, params, - 1, AC_FUNC_ATTR_READNONE), - LLVMConstInt(ctx->i32, i, 0), ""); - } - return ret; + if (LLVMGetTypeKind(result_type) != LLVMVectorTypeKind) + return emit_intrin_1f_param(ctx, intrin, result_type, src0); + + LLVMTypeRef elem_type = LLVMGetElementType(result_type); + LLVMValueRef ret = LLVMGetUndef(result_type); + + /* Scalarize the intrinsic, because vectors are not supported. 
*/ + for (unsigned i = 0; i < LLVMGetVectorSize(result_type); i++) { + char name[64], type[64]; + LLVMValueRef params[] = { + ac_to_float(ctx, ac_llvm_extract_elem(ctx, src0, i)), + }; + + ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type)); + ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type); + assert(length < sizeof(name)); + ret = LLVMBuildInsertElement( + ctx->builder, ret, + ac_build_intrinsic(ctx, name, elem_type, params, 1, AC_FUNC_ATTR_READNONE), + LLVMConstInt(ctx->i32, i, 0), ""); + } + return ret; } -static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx, - const char *intrin, - LLVMTypeRef result_type, - LLVMValueRef src0, LLVMValueRef src1) +static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx, const char *intrin, + LLVMTypeRef result_type, LLVMValueRef src0, + LLVMValueRef src1) { - char name[64], type[64]; - LLVMValueRef params[] = { - ac_to_float(ctx, src0), - ac_to_float(ctx, src1), - }; - - ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type)); - ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type); - assert(length < sizeof(name)); - return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE); + char name[64], type[64]; + LLVMValueRef params[] = { + ac_to_float(ctx, src0), + ac_to_float(ctx, src1), + }; + + ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type)); + ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type); + assert(length < sizeof(name)); + return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE); } -static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx, - const char *intrin, - LLVMTypeRef result_type, - LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2) +static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx, const char *intrin, + LLVMTypeRef result_type, LLVMValueRef src0, + LLVMValueRef src1, LLVMValueRef src2) { - char name[64], type[64]; - LLVMValueRef params[] = { - ac_to_float(ctx, src0), - ac_to_float(ctx, src1), - ac_to_float(ctx, src2), - }; - - ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type)); - ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type); - assert(length < sizeof(name)); - return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE); + char name[64], type[64]; + LLVMValueRef params[] = { + ac_to_float(ctx, src0), + ac_to_float(ctx, src1), + ac_to_float(ctx, src2), + }; + + ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type)); + ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type); + assert(length < sizeof(name)); + return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE); } -static LLVMValueRef emit_bcsel(struct ac_llvm_context *ctx, - LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2) +static LLVMValueRef emit_bcsel(struct ac_llvm_context *ctx, LLVMValueRef src0, LLVMValueRef src1, + LLVMValueRef src2) { - LLVMTypeRef src1_type = LLVMTypeOf(src1); - LLVMTypeRef src2_type = LLVMTypeOf(src2); - - if (LLVMGetTypeKind(src1_type) == LLVMPointerTypeKind && - LLVMGetTypeKind(src2_type) != LLVMPointerTypeKind) { - src2 = LLVMBuildIntToPtr(ctx->builder, src2, src1_type, ""); - } else if (LLVMGetTypeKind(src2_type) == LLVMPointerTypeKind && - LLVMGetTypeKind(src1_type) != LLVMPointerTypeKind) { - src1 = LLVMBuildIntToPtr(ctx->builder, src1, src2_type, 
""); - } - - LLVMValueRef v = LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, - LLVMConstNull(LLVMTypeOf(src0)), ""); - return LLVMBuildSelect(ctx->builder, v, - ac_to_integer_or_pointer(ctx, src1), - ac_to_integer_or_pointer(ctx, src2), ""); + LLVMTypeRef src1_type = LLVMTypeOf(src1); + LLVMTypeRef src2_type = LLVMTypeOf(src2); + + if (LLVMGetTypeKind(src1_type) == LLVMPointerTypeKind && + LLVMGetTypeKind(src2_type) != LLVMPointerTypeKind) { + src2 = LLVMBuildIntToPtr(ctx->builder, src2, src1_type, ""); + } else if (LLVMGetTypeKind(src2_type) == LLVMPointerTypeKind && + LLVMGetTypeKind(src1_type) != LLVMPointerTypeKind) { + src1 = LLVMBuildIntToPtr(ctx->builder, src1, src2_type, ""); + } + + LLVMValueRef v = + LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, LLVMConstNull(LLVMTypeOf(src0)), ""); + return LLVMBuildSelect(ctx->builder, v, ac_to_integer_or_pointer(ctx, src1), + ac_to_integer_or_pointer(ctx, src2), ""); } -static LLVMValueRef emit_iabs(struct ac_llvm_context *ctx, - LLVMValueRef src0) +static LLVMValueRef emit_iabs(struct ac_llvm_context *ctx, LLVMValueRef src0) { - return ac_build_imax(ctx, src0, LLVMBuildNeg(ctx->builder, src0, "")); + return ac_build_imax(ctx, src0, LLVMBuildNeg(ctx->builder, src0, "")); } -static LLVMValueRef emit_uint_carry(struct ac_llvm_context *ctx, - const char *intrin, - LLVMValueRef src0, LLVMValueRef src1) +static LLVMValueRef emit_uint_carry(struct ac_llvm_context *ctx, const char *intrin, + LLVMValueRef src0, LLVMValueRef src1) { - LLVMTypeRef ret_type; - LLVMTypeRef types[] = { ctx->i32, ctx->i1 }; - LLVMValueRef res; - LLVMValueRef params[] = { src0, src1 }; - ret_type = LLVMStructTypeInContext(ctx->context, types, - 2, true); - - res = ac_build_intrinsic(ctx, intrin, ret_type, - params, 2, AC_FUNC_ATTR_READNONE); - - res = LLVMBuildExtractValue(ctx->builder, res, 1, ""); - res = LLVMBuildZExt(ctx->builder, res, ctx->i32, ""); - return res; + LLVMTypeRef ret_type; + LLVMTypeRef types[] = {ctx->i32, ctx->i1}; + LLVMValueRef res; + LLVMValueRef params[] = {src0, src1}; + ret_type = LLVMStructTypeInContext(ctx->context, types, 2, true); + + res = ac_build_intrinsic(ctx, intrin, ret_type, params, 2, AC_FUNC_ATTR_READNONE); + + res = LLVMBuildExtractValue(ctx->builder, res, 1, ""); + res = LLVMBuildZExt(ctx->builder, res, ctx->i32, ""); + return res; } -static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx, - LLVMValueRef src0, - unsigned bitsize) +static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) { - assert(ac_get_elem_bits(ctx, LLVMTypeOf(src0)) == 32); - LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, - ac_const_uint_vec(ctx, LLVMTypeOf(src0), 0x3f800000), - ""); - result = ac_to_float(ctx, result); - - switch (bitsize) { - case 16: { - bool vec2 = LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind; - return LLVMBuildFPTrunc(ctx->builder, result, vec2 ? ctx->v2f16 : ctx->f16, ""); - } - case 32: - return result; - case 64: - return LLVMBuildFPExt(ctx->builder, result, ctx->f64, ""); - default: - unreachable("Unsupported bit size."); - } + assert(ac_get_elem_bits(ctx, LLVMTypeOf(src0)) == 32); + LLVMValueRef result = + LLVMBuildAnd(ctx->builder, src0, ac_const_uint_vec(ctx, LLVMTypeOf(src0), 0x3f800000), ""); + result = ac_to_float(ctx, result); + + switch (bitsize) { + case 16: { + bool vec2 = LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind; + return LLVMBuildFPTrunc(ctx->builder, result, vec2 ? 
ctx->v2f16 : ctx->f16, ""); + } + case 32: + return result; + case 64: + return LLVMBuildFPExt(ctx->builder, result, ctx->f64, ""); + default: + unreachable("Unsupported bit size."); + } } -static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, - LLVMValueRef src0) +static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, LLVMValueRef src0) { - src0 = ac_to_float(ctx, src0); - LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0)); - return LLVMBuildSExt(ctx->builder, - LLVMBuildFCmp(ctx->builder, LLVMRealUNE, src0, zero, ""), - ctx->i32, ""); + src0 = ac_to_float(ctx, src0); + LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0)); + return LLVMBuildSExt(ctx->builder, LLVMBuildFCmp(ctx->builder, LLVMRealUNE, src0, zero, ""), + ctx->i32, ""); } -static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, - LLVMValueRef src0, - unsigned bitsize) +static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize) { - LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, ""); - - switch (bitsize) { - case 8: - return LLVMBuildTrunc(ctx->builder, result, ctx->i8, ""); - case 16: - return LLVMBuildTrunc(ctx->builder, result, ctx->i16, ""); - case 32: - return result; - case 64: - return LLVMBuildZExt(ctx->builder, result, ctx->i64, ""); - default: - unreachable("Unsupported bit size."); - } + LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, ""); + + switch (bitsize) { + case 8: + return LLVMBuildTrunc(ctx->builder, result, ctx->i8, ""); + case 16: + return LLVMBuildTrunc(ctx->builder, result, ctx->i16, ""); + case 32: + return result; + case 64: + return LLVMBuildZExt(ctx->builder, result, ctx->i64, ""); + default: + unreachable("Unsupported bit size."); + } } -static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, - LLVMValueRef src0) +static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, LLVMValueRef src0) { - LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0)); - return LLVMBuildSExt(ctx->builder, - LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, zero, ""), - ctx->i32, ""); + LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0)); + return LLVMBuildSExt(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, zero, ""), + ctx->i32, ""); } -static LLVMValueRef emit_f2f16(struct ac_llvm_context *ctx, - LLVMValueRef src0) +static LLVMValueRef emit_f2f16(struct ac_llvm_context *ctx, LLVMValueRef src0) { - LLVMValueRef result; - LLVMValueRef cond = NULL; - - src0 = ac_to_float(ctx, src0); - result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->f16, ""); - - if (ctx->chip_class >= GFX8) { - LLVMValueRef args[2]; - /* Check if the result is a denormal - and flush to 0 if so. */ - args[0] = result; - args[1] = LLVMConstInt(ctx->i32, N_SUBNORMAL | P_SUBNORMAL, false); - cond = ac_build_intrinsic(ctx, "llvm.amdgcn.class.f16", ctx->i1, args, 2, AC_FUNC_ATTR_READNONE); - } - - /* need to convert back up to f32 */ - result = LLVMBuildFPExt(ctx->builder, result, ctx->f32, ""); - - if (ctx->chip_class >= GFX8) - result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, ""); - else { - /* for GFX6-GFX7 */ - /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, - * so compare the result and flush to 0 if it's smaller. 
- */ - LLVMValueRef temp, cond2; - temp = emit_intrin_1f_param(ctx, "llvm.fabs", ctx->f32, result); - cond = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, - LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->i32, 0x38800000, false), ctx->f32, ""), - temp, ""); - cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealONE, - temp, ctx->f32_0, ""); - cond = LLVMBuildAnd(ctx->builder, cond, cond2, ""); - result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, ""); - } - return result; + LLVMValueRef result; + LLVMValueRef cond = NULL; + + src0 = ac_to_float(ctx, src0); + result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->f16, ""); + + if (ctx->chip_class >= GFX8) { + LLVMValueRef args[2]; + /* Check if the result is a denormal - and flush to 0 if so. */ + args[0] = result; + args[1] = LLVMConstInt(ctx->i32, N_SUBNORMAL | P_SUBNORMAL, false); + cond = + ac_build_intrinsic(ctx, "llvm.amdgcn.class.f16", ctx->i1, args, 2, AC_FUNC_ATTR_READNONE); + } + + /* need to convert back up to f32 */ + result = LLVMBuildFPExt(ctx->builder, result, ctx->f32, ""); + + if (ctx->chip_class >= GFX8) + result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, ""); + else { + /* for GFX6-GFX7 */ + /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, + * so compare the result and flush to 0 if it's smaller. + */ + LLVMValueRef temp, cond2; + temp = emit_intrin_1f_param(ctx, "llvm.fabs", ctx->f32, result); + cond = LLVMBuildFCmp( + ctx->builder, LLVMRealOGT, + LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->i32, 0x38800000, false), ctx->f32, ""), + temp, ""); + cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealONE, temp, ctx->f32_0, ""); + cond = LLVMBuildAnd(ctx->builder, cond, cond2, ""); + result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, ""); + } + return result; } -static LLVMValueRef emit_umul_high(struct ac_llvm_context *ctx, - LLVMValueRef src0, LLVMValueRef src1) +static LLVMValueRef emit_umul_high(struct ac_llvm_context *ctx, LLVMValueRef src0, + LLVMValueRef src1) { - LLVMValueRef dst64, result; - src0 = LLVMBuildZExt(ctx->builder, src0, ctx->i64, ""); - src1 = LLVMBuildZExt(ctx->builder, src1, ctx->i64, ""); - - dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); - dst64 = LLVMBuildLShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); - result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); - return result; + LLVMValueRef dst64, result; + src0 = LLVMBuildZExt(ctx->builder, src0, ctx->i64, ""); + src1 = LLVMBuildZExt(ctx->builder, src1, ctx->i64, ""); + + dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); + dst64 = LLVMBuildLShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); + result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); + return result; } -static LLVMValueRef emit_imul_high(struct ac_llvm_context *ctx, - LLVMValueRef src0, LLVMValueRef src1) +static LLVMValueRef emit_imul_high(struct ac_llvm_context *ctx, LLVMValueRef src0, + LLVMValueRef src1) { - LLVMValueRef dst64, result; - src0 = LLVMBuildSExt(ctx->builder, src0, ctx->i64, ""); - src1 = LLVMBuildSExt(ctx->builder, src1, ctx->i64, ""); - - dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); - dst64 = LLVMBuildAShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); - result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); - return result; + LLVMValueRef dst64, result; + src0 = LLVMBuildSExt(ctx->builder, src0, ctx->i64, ""); + src1 = LLVMBuildSExt(ctx->builder, src1, ctx->i64, ""); + + dst64 = LLVMBuildMul(ctx->builder, src0, src1, ""); + dst64 = 
LLVMBuildAShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), ""); + result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, ""); + return result; } -static LLVMValueRef emit_bfm(struct ac_llvm_context *ctx, - LLVMValueRef bits, LLVMValueRef offset) +static LLVMValueRef emit_bfm(struct ac_llvm_context *ctx, LLVMValueRef bits, LLVMValueRef offset) { - /* mask = ((1 << bits) - 1) << offset */ - return LLVMBuildShl(ctx->builder, - LLVMBuildSub(ctx->builder, - LLVMBuildShl(ctx->builder, - ctx->i32_1, - bits, ""), - ctx->i32_1, ""), - offset, ""); + /* mask = ((1 << bits) - 1) << offset */ + return LLVMBuildShl( + ctx->builder, + LLVMBuildSub(ctx->builder, LLVMBuildShl(ctx->builder, ctx->i32_1, bits, ""), ctx->i32_1, ""), + offset, ""); } -static LLVMValueRef emit_bitfield_select(struct ac_llvm_context *ctx, - LLVMValueRef mask, LLVMValueRef insert, - LLVMValueRef base) +static LLVMValueRef emit_bitfield_select(struct ac_llvm_context *ctx, LLVMValueRef mask, + LLVMValueRef insert, LLVMValueRef base) { - /* Calculate: - * (mask & insert) | (~mask & base) = base ^ (mask & (insert ^ base)) - * Use the right-hand side, which the LLVM backend can convert to V_BFI. - */ - return LLVMBuildXor(ctx->builder, base, - LLVMBuildAnd(ctx->builder, mask, - LLVMBuildXor(ctx->builder, insert, base, ""), ""), ""); + /* Calculate: + * (mask & insert) | (~mask & base) = base ^ (mask & (insert ^ base)) + * Use the right-hand side, which the LLVM backend can convert to V_BFI. + */ + return LLVMBuildXor( + ctx->builder, base, + LLVMBuildAnd(ctx->builder, mask, LLVMBuildXor(ctx->builder, insert, base, ""), ""), ""); } -static LLVMValueRef emit_pack_2x16(struct ac_llvm_context *ctx, - LLVMValueRef src0, - LLVMValueRef (*pack)(struct ac_llvm_context *ctx, - LLVMValueRef args[2])) +static LLVMValueRef emit_pack_2x16(struct ac_llvm_context *ctx, LLVMValueRef src0, + LLVMValueRef (*pack)(struct ac_llvm_context *ctx, + LLVMValueRef args[2])) { - LLVMValueRef comp[2]; + LLVMValueRef comp[2]; - src0 = ac_to_float(ctx, src0); - comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_0, ""); - comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_1, ""); + src0 = ac_to_float(ctx, src0); + comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_0, ""); + comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_1, ""); - return LLVMBuildBitCast(ctx->builder, pack(ctx, comp), ctx->i32, ""); + return LLVMBuildBitCast(ctx->builder, pack(ctx, comp), ctx->i32, ""); } -static LLVMValueRef emit_unpack_half_2x16(struct ac_llvm_context *ctx, - LLVMValueRef src0) +static LLVMValueRef emit_unpack_half_2x16(struct ac_llvm_context *ctx, LLVMValueRef src0) { - LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false); - LLVMValueRef temps[2], val; - int i; - - for (i = 0; i < 2; i++) { - val = i == 1 ? LLVMBuildLShr(ctx->builder, src0, const16, "") : src0; - val = LLVMBuildTrunc(ctx->builder, val, ctx->i16, ""); - val = LLVMBuildBitCast(ctx->builder, val, ctx->f16, ""); - temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, ""); - } - return ac_build_gather_values(ctx, temps, 2); + LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false); + LLVMValueRef temps[2], val; + int i; + + for (i = 0; i < 2; i++) { + val = i == 1 ? 
LLVMBuildLShr(ctx->builder, src0, const16, "") : src0;
+ val = LLVMBuildTrunc(ctx->builder, val, ctx->i16, "");
+ val = LLVMBuildBitCast(ctx->builder, val, ctx->f16, "");
+ temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, "");
+ }
+ return ac_build_gather_values(ctx, temps, 2);
}

-static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx,
- nir_op op,
- LLVMValueRef src0)
+static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx, nir_op op, LLVMValueRef src0)
{
- unsigned mask;
- int idx;
- LLVMValueRef result;
-
- if (op == nir_op_fddx_fine)
- mask = AC_TID_MASK_LEFT;
- else if (op == nir_op_fddy_fine)
- mask = AC_TID_MASK_TOP;
- else
- mask = AC_TID_MASK_TOP_LEFT;
-
- /* for DDX we want the next X pixel, DDY the next Y pixel. */
- if (op == nir_op_fddx_fine ||
- op == nir_op_fddx_coarse ||
- op == nir_op_fddx)
- idx = 1;
- else
- idx = 2;
-
- result = ac_build_ddxy(&ctx->ac, mask, idx, src0);
- return result;
+ unsigned mask;
+ int idx;
+ LLVMValueRef result;
+
+ if (op == nir_op_fddx_fine)
+ mask = AC_TID_MASK_LEFT;
+ else if (op == nir_op_fddy_fine)
+ mask = AC_TID_MASK_TOP;
+ else
+ mask = AC_TID_MASK_TOP_LEFT;
+
+ /* for DDX we want the next X pixel, DDY the next Y pixel. */
+ if (op == nir_op_fddx_fine || op == nir_op_fddx_coarse || op == nir_op_fddx)
+ idx = 1;
+ else
+ idx = 2;
+
+ result = ac_build_ddxy(&ctx->ac, mask, idx, src0);
+ return result;
}

struct waterfall_context {
- LLVMBasicBlockRef phi_bb[2];
- bool use_waterfall;
+ LLVMBasicBlockRef phi_bb[2];
+ bool use_waterfall;
};

/* To deal with divergent descriptors we can create a loop that handles all
@@ -549,7 +498,7 @@ struct waterfall_context {
 *
 * These helpers create the begin and end of the loop, leaving the caller
 * to implement the body.
- *
+ *
 * params:
 * - ctx is the usual nir context
 * - wctx is a temporary struct containing some loop info. Can be left uninitialized.
 * - value is the possibly divergent value for which we built the loop
 * - divergent is whether value is actually divergent. If false we just pass
 * things through.
 */
-static LLVMValueRef enter_waterfall(struct ac_nir_context *ctx,
- struct waterfall_context *wctx,
- LLVMValueRef value, bool divergent)
+static LLVMValueRef enter_waterfall(struct ac_nir_context *ctx, struct waterfall_context *wctx,
+ LLVMValueRef value, bool divergent)
{
- /* If the app claims the value is divergent but it is constant we can
- * end up with a dynamic index of NULL. */
- if (!value)
- divergent = false;
+ /* If the app claims the value is divergent but it is constant we can
+ * end up with a dynamic index of NULL.
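+ * (Hypothetical example: if the active lanes held the values {3, 3, 7, 3},
+ * the loop built below would run twice, first with scalar_value == 3 and
+ * the three matching lanes active, then with scalar_value == 7 and the
+ * remaining lane active.)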
+ */
+ if (!value)
+ divergent = false;

- wctx->use_waterfall = divergent;
- if (!divergent)
- return value;
+ wctx->use_waterfall = divergent;
+ if (!divergent)
+ return value;

- ac_build_bgnloop(&ctx->ac, 6000);
+ ac_build_bgnloop(&ctx->ac, 6000);

- LLVMValueRef scalar_value = ac_build_readlane(&ctx->ac, value, NULL);
+ LLVMValueRef scalar_value = ac_build_readlane(&ctx->ac, value, NULL);

- LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, value,
- scalar_value, "uniform_active");
+ LLVMValueRef active =
+ LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, value, scalar_value, "uniform_active");

- wctx->phi_bb[0] = LLVMGetInsertBlock(ctx->ac.builder);
- ac_build_ifcc(&ctx->ac, active, 6001);
+ wctx->phi_bb[0] = LLVMGetInsertBlock(ctx->ac.builder);
+ ac_build_ifcc(&ctx->ac, active, 6001);

- return scalar_value;
+ return scalar_value;
}

-static LLVMValueRef exit_waterfall(struct ac_nir_context *ctx,
- struct waterfall_context *wctx,
- LLVMValueRef value)
+static LLVMValueRef exit_waterfall(struct ac_nir_context *ctx, struct waterfall_context *wctx,
+ LLVMValueRef value)
{
- LLVMValueRef ret = NULL;
- LLVMValueRef phi_src[2];
- LLVMValueRef cc_phi_src[2] = {
- LLVMConstInt(ctx->ac.i32, 0, false),
- LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
- };
-
- if (!wctx->use_waterfall)
- return value;
-
- wctx->phi_bb[1] = LLVMGetInsertBlock(ctx->ac.builder);
-
- ac_build_endif(&ctx->ac, 6001);
-
- if (value) {
- phi_src[0] = LLVMGetUndef(LLVMTypeOf(value));
- phi_src[1] = value;
-
- ret = ac_build_phi(&ctx->ac, LLVMTypeOf(value), 2, phi_src, wctx->phi_bb);
- }
-
- /*
- * By using the optimization barrier on the exit decision, we decouple
- * the operations from the break, and hence avoid LLVM hoisting the
- * operation into the break block.
- */
- LLVMValueRef cc = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, cc_phi_src, wctx->phi_bb);
- ac_build_optimization_barrier(&ctx->ac, &cc);
-
- LLVMValueRef active = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, cc, ctx->ac.i32_0, "uniform_active2");
- ac_build_ifcc(&ctx->ac, active, 6002);
- ac_build_break(&ctx->ac);
- ac_build_endif(&ctx->ac, 6002);
-
- ac_build_endloop(&ctx->ac, 6000);
- return ret;
+ LLVMValueRef ret = NULL;
+ LLVMValueRef phi_src[2];
+ LLVMValueRef cc_phi_src[2] = {
+ LLVMConstInt(ctx->ac.i32, 0, false),
+ LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
+ };
+
+ if (!wctx->use_waterfall)
+ return value;
+
+ wctx->phi_bb[1] = LLVMGetInsertBlock(ctx->ac.builder);
+
+ ac_build_endif(&ctx->ac, 6001);
+
+ if (value) {
+ phi_src[0] = LLVMGetUndef(LLVMTypeOf(value));
+ phi_src[1] = value;
+
+ ret = ac_build_phi(&ctx->ac, LLVMTypeOf(value), 2, phi_src, wctx->phi_bb);
+ }
+
+ /*
+ * By using the optimization barrier on the exit decision, we decouple
+ * the operations from the break, and hence avoid LLVM hoisting the
+ * operation into the break block.
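+ * (The phi below yields 0 on the edge that skipped the loop body and
+ * 0xffffffff on the edge that ran it, so the "cc != 0" test only lets
+ * lanes that completed their iteration take the break.)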
+ */ + LLVMValueRef cc = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, cc_phi_src, wctx->phi_bb); + ac_build_optimization_barrier(&ctx->ac, &cc); + + LLVMValueRef active = + LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, cc, ctx->ac.i32_0, "uniform_active2"); + ac_build_ifcc(&ctx->ac, active, 6002); + ac_build_break(&ctx->ac); + ac_build_endif(&ctx->ac, 6002); + + ac_build_endloop(&ctx->ac, 6000); + return ret; } static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) { - LLVMValueRef src[4], result = NULL; - unsigned num_components = instr->dest.dest.ssa.num_components; - unsigned src_components; - LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa); - - assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src)); - switch (instr->op) { - case nir_op_vec2: - case nir_op_vec3: - case nir_op_vec4: - src_components = 1; - break; - case nir_op_pack_half_2x16: - case nir_op_pack_snorm_2x16: - case nir_op_pack_unorm_2x16: - src_components = 2; - break; - case nir_op_unpack_half_2x16: - src_components = 1; - break; - case nir_op_cube_face_coord: - case nir_op_cube_face_index: - src_components = 3; - break; - default: - src_components = num_components; - break; - } - for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - src[i] = get_alu_src(ctx, instr->src[i], src_components); - - switch (instr->op) { - case nir_op_mov: - result = src[0]; - break; - case nir_op_fneg: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = LLVMBuildFNeg(ctx->ac.builder, src[0], ""); - if (ctx->ac.float_mode == AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO) { - /* fneg will be optimized by backend compiler with sign - * bit removed via XOR. This is probably a LLVM bug. - */ - result = ac_build_canonicalize(&ctx->ac, result, - instr->dest.dest.ssa.bit_size); - } - break; - case nir_op_ineg: - result = LLVMBuildNeg(ctx->ac.builder, src[0], ""); - break; - case nir_op_inot: - result = LLVMBuildNot(ctx->ac.builder, src[0], ""); - break; - case nir_op_iadd: - result = LLVMBuildAdd(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_fadd: - src[0] = ac_to_float(&ctx->ac, src[0]); - src[1] = ac_to_float(&ctx->ac, src[1]); - result = LLVMBuildFAdd(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_fsub: - src[0] = ac_to_float(&ctx->ac, src[0]); - src[1] = ac_to_float(&ctx->ac, src[1]); - result = LLVMBuildFSub(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_isub: - result = LLVMBuildSub(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_imul: - result = LLVMBuildMul(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_imod: - result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_umod: - result = LLVMBuildURem(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_irem: - result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_idiv: - result = LLVMBuildSDiv(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_udiv: - result = LLVMBuildUDiv(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_fmul: - src[0] = ac_to_float(&ctx->ac, src[0]); - src[1] = ac_to_float(&ctx->ac, src[1]); - result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_frcp: - /* For doubles, we need precise division to pass GLCTS. 
*/ - if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && - ac_get_type_size(def_type) == 8) { - result = LLVMBuildFDiv(ctx->ac.builder, ctx->ac.f64_1, - ac_to_float(&ctx->ac, src[0]), ""); - } else { - result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.rcp", - ac_to_float_type(&ctx->ac, def_type), src[0]); - } - if (ctx->abi->clamp_div_by_zero) - result = ac_build_fmin(&ctx->ac, result, - LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX)); - break; - case nir_op_iand: - result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ior: - result = LLVMBuildOr(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ixor: - result = LLVMBuildXor(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ishl: - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - result = LLVMBuildShl(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ishr: - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - result = LLVMBuildAShr(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ushr: - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) - src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], - LLVMTypeOf(src[0]), ""); - result = LLVMBuildLShr(ctx->ac.builder, src[0], src[1], ""); - break; - case nir_op_ilt32: - result = emit_int_cmp(&ctx->ac, LLVMIntSLT, src[0], src[1]); - break; - case nir_op_ine32: - result = emit_int_cmp(&ctx->ac, LLVMIntNE, src[0], src[1]); - break; - case nir_op_ieq32: - result = emit_int_cmp(&ctx->ac, LLVMIntEQ, src[0], src[1]); - break; - case nir_op_ige32: - result = emit_int_cmp(&ctx->ac, LLVMIntSGE, src[0], src[1]); - break; - case nir_op_ult32: - result = emit_int_cmp(&ctx->ac, LLVMIntULT, src[0], src[1]); - break; - case nir_op_uge32: - result = emit_int_cmp(&ctx->ac, LLVMIntUGE, src[0], src[1]); - break; - case nir_op_feq32: - result = emit_float_cmp(&ctx->ac, LLVMRealOEQ, src[0], src[1]); - break; - case nir_op_fneu32: - result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]); - break; - case nir_op_flt32: - result = emit_float_cmp(&ctx->ac, LLVMRealOLT, src[0], src[1]); - break; - case nir_op_fge32: - result = emit_float_cmp(&ctx->ac, LLVMRealOGE, src[0], src[1]); - break; - case nir_op_fabs: - result = emit_intrin_1f_param(&ctx->ac, "llvm.fabs", - ac_to_float_type(&ctx->ac, def_type), src[0]); - if (ctx->ac.float_mode == AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO) { - /* fabs will be optimized by backend compiler with sign - * bit removed via AND. 
- */ - result = ac_build_canonicalize(&ctx->ac, result, - instr->dest.dest.ssa.bit_size); - } - break; - case nir_op_iabs: - result = emit_iabs(&ctx->ac, src[0]); - break; - case nir_op_imax: - result = ac_build_imax(&ctx->ac, src[0], src[1]); - break; - case nir_op_imin: - result = ac_build_imin(&ctx->ac, src[0], src[1]); - break; - case nir_op_umax: - result = ac_build_umax(&ctx->ac, src[0], src[1]); - break; - case nir_op_umin: - result = ac_build_umin(&ctx->ac, src[0], src[1]); - break; - case nir_op_isign: - result = ac_build_isign(&ctx->ac, src[0]); - break; - case nir_op_fsign: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_fsign(&ctx->ac, src[0]); - break; - case nir_op_ffloor: - result = emit_intrin_1f_param(&ctx->ac, "llvm.floor", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_ftrunc: - result = emit_intrin_1f_param(&ctx->ac, "llvm.trunc", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_fceil: - result = emit_intrin_1f_param(&ctx->ac, "llvm.ceil", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_fround_even: - result = emit_intrin_1f_param(&ctx->ac, "llvm.rint", - ac_to_float_type(&ctx->ac, def_type),src[0]); - break; - case nir_op_ffract: - result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.fract", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_fsin: - result = emit_intrin_1f_param(&ctx->ac, "llvm.sin", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_fcos: - result = emit_intrin_1f_param(&ctx->ac, "llvm.cos", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_fsqrt: - result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_fexp2: - result = emit_intrin_1f_param(&ctx->ac, "llvm.exp2", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_flog2: - result = emit_intrin_1f_param(&ctx->ac, "llvm.log2", - ac_to_float_type(&ctx->ac, def_type), src[0]); - break; - case nir_op_frsq: - result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.rsq", - ac_to_float_type(&ctx->ac, def_type), src[0]); - if (ctx->abi->clamp_div_by_zero) - result = ac_build_fmin(&ctx->ac, result, - LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX)); - break; - case nir_op_frexp_exp: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_frexp_exp(&ctx->ac, src[0], - ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))); - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) == 16) - result = LLVMBuildSExt(ctx->ac.builder, result, - ctx->ac.i32, ""); - break; - case nir_op_frexp_sig: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = ac_build_frexp_mant(&ctx->ac, src[0], - instr->dest.dest.ssa.bit_size); - break; - case nir_op_fpow: - result = emit_intrin_2f_param(&ctx->ac, "llvm.pow", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - break; - case nir_op_fmax: - result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - if (ctx->ac.chip_class < GFX9 && - instr->dest.dest.ssa.bit_size == 32) { - /* Only pre-GFX9 chips do not flush denorms. 
*/ - result = ac_build_canonicalize(&ctx->ac, result, - instr->dest.dest.ssa.bit_size); - } - break; - case nir_op_fmin: - result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1]); - if (ctx->ac.chip_class < GFX9 && - instr->dest.dest.ssa.bit_size == 32) { - /* Only pre-GFX9 chips do not flush denorms. */ - result = ac_build_canonicalize(&ctx->ac, result, - instr->dest.dest.ssa.bit_size); - } - break; - case nir_op_ffma: - /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */ - result = emit_intrin_3f_param(&ctx->ac, ctx->ac.chip_class >= GFX10 ? "llvm.fma" : "llvm.fmuladd", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]); - break; - case nir_op_ldexp: - src[0] = ac_to_float(&ctx->ac, src[0]); - if (ac_get_elem_bits(&ctx->ac, def_type) == 32) - result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f32", ctx->ac.f32, src, 2, AC_FUNC_ATTR_READNONE); - else if (ac_get_elem_bits(&ctx->ac, def_type) == 16) - result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f16", ctx->ac.f16, src, 2, AC_FUNC_ATTR_READNONE); - else - result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f64", ctx->ac.f64, src, 2, AC_FUNC_ATTR_READNONE); - break; - case nir_op_bfm: - result = emit_bfm(&ctx->ac, src[0], src[1]); - break; - case nir_op_bitfield_select: - result = emit_bitfield_select(&ctx->ac, src[0], src[1], src[2]); - break; - case nir_op_ubfe: - result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], false); - break; - case nir_op_ibfe: - result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], true); - break; - case nir_op_bitfield_reverse: - result = ac_build_bitfield_reverse(&ctx->ac, src[0]); - break; - case nir_op_bit_count: - result = ac_build_bit_count(&ctx->ac, src[0]); - break; - case nir_op_vec2: - case nir_op_vec3: - case nir_op_vec4: - for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - src[i] = ac_to_integer(&ctx->ac, src[i]); - result = ac_build_gather_values(&ctx->ac, src, num_components); - break; - case nir_op_f2i8: - case nir_op_f2i16: - case nir_op_f2i32: - case nir_op_f2i64: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, ""); - break; - case nir_op_f2u8: - case nir_op_f2u16: - case nir_op_f2u32: - case nir_op_f2u64: - src[0] = ac_to_float(&ctx->ac, src[0]); - result = LLVMBuildFPToUI(ctx->ac.builder, src[0], def_type, ""); - break; - case nir_op_i2f16: - case nir_op_i2f32: - case nir_op_i2f64: - result = LLVMBuildSIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); - break; - case nir_op_u2f16: - case nir_op_u2f32: - case nir_op_u2f64: - result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); - break; - case nir_op_f2f16_rtz: - case nir_op_f2f16: - case nir_op_f2fmp: - src[0] = ac_to_float(&ctx->ac, src[0]); - - /* For OpenGL, we want fast packing with v_cvt_pkrtz_f16, but if we use it, - * all f32->f16 conversions have to round towards zero, because both scalar - * and vec2 down-conversions have to round equally. - */ - if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL || - instr->op == nir_op_f2f16_rtz) { - src[0] = ac_to_float(&ctx->ac, src[0]); - - if (LLVMTypeOf(src[0]) == ctx->ac.f64) - src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); - - /* Fast path conversion. This only works if NIR is vectorized - * to vec2 16. 
- */ - if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) { - LLVMValueRef args[] = { - ac_llvm_extract_elem(&ctx->ac, src[0], 0), - ac_llvm_extract_elem(&ctx->ac, src[0], 1), - }; - result = ac_build_cvt_pkrtz_f16(&ctx->ac, args); - break; - } - - assert(ac_get_llvm_num_components(src[0]) == 1); - LLVMValueRef param[2] = { src[0], LLVMGetUndef(ctx->ac.f32) }; - result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); - result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); - } else { - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) - result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); - else - result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); - } - break; - case nir_op_f2f16_rtne: - case nir_op_f2f32: - case nir_op_f2f64: - src[0] = ac_to_float(&ctx->ac, src[0]); - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) - result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); - else - result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); - break; - case nir_op_u2u8: - case nir_op_u2u16: - case nir_op_u2ump: - case nir_op_u2u32: - case nir_op_u2u64: - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) - result = LLVMBuildZExt(ctx->ac.builder, src[0], def_type, ""); - else - result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); - break; - case nir_op_i2i8: - case nir_op_i2i16: - case nir_op_i2imp: - case nir_op_i2i32: - case nir_op_i2i64: - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) - result = LLVMBuildSExt(ctx->ac.builder, src[0], def_type, ""); - else - result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); - break; - case nir_op_b32csel: - result = emit_bcsel(&ctx->ac, src[0], src[1], src[2]); - break; - case nir_op_find_lsb: - result = ac_find_lsb(&ctx->ac, ctx->ac.i32, src[0]); - break; - case nir_op_ufind_msb: - result = ac_build_umsb(&ctx->ac, src[0], ctx->ac.i32); - break; - case nir_op_ifind_msb: - result = ac_build_imsb(&ctx->ac, src[0], ctx->ac.i32); - break; - case nir_op_uadd_carry: - result = emit_uint_carry(&ctx->ac, "llvm.uadd.with.overflow.i32", src[0], src[1]); - break; - case nir_op_usub_borrow: - result = emit_uint_carry(&ctx->ac, "llvm.usub.with.overflow.i32", src[0], src[1]); - break; - case nir_op_b2f16: - case nir_op_b2f32: - case nir_op_b2f64: - result = emit_b2f(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); - break; - case nir_op_f2b32: - result = emit_f2b(&ctx->ac, src[0]); - break; - case nir_op_b2i8: - case nir_op_b2i16: - case nir_op_b2i32: - case nir_op_b2i64: - result = emit_b2i(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); - break; - case nir_op_i2b32: - result = emit_i2b(&ctx->ac, src[0]); - break; - case nir_op_fquantize2f16: - result = emit_f2f16(&ctx->ac, src[0]); - break; - case nir_op_umul_high: - result = emit_umul_high(&ctx->ac, src[0], src[1]); - break; - case nir_op_imul_high: - result = emit_imul_high(&ctx->ac, src[0], src[1]); - break; - case nir_op_pack_half_2x16: - result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pkrtz_f16); - break; - case nir_op_pack_snorm_2x16: - result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_i16); - break; - case nir_op_pack_unorm_2x16: - result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_u16); - break; - case nir_op_unpack_half_2x16: - 
result = emit_unpack_half_2x16(&ctx->ac, src[0]); - break; - case nir_op_fddx: - case nir_op_fddy: - case nir_op_fddx_fine: - case nir_op_fddy_fine: - case nir_op_fddx_coarse: - case nir_op_fddy_coarse: - result = emit_ddxy(ctx, instr->op, src[0]); - break; - - case nir_op_unpack_64_2x32_split_x: { - assert(ac_get_llvm_num_components(src[0]) == 1); - LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], - ctx->ac.v2i32, - ""); - result = LLVMBuildExtractElement(ctx->ac.builder, tmp, - ctx->ac.i32_0, ""); - break; - } - - case nir_op_unpack_64_2x32_split_y: { - assert(ac_get_llvm_num_components(src[0]) == 1); - LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], - ctx->ac.v2i32, - ""); - result = LLVMBuildExtractElement(ctx->ac.builder, tmp, - ctx->ac.i32_1, ""); - break; - } - - case nir_op_pack_64_2x32_split: { - LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); - result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i64, ""); - break; - } - - case nir_op_pack_32_2x16_split: { - LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); - result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, ""); - break; - } - - case nir_op_unpack_32_2x16_split_x: { - LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], - ctx->ac.v2i16, - ""); - result = LLVMBuildExtractElement(ctx->ac.builder, tmp, - ctx->ac.i32_0, ""); - break; - } - - case nir_op_unpack_32_2x16_split_y: { - LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], - ctx->ac.v2i16, - ""); - result = LLVMBuildExtractElement(ctx->ac.builder, tmp, - ctx->ac.i32_1, ""); - break; - } - - case nir_op_cube_face_coord: { - src[0] = ac_to_float(&ctx->ac, src[0]); - LLVMValueRef results[2]; - LLVMValueRef in[3]; - for (unsigned chan = 0; chan < 3; chan++) - in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); - results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", - ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); - results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", - ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); - LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", - ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); - results[0] = ac_build_fdiv(&ctx->ac, results[0], ma); - results[1] = ac_build_fdiv(&ctx->ac, results[1], ma); - LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5); - results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, ""); - results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, ""); - result = ac_build_gather_values(&ctx->ac, results, 2); - break; - } - - case nir_op_cube_face_index: { - src[0] = ac_to_float(&ctx->ac, src[0]); - LLVMValueRef in[3]; - for (unsigned chan = 0; chan < 3; chan++) - in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); - result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubeid", - ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); - break; - } - - default: - fprintf(stderr, "Unknown NIR alu instr: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - abort(); - } - - if (result) { - assert(instr->dest.dest.is_ssa); - result = ac_to_integer_or_pointer(&ctx->ac, result); - ctx->ssa_defs[instr->dest.dest.ssa.index] = result; - } + LLVMValueRef src[4], result = NULL; + unsigned num_components = instr->dest.dest.ssa.num_components; + unsigned src_components; + LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa); + + assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src)); + switch (instr->op) { + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + src_components = 
1; + break; + case nir_op_pack_half_2x16: + case nir_op_pack_snorm_2x16: + case nir_op_pack_unorm_2x16: + src_components = 2; + break; + case nir_op_unpack_half_2x16: + src_components = 1; + break; + case nir_op_cube_face_coord: + case nir_op_cube_face_index: + src_components = 3; + break; + default: + src_components = num_components; + break; + } + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + src[i] = get_alu_src(ctx, instr->src[i], src_components); + + switch (instr->op) { + case nir_op_mov: + result = src[0]; + break; + case nir_op_fneg: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = LLVMBuildFNeg(ctx->ac.builder, src[0], ""); + if (ctx->ac.float_mode == AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO) { + /* fneg will be optimized by backend compiler with sign + * bit removed via XOR. This is probably a LLVM bug. + */ + result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size); + } + break; + case nir_op_ineg: + result = LLVMBuildNeg(ctx->ac.builder, src[0], ""); + break; + case nir_op_inot: + result = LLVMBuildNot(ctx->ac.builder, src[0], ""); + break; + case nir_op_iadd: + result = LLVMBuildAdd(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_fadd: + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + result = LLVMBuildFAdd(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_fsub: + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + result = LLVMBuildFSub(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_isub: + result = LLVMBuildSub(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_imul: + result = LLVMBuildMul(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_imod: + result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_umod: + result = LLVMBuildURem(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_irem: + result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_idiv: + result = LLVMBuildSDiv(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_udiv: + result = LLVMBuildUDiv(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_fmul: + src[0] = ac_to_float(&ctx->ac, src[0]); + src[1] = ac_to_float(&ctx->ac, src[1]); + result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_frcp: + /* For doubles, we need precise division to pass GLCTS. 
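+ * (llvm.amdgcn.rcp maps to the hardware's approximate reciprocal; an
+ * explicit 1.0 / x fdiv instead lets LLVM expand to a correctly rounded
+ * f64 division sequence.)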
*/ + if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && ac_get_type_size(def_type) == 8) { + result = LLVMBuildFDiv(ctx->ac.builder, ctx->ac.f64_1, ac_to_float(&ctx->ac, src[0]), ""); + } else { + result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.rcp", + ac_to_float_type(&ctx->ac, def_type), src[0]); + } + if (ctx->abi->clamp_div_by_zero) + result = ac_build_fmin(&ctx->ac, result, + LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX)); + break; + case nir_op_iand: + result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ior: + result = LLVMBuildOr(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ixor: + result = LLVMBuildXor(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ishl: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < + ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), ""); + else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > + ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), ""); + result = LLVMBuildShl(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ishr: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < + ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), ""); + else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > + ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), ""); + result = LLVMBuildAShr(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ushr: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) < + ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), ""); + else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) > + ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))) + src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), ""); + result = LLVMBuildLShr(ctx->ac.builder, src[0], src[1], ""); + break; + case nir_op_ilt32: + result = emit_int_cmp(&ctx->ac, LLVMIntSLT, src[0], src[1]); + break; + case nir_op_ine32: + result = emit_int_cmp(&ctx->ac, LLVMIntNE, src[0], src[1]); + break; + case nir_op_ieq32: + result = emit_int_cmp(&ctx->ac, LLVMIntEQ, src[0], src[1]); + break; + case nir_op_ige32: + result = emit_int_cmp(&ctx->ac, LLVMIntSGE, src[0], src[1]); + break; + case nir_op_ult32: + result = emit_int_cmp(&ctx->ac, LLVMIntULT, src[0], src[1]); + break; + case nir_op_uge32: + result = emit_int_cmp(&ctx->ac, LLVMIntUGE, src[0], src[1]); + break; + case nir_op_feq32: + result = emit_float_cmp(&ctx->ac, LLVMRealOEQ, src[0], src[1]); + break; + case nir_op_fneu32: + result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]); + break; + case nir_op_flt32: + result = emit_float_cmp(&ctx->ac, LLVMRealOLT, src[0], src[1]); + break; + case nir_op_fge32: + result = emit_float_cmp(&ctx->ac, LLVMRealOGE, src[0], src[1]); + break; + case nir_op_fabs: + result = + emit_intrin_1f_param(&ctx->ac, "llvm.fabs", ac_to_float_type(&ctx->ac, def_type), src[0]); + if (ctx->ac.float_mode == AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO) { + /* fabs will be optimized by backend compiler with sign + * bit removed via AND. 
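+ * (Clearing the sign bit does not flush denormals by itself, hence the
+ * explicit ac_build_canonicalize below when flush-to-zero is expected.)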
+ */ + result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size); + } + break; + case nir_op_iabs: + result = emit_iabs(&ctx->ac, src[0]); + break; + case nir_op_imax: + result = ac_build_imax(&ctx->ac, src[0], src[1]); + break; + case nir_op_imin: + result = ac_build_imin(&ctx->ac, src[0], src[1]); + break; + case nir_op_umax: + result = ac_build_umax(&ctx->ac, src[0], src[1]); + break; + case nir_op_umin: + result = ac_build_umin(&ctx->ac, src[0], src[1]); + break; + case nir_op_isign: + result = ac_build_isign(&ctx->ac, src[0]); + break; + case nir_op_fsign: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_fsign(&ctx->ac, src[0]); + break; + case nir_op_ffloor: + result = + emit_intrin_1f_param(&ctx->ac, "llvm.floor", ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_ftrunc: + result = + emit_intrin_1f_param(&ctx->ac, "llvm.trunc", ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fceil: + result = + emit_intrin_1f_param(&ctx->ac, "llvm.ceil", ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fround_even: + result = + emit_intrin_1f_param(&ctx->ac, "llvm.rint", ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_ffract: + result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.fract", + ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fsin: + result = + emit_intrin_1f_param(&ctx->ac, "llvm.sin", ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fcos: + result = + emit_intrin_1f_param(&ctx->ac, "llvm.cos", ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fsqrt: + result = + emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_fexp2: + result = + emit_intrin_1f_param(&ctx->ac, "llvm.exp2", ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_flog2: + result = + emit_intrin_1f_param(&ctx->ac, "llvm.log2", ac_to_float_type(&ctx->ac, def_type), src[0]); + break; + case nir_op_frsq: + result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.rsq", + ac_to_float_type(&ctx->ac, def_type), src[0]); + if (ctx->abi->clamp_div_by_zero) + result = ac_build_fmin(&ctx->ac, result, + LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX)); + break; + case nir_op_frexp_exp: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_frexp_exp(&ctx->ac, src[0], ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0]))); + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) == 16) + result = LLVMBuildSExt(ctx->ac.builder, result, ctx->ac.i32, ""); + break; + case nir_op_frexp_sig: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = ac_build_frexp_mant(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); + break; + case nir_op_fpow: + result = emit_intrin_2f_param(&ctx->ac, "llvm.pow", ac_to_float_type(&ctx->ac, def_type), + src[0], src[1]); + break; + case nir_op_fmax: + result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", ac_to_float_type(&ctx->ac, def_type), + src[0], src[1]); + if (ctx->ac.chip_class < GFX9 && instr->dest.dest.ssa.bit_size == 32) { + /* Only pre-GFX9 chips do not flush denorms. */ + result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size); + } + break; + case nir_op_fmin: + result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", ac_to_float_type(&ctx->ac, def_type), + src[0], src[1]); + if (ctx->ac.chip_class < GFX9 && instr->dest.dest.ssa.bit_size == 32) { + /* Only pre-GFX9 chips do not flush denorms. 
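+ * (So the canonicalize fixup is only needed for 32-bit fmin/fmax results
+ * on GFX8 and older; newer chips already flush these denorms in hardware.)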
*/ + result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size); + } + break; + case nir_op_ffma: + /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */ + result = + emit_intrin_3f_param(&ctx->ac, ctx->ac.chip_class >= GFX10 ? "llvm.fma" : "llvm.fmuladd", + ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]); + break; + case nir_op_ldexp: + src[0] = ac_to_float(&ctx->ac, src[0]); + if (ac_get_elem_bits(&ctx->ac, def_type) == 32) + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f32", ctx->ac.f32, src, 2, + AC_FUNC_ATTR_READNONE); + else if (ac_get_elem_bits(&ctx->ac, def_type) == 16) + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f16", ctx->ac.f16, src, 2, + AC_FUNC_ATTR_READNONE); + else + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f64", ctx->ac.f64, src, 2, + AC_FUNC_ATTR_READNONE); + break; + case nir_op_bfm: + result = emit_bfm(&ctx->ac, src[0], src[1]); + break; + case nir_op_bitfield_select: + result = emit_bitfield_select(&ctx->ac, src[0], src[1], src[2]); + break; + case nir_op_ubfe: + result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], false); + break; + case nir_op_ibfe: + result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], true); + break; + case nir_op_bitfield_reverse: + result = ac_build_bitfield_reverse(&ctx->ac, src[0]); + break; + case nir_op_bit_count: + result = ac_build_bit_count(&ctx->ac, src[0]); + break; + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + src[i] = ac_to_integer(&ctx->ac, src[i]); + result = ac_build_gather_values(&ctx->ac, src, num_components); + break; + case nir_op_f2i8: + case nir_op_f2i16: + case nir_op_f2i32: + case nir_op_f2i64: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_f2u8: + case nir_op_f2u16: + case nir_op_f2u32: + case nir_op_f2u64: + src[0] = ac_to_float(&ctx->ac, src[0]); + result = LLVMBuildFPToUI(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_i2f16: + case nir_op_i2f32: + case nir_op_i2f64: + result = LLVMBuildSIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + break; + case nir_op_u2f16: + case nir_op_u2f32: + case nir_op_u2f64: + result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + break; + case nir_op_f2f16_rtz: + case nir_op_f2f16: + case nir_op_f2fmp: + src[0] = ac_to_float(&ctx->ac, src[0]); + + /* For OpenGL, we want fast packing with v_cvt_pkrtz_f16, but if we use it, + * all f32->f16 conversions have to round towards zero, because both scalar + * and vec2 down-conversions have to round equally. + */ + if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL || instr->op == nir_op_f2f16_rtz) { + src[0] = ac_to_float(&ctx->ac, src[0]); + + if (LLVMTypeOf(src[0]) == ctx->ac.f64) + src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); + + /* Fast path conversion. This only works if NIR is vectorized + * to vec2 16. 
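+ * (e.g. a v2f32 source becomes a single packed v_cvt_pkrtz_f16_f32 via
+ * ac_build_cvt_pkrtz_f16 instead of two scalar converts plus a pack.)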
+ */ + if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) { + LLVMValueRef args[] = { + ac_llvm_extract_elem(&ctx->ac, src[0], 0), + ac_llvm_extract_elem(&ctx->ac, src[0], 1), + }; + result = ac_build_cvt_pkrtz_f16(&ctx->ac, args); + break; + } + + assert(ac_get_llvm_num_components(src[0]) == 1); + LLVMValueRef param[2] = {src[0], LLVMGetUndef(ctx->ac.f32)}; + result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); + result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + } else { + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) + result = + LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + else + result = + LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + } + break; + case nir_op_f2f16_rtne: + case nir_op_f2f32: + case nir_op_f2f64: + src[0] = ac_to_float(&ctx->ac, src[0]); + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) + result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + else + result = + LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + break; + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2ump: + case nir_op_u2u32: + case nir_op_u2u64: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) + result = LLVMBuildZExt(ctx->ac.builder, src[0], def_type, ""); + else + result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2imp: + case nir_op_i2i32: + case nir_op_i2i64: + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) + result = LLVMBuildSExt(ctx->ac.builder, src[0], def_type, ""); + else + result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, ""); + break; + case nir_op_b32csel: + result = emit_bcsel(&ctx->ac, src[0], src[1], src[2]); + break; + case nir_op_find_lsb: + result = ac_find_lsb(&ctx->ac, ctx->ac.i32, src[0]); + break; + case nir_op_ufind_msb: + result = ac_build_umsb(&ctx->ac, src[0], ctx->ac.i32); + break; + case nir_op_ifind_msb: + result = ac_build_imsb(&ctx->ac, src[0], ctx->ac.i32); + break; + case nir_op_uadd_carry: + result = emit_uint_carry(&ctx->ac, "llvm.uadd.with.overflow.i32", src[0], src[1]); + break; + case nir_op_usub_borrow: + result = emit_uint_carry(&ctx->ac, "llvm.usub.with.overflow.i32", src[0], src[1]); + break; + case nir_op_b2f16: + case nir_op_b2f32: + case nir_op_b2f64: + result = emit_b2f(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); + break; + case nir_op_f2b32: + result = emit_f2b(&ctx->ac, src[0]); + break; + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + case nir_op_b2i64: + result = emit_b2i(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); + break; + case nir_op_i2b32: + result = emit_i2b(&ctx->ac, src[0]); + break; + case nir_op_fquantize2f16: + result = emit_f2f16(&ctx->ac, src[0]); + break; + case nir_op_umul_high: + result = emit_umul_high(&ctx->ac, src[0], src[1]); + break; + case nir_op_imul_high: + result = emit_imul_high(&ctx->ac, src[0], src[1]); + break; + case nir_op_pack_half_2x16: + result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pkrtz_f16); + break; + case nir_op_pack_snorm_2x16: + result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_i16); + break; + case nir_op_pack_unorm_2x16: + result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_u16); + break; + case nir_op_unpack_half_2x16: 
+ result = emit_unpack_half_2x16(&ctx->ac, src[0]); + break; + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + result = emit_ddxy(ctx, instr->op, src[0]); + break; + + case nir_op_unpack_64_2x32_split_x: { + assert(ac_get_llvm_num_components(src[0]) == 1); + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i32, ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_0, ""); + break; + } + + case nir_op_unpack_64_2x32_split_y: { + assert(ac_get_llvm_num_components(src[0]) == 1); + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i32, ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_1, ""); + break; + } + + case nir_op_pack_64_2x32_split: { + LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); + result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i64, ""); + break; + } + + case nir_op_pack_32_2x16_split: { + LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2); + result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + + case nir_op_unpack_32_2x16_split_x: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i16, ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_0, ""); + break; + } + + case nir_op_unpack_32_2x16_split_y: { + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i16, ""); + result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_1, ""); + break; + } + + case nir_op_cube_face_coord: { + src[0] = ac_to_float(&ctx->ac, src[0]); + LLVMValueRef results[2]; + LLVMValueRef in[3]; + for (unsigned chan = 0; chan < 3; chan++) + in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); + results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", ctx->ac.f32, in, 3, + AC_FUNC_ATTR_READNONE); + results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", ctx->ac.f32, in, 3, + AC_FUNC_ATTR_READNONE); + LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", ctx->ac.f32, in, 3, + AC_FUNC_ATTR_READNONE); + results[0] = ac_build_fdiv(&ctx->ac, results[0], ma); + results[1] = ac_build_fdiv(&ctx->ac, results[1], ma); + LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5); + results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, ""); + results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, ""); + result = ac_build_gather_values(&ctx->ac, results, 2); + break; + } + + case nir_op_cube_face_index: { + src[0] = ac_to_float(&ctx->ac, src[0]); + LLVMValueRef in[3]; + for (unsigned chan = 0; chan < 3; chan++) + in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubeid", ctx->ac.f32, in, 3, + AC_FUNC_ATTR_READNONE); + break; + } + + default: + fprintf(stderr, "Unknown NIR alu instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + + if (result) { + assert(instr->dest.dest.is_ssa); + result = ac_to_integer_or_pointer(&ctx->ac, result); + ctx->ssa_defs[instr->dest.dest.ssa.index] = result; + } } -static void visit_load_const(struct ac_nir_context *ctx, - const nir_load_const_instr *instr) +static void visit_load_const(struct ac_nir_context *ctx, const nir_load_const_instr *instr) { - LLVMValueRef values[4], value = NULL; - LLVMTypeRef element_type = - LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size); - - for (unsigned i = 0; i < instr->def.num_components; ++i) { - 
switch (instr->def.bit_size) { - case 8: - values[i] = LLVMConstInt(element_type, - instr->value[i].u8, false); - break; - case 16: - values[i] = LLVMConstInt(element_type, - instr->value[i].u16, false); - break; - case 32: - values[i] = LLVMConstInt(element_type, - instr->value[i].u32, false); - break; - case 64: - values[i] = LLVMConstInt(element_type, - instr->value[i].u64, false); - break; - default: - fprintf(stderr, - "unsupported nir load_const bit_size: %d\n", - instr->def.bit_size); - abort(); - } - } - if (instr->def.num_components > 1) { - value = LLVMConstVector(values, instr->def.num_components); - } else - value = values[0]; - - ctx->ssa_defs[instr->def.index] = value; + LLVMValueRef values[4], value = NULL; + LLVMTypeRef element_type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size); + + for (unsigned i = 0; i < instr->def.num_components; ++i) { + switch (instr->def.bit_size) { + case 8: + values[i] = LLVMConstInt(element_type, instr->value[i].u8, false); + break; + case 16: + values[i] = LLVMConstInt(element_type, instr->value[i].u16, false); + break; + case 32: + values[i] = LLVMConstInt(element_type, instr->value[i].u32, false); + break; + case 64: + values[i] = LLVMConstInt(element_type, instr->value[i].u64, false); + break; + default: + fprintf(stderr, "unsupported nir load_const bit_size: %d\n", instr->def.bit_size); + abort(); + } + } + if (instr->def.num_components > 1) { + value = LLVMConstVector(values, instr->def.num_components); + } else + value = values[0]; + + ctx->ssa_defs[instr->def.index] = value; } -static LLVMValueRef -get_buffer_size(struct ac_nir_context *ctx, LLVMValueRef descriptor, bool in_elements) +static LLVMValueRef get_buffer_size(struct ac_nir_context *ctx, LLVMValueRef descriptor, + bool in_elements) { - LLVMValueRef size = - LLVMBuildExtractElement(ctx->ac.builder, descriptor, - LLVMConstInt(ctx->ac.i32, 2, false), ""); - - /* GFX8 only */ - if (ctx->ac.chip_class == GFX8 && in_elements) { - /* On GFX8, the descriptor contains the size in bytes, - * but TXQ must return the size in elements. - * The stride is always non-zero for resources using TXQ. - */ - LLVMValueRef stride = - LLVMBuildExtractElement(ctx->ac.builder, descriptor, - ctx->ac.i32_1, ""); - stride = LLVMBuildLShr(ctx->ac.builder, stride, - LLVMConstInt(ctx->ac.i32, 16, false), ""); - stride = LLVMBuildAnd(ctx->ac.builder, stride, - LLVMConstInt(ctx->ac.i32, 0x3fff, false), ""); - - size = LLVMBuildUDiv(ctx->ac.builder, size, stride, ""); - } - return size; + LLVMValueRef size = + LLVMBuildExtractElement(ctx->ac.builder, descriptor, LLVMConstInt(ctx->ac.i32, 2, false), ""); + + /* GFX8 only */ + if (ctx->ac.chip_class == GFX8 && in_elements) { + /* On GFX8, the descriptor contains the size in bytes, + * but TXQ must return the size in elements. + * The stride is always non-zero for resources using TXQ. + */ + LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, descriptor, ctx->ac.i32_1, ""); + stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, false), ""); + stride = LLVMBuildAnd(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 0x3fff, false), ""); + + size = LLVMBuildUDiv(ctx->ac.builder, size, stride, ""); + } + return size; } /* Gather4 should follow the same rules as bilinear filtering, but the hardware @@ -1287,3214 +1205,2970 @@ get_buffer_size(struct ac_nir_context *ctx, LLVMValueRef descriptor, bool in_ele * runtime. 
In this case, return an i1 value that indicates whether the * descriptor was overridden (and hence a fixup of the sampler result is needed). */ -static LLVMValueRef lower_gather4_integer(struct ac_llvm_context *ctx, - nir_variable *var, - struct ac_image_args *args, - const nir_tex_instr *instr) +static LLVMValueRef lower_gather4_integer(struct ac_llvm_context *ctx, nir_variable *var, + struct ac_image_args *args, const nir_tex_instr *instr) { - const struct glsl_type *type = glsl_without_array(var->type); - enum glsl_base_type stype = glsl_get_sampler_result_type(type); - LLVMValueRef wa_8888 = NULL; - LLVMValueRef half_texel[2]; - LLVMValueRef result; - - assert(stype == GLSL_TYPE_INT || stype == GLSL_TYPE_UINT); - - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - LLVMValueRef formats; - LLVMValueRef data_format; - LLVMValueRef wa_formats; - - formats = LLVMBuildExtractElement(ctx->builder, args->resource, ctx->i32_1, ""); - - data_format = LLVMBuildLShr(ctx->builder, formats, - LLVMConstInt(ctx->i32, 20, false), ""); - data_format = LLVMBuildAnd(ctx->builder, data_format, - LLVMConstInt(ctx->i32, (1u << 6) - 1, false), ""); - wa_8888 = LLVMBuildICmp( - ctx->builder, LLVMIntEQ, data_format, - LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false), - ""); - - uint32_t wa_num_format = - stype == GLSL_TYPE_UINT ? - S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_USCALED) : - S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_SSCALED); - wa_formats = LLVMBuildAnd(ctx->builder, formats, - LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false), - ""); - wa_formats = LLVMBuildOr(ctx->builder, wa_formats, - LLVMConstInt(ctx->i32, wa_num_format, false), ""); - - formats = LLVMBuildSelect(ctx->builder, wa_8888, wa_formats, formats, ""); - args->resource = LLVMBuildInsertElement( - ctx->builder, args->resource, formats, ctx->i32_1, ""); - } - - if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { - assert(!wa_8888); - half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5); - } else { - struct ac_image_args resinfo = {}; - LLVMBasicBlockRef bbs[2]; - - LLVMValueRef unnorm = NULL; - LLVMValueRef default_offset = ctx->f32_0; - if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && - !instr->is_array) { - /* In vulkan, whether the sampler uses unnormalized - * coordinates or not is a dynamic property of the - * sampler. Hence, to figure out whether or not we - * need to divide by the texture size, we need to test - * the sampler at runtime. This tests the bit set by - * radv_init_sampler(). - */ - LLVMValueRef sampler0 = - LLVMBuildExtractElement(ctx->builder, args->sampler, ctx->i32_0, ""); - sampler0 = LLVMBuildLShr(ctx->builder, sampler0, - LLVMConstInt(ctx->i32, 15, false), ""); - sampler0 = LLVMBuildAnd(ctx->builder, sampler0, ctx->i32_1, ""); - unnorm = LLVMBuildICmp(ctx->builder, LLVMIntEQ, sampler0, ctx->i32_1, ""); - default_offset = LLVMConstReal(ctx->f32, -0.5); - } - - bbs[0] = LLVMGetInsertBlock(ctx->builder); - if (wa_8888 || unnorm) { - assert(!(wa_8888 && unnorm)); - LLVMValueRef not_needed = wa_8888 ? wa_8888 : unnorm; - /* Skip the texture size query entirely if we don't need it. */ - ac_build_ifcc(ctx, LLVMBuildNot(ctx->builder, not_needed, ""), 2000); - bbs[1] = LLVMGetInsertBlock(ctx->builder); - } - - /* Query the texture size. 
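 *
 * (Illustrative sketch, not part of this commit: per axis the code below
 * reduces to a half-texel bias,
 *
 *    float half_texel(unsigned size)   /* size = width or height in texels */
 *    {
 *       return -0.5f / (float)size;    /* later added to the coordinate */
 *    }
 *
 * computed from a resinfo query when the size is not known statically.)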
*/ - resinfo.dim = ac_get_sampler_dim(ctx->chip_class, instr->sampler_dim, instr->is_array); - resinfo.opcode = ac_image_get_resinfo; - resinfo.dmask = 0xf; - resinfo.lod = ctx->i32_0; - resinfo.resource = args->resource; - resinfo.attributes = AC_FUNC_ATTR_READNONE; - LLVMValueRef size = ac_build_image_opcode(ctx, &resinfo); - - /* Compute -0.5 / size. */ - for (unsigned c = 0; c < 2; c++) { - half_texel[c] = - LLVMBuildExtractElement(ctx->builder, size, - LLVMConstInt(ctx->i32, c, 0), ""); - half_texel[c] = LLVMBuildUIToFP(ctx->builder, half_texel[c], ctx->f32, ""); - half_texel[c] = ac_build_fdiv(ctx, ctx->f32_1, half_texel[c]); - half_texel[c] = LLVMBuildFMul(ctx->builder, half_texel[c], - LLVMConstReal(ctx->f32, -0.5), ""); - } - - if (wa_8888 || unnorm) { - ac_build_endif(ctx, 2000); - - for (unsigned c = 0; c < 2; c++) { - LLVMValueRef values[2] = { default_offset, half_texel[c] }; - half_texel[c] = ac_build_phi(ctx, ctx->f32, 2, - values, bbs); - } - } - } - - for (unsigned c = 0; c < 2; c++) { - LLVMValueRef tmp; - tmp = LLVMBuildBitCast(ctx->builder, args->coords[c], ctx->f32, ""); - args->coords[c] = LLVMBuildFAdd(ctx->builder, tmp, half_texel[c], ""); - } - - args->attributes = AC_FUNC_ATTR_READNONE; - result = ac_build_image_opcode(ctx, args); - - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - LLVMValueRef tmp, tmp2; - - /* if the cube workaround is in place, f2i the result. */ - for (unsigned c = 0; c < 4; c++) { - tmp = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, c, false), ""); - if (stype == GLSL_TYPE_UINT) - tmp2 = LLVMBuildFPToUI(ctx->builder, tmp, ctx->i32, ""); - else - tmp2 = LLVMBuildFPToSI(ctx->builder, tmp, ctx->i32, ""); - tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, ""); - tmp2 = LLVMBuildBitCast(ctx->builder, tmp2, ctx->i32, ""); - tmp = LLVMBuildSelect(ctx->builder, wa_8888, tmp2, tmp, ""); - tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, ""); - result = LLVMBuildInsertElement(ctx->builder, result, tmp, LLVMConstInt(ctx->i32, c, false), ""); - } - } - return result; + const struct glsl_type *type = glsl_without_array(var->type); + enum glsl_base_type stype = glsl_get_sampler_result_type(type); + LLVMValueRef wa_8888 = NULL; + LLVMValueRef half_texel[2]; + LLVMValueRef result; + + assert(stype == GLSL_TYPE_INT || stype == GLSL_TYPE_UINT); + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + LLVMValueRef formats; + LLVMValueRef data_format; + LLVMValueRef wa_formats; + + formats = LLVMBuildExtractElement(ctx->builder, args->resource, ctx->i32_1, ""); + + data_format = LLVMBuildLShr(ctx->builder, formats, LLVMConstInt(ctx->i32, 20, false), ""); + data_format = + LLVMBuildAnd(ctx->builder, data_format, LLVMConstInt(ctx->i32, (1u << 6) - 1, false), ""); + wa_8888 = LLVMBuildICmp(ctx->builder, LLVMIntEQ, data_format, + LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false), ""); + + uint32_t wa_num_format = stype == GLSL_TYPE_UINT + ? 
S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_USCALED) + : S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_SSCALED); + wa_formats = LLVMBuildAnd(ctx->builder, formats, + LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false), ""); + wa_formats = + LLVMBuildOr(ctx->builder, wa_formats, LLVMConstInt(ctx->i32, wa_num_format, false), ""); + + formats = LLVMBuildSelect(ctx->builder, wa_8888, wa_formats, formats, ""); + args->resource = + LLVMBuildInsertElement(ctx->builder, args->resource, formats, ctx->i32_1, ""); + } + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { + assert(!wa_8888); + half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5); + } else { + struct ac_image_args resinfo = {}; + LLVMBasicBlockRef bbs[2]; + + LLVMValueRef unnorm = NULL; + LLVMValueRef default_offset = ctx->f32_0; + if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) { + /* In vulkan, whether the sampler uses unnormalized + * coordinates or not is a dynamic property of the + * sampler. Hence, to figure out whether or not we + * need to divide by the texture size, we need to test + * the sampler at runtime. This tests the bit set by + * radv_init_sampler(). + */ + LLVMValueRef sampler0 = + LLVMBuildExtractElement(ctx->builder, args->sampler, ctx->i32_0, ""); + sampler0 = LLVMBuildLShr(ctx->builder, sampler0, LLVMConstInt(ctx->i32, 15, false), ""); + sampler0 = LLVMBuildAnd(ctx->builder, sampler0, ctx->i32_1, ""); + unnorm = LLVMBuildICmp(ctx->builder, LLVMIntEQ, sampler0, ctx->i32_1, ""); + default_offset = LLVMConstReal(ctx->f32, -0.5); + } + + bbs[0] = LLVMGetInsertBlock(ctx->builder); + if (wa_8888 || unnorm) { + assert(!(wa_8888 && unnorm)); + LLVMValueRef not_needed = wa_8888 ? wa_8888 : unnorm; + /* Skip the texture size query entirely if we don't need it. */ + ac_build_ifcc(ctx, LLVMBuildNot(ctx->builder, not_needed, ""), 2000); + bbs[1] = LLVMGetInsertBlock(ctx->builder); + } + + /* Query the texture size. */ + resinfo.dim = ac_get_sampler_dim(ctx->chip_class, instr->sampler_dim, instr->is_array); + resinfo.opcode = ac_image_get_resinfo; + resinfo.dmask = 0xf; + resinfo.lod = ctx->i32_0; + resinfo.resource = args->resource; + resinfo.attributes = AC_FUNC_ATTR_READNONE; + LLVMValueRef size = ac_build_image_opcode(ctx, &resinfo); + + /* Compute -0.5 / size. */ + for (unsigned c = 0; c < 2; c++) { + half_texel[c] = + LLVMBuildExtractElement(ctx->builder, size, LLVMConstInt(ctx->i32, c, 0), ""); + half_texel[c] = LLVMBuildUIToFP(ctx->builder, half_texel[c], ctx->f32, ""); + half_texel[c] = ac_build_fdiv(ctx, ctx->f32_1, half_texel[c]); + half_texel[c] = + LLVMBuildFMul(ctx->builder, half_texel[c], LLVMConstReal(ctx->f32, -0.5), ""); + } + + if (wa_8888 || unnorm) { + ac_build_endif(ctx, 2000); + + for (unsigned c = 0; c < 2; c++) { + LLVMValueRef values[2] = {default_offset, half_texel[c]}; + half_texel[c] = ac_build_phi(ctx, ctx->f32, 2, values, bbs); + } + } + } + + for (unsigned c = 0; c < 2; c++) { + LLVMValueRef tmp; + tmp = LLVMBuildBitCast(ctx->builder, args->coords[c], ctx->f32, ""); + args->coords[c] = LLVMBuildFAdd(ctx->builder, tmp, half_texel[c], ""); + } + + args->attributes = AC_FUNC_ATTR_READNONE; + result = ac_build_image_opcode(ctx, args); + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + LLVMValueRef tmp, tmp2; + + /* if the cube workaround is in place, f2i the result. 
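 *
 * (Illustrative sketch, not part of this commit: per channel the fixup is
 * equivalent to
 *
 *    #include <stdint.h>
 *    #include <string.h>
 *    static uint32_t fixup(float v, int wa, int is_uint)
 *    {
 *       uint32_t bits, as_int = is_uint ? (uint32_t)v : (uint32_t)(int32_t)v;
 *       memcpy(&bits, &v, 4);          /* plain bitcast of the raw result */
 *       return wa ? as_int : bits;     /* f2i only when the workaround hit */
 *    }
 * )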
*/ + for (unsigned c = 0; c < 4; c++) { + tmp = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, c, false), ""); + if (stype == GLSL_TYPE_UINT) + tmp2 = LLVMBuildFPToUI(ctx->builder, tmp, ctx->i32, ""); + else + tmp2 = LLVMBuildFPToSI(ctx->builder, tmp, ctx->i32, ""); + tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, ""); + tmp2 = LLVMBuildBitCast(ctx->builder, tmp2, ctx->i32, ""); + tmp = LLVMBuildSelect(ctx->builder, wa_8888, tmp2, tmp, ""); + tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, ""); + result = + LLVMBuildInsertElement(ctx->builder, result, tmp, LLVMConstInt(ctx->i32, c, false), ""); + } + } + return result; } static nir_deref_instr *get_tex_texture_deref(const nir_tex_instr *instr) { - nir_deref_instr *texture_deref_instr = NULL; - - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_texture_deref: - texture_deref_instr = nir_src_as_deref(instr->src[i].src); - break; - default: - break; - } - } - return texture_deref_instr; + nir_deref_instr *texture_deref_instr = NULL; + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_texture_deref: + texture_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + default: + break; + } + } + return texture_deref_instr; } -static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, - const nir_tex_instr *instr, - struct ac_image_args *args) +static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, const nir_tex_instr *instr, + struct ac_image_args *args) { - if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { - unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); - - assert(instr->dest.is_ssa); - return ac_build_buffer_load_format(&ctx->ac, - args->resource, - args->coords[0], - ctx->ac.i32_0, - util_last_bit(mask), - 0, true, - instr->dest.ssa.bit_size == 16); - } - - args->opcode = ac_image_sample; - - switch (instr->op) { - case nir_texop_txf: - case nir_texop_txf_ms: - case nir_texop_samples_identical: - args->opcode = args->level_zero || - instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? - ac_image_load : ac_image_load_mip; - args->level_zero = false; - break; - case nir_texop_txs: - case nir_texop_query_levels: - args->opcode = ac_image_get_resinfo; - if (!args->lod) - args->lod = ctx->ac.i32_0; - args->level_zero = false; - break; - case nir_texop_tex: - if (ctx->stage != MESA_SHADER_FRAGMENT) { - assert(!args->lod); - args->level_zero = true; - } - break; - case nir_texop_tg4: - args->opcode = ac_image_gather4; - if (!args->lod && !args->bias) - args->level_zero = true; - break; - case nir_texop_lod: - args->opcode = ac_image_get_lod; - break; - case nir_texop_fragment_fetch: - case nir_texop_fragment_mask_fetch: - args->opcode = ac_image_load; - args->level_zero = false; - break; - default: - break; - } - - if (instr->op == nir_texop_tg4 && ctx->ac.chip_class <= GFX8) { - nir_deref_instr *texture_deref_instr = get_tex_texture_deref(instr); - nir_variable *var = nir_deref_instr_get_variable(texture_deref_instr); - const struct glsl_type *type = glsl_without_array(var->type); - enum glsl_base_type stype = glsl_get_sampler_result_type(type); - if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) { - return lower_gather4_integer(&ctx->ac, var, args, instr); - } - } - - /* Fixup for GFX9 which allocates 1D textures as 2D. 
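 *
 * (Illustrative note, not part of this commit: the fixup just supplies the
 * missing second coordinate, e.g.
 *
 *    if ((dim == ac_image_2d || dim == ac_image_2darray) && !coords[1])
 *       coords[1] = zero;   /* sample the 1D-as-2D image at y = 0 */
 * )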
*/
-	if (instr->op == nir_texop_lod && ctx->ac.chip_class == GFX9) {
-		if ((args->dim == ac_image_2darray ||
-		     args->dim == ac_image_2d) && !args->coords[1]) {
-			args->coords[1] = ctx->ac.i32_0;
-		}
-	}
-
-	args->attributes = AC_FUNC_ATTR_READNONE;
-	bool cs_derivs = ctx->stage == MESA_SHADER_COMPUTE &&
-		ctx->info->cs.derivative_group != DERIVATIVE_GROUP_NONE;
-	if (ctx->stage == MESA_SHADER_FRAGMENT || cs_derivs) {
-		/* Prevent texture instructions with implicit derivatives from being
-		 * sunk into branches. */
-		switch (instr->op) {
-		case nir_texop_tex:
-		case nir_texop_txb:
-		case nir_texop_lod:
-			args->attributes |= AC_FUNC_ATTR_CONVERGENT;
-			break;
-		default:
-			break;
-		}
-	}
-
-	return ac_build_image_opcode(&ctx->ac, args);
+   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
+      unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
+
+      assert(instr->dest.is_ssa);
+      return ac_build_buffer_load_format(&ctx->ac, args->resource, args->coords[0], ctx->ac.i32_0,
+                                         util_last_bit(mask), 0, true,
+                                         instr->dest.ssa.bit_size == 16);
+   }
+
+   args->opcode = ac_image_sample;
+
+   switch (instr->op) {
+   case nir_texop_txf:
+   case nir_texop_txf_ms:
+   case nir_texop_samples_identical:
+      args->opcode = args->level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS
+                        ? ac_image_load
+                        : ac_image_load_mip;
+      args->level_zero = false;
+      break;
+   case nir_texop_txs:
+   case nir_texop_query_levels:
+      args->opcode = ac_image_get_resinfo;
+      if (!args->lod)
+         args->lod = ctx->ac.i32_0;
+      args->level_zero = false;
+      break;
+   case nir_texop_tex:
+      if (ctx->stage != MESA_SHADER_FRAGMENT) {
+         assert(!args->lod);
+         args->level_zero = true;
+      }
+      break;
+   case nir_texop_tg4:
+      args->opcode = ac_image_gather4;
+      if (!args->lod && !args->bias)
+         args->level_zero = true;
+      break;
+   case nir_texop_lod:
+      args->opcode = ac_image_get_lod;
+      break;
+   case nir_texop_fragment_fetch:
+   case nir_texop_fragment_mask_fetch:
+      args->opcode = ac_image_load;
+      args->level_zero = false;
+      break;
+   default:
+      break;
+   }
+
+   if (instr->op == nir_texop_tg4 && ctx->ac.chip_class <= GFX8) {
+      nir_deref_instr *texture_deref_instr = get_tex_texture_deref(instr);
+      nir_variable *var = nir_deref_instr_get_variable(texture_deref_instr);
+      const struct glsl_type *type = glsl_without_array(var->type);
+      enum glsl_base_type stype = glsl_get_sampler_result_type(type);
+      if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) {
+         return lower_gather4_integer(&ctx->ac, var, args, instr);
+      }
+   }
+
+   /* Fixup for GFX9 which allocates 1D textures as 2D. */
+   if (instr->op == nir_texop_lod && ctx->ac.chip_class == GFX9) {
+      if ((args->dim == ac_image_2darray || args->dim == ac_image_2d) && !args->coords[1]) {
+         args->coords[1] = ctx->ac.i32_0;
+      }
+   }
+
+   args->attributes = AC_FUNC_ATTR_READNONE;
+   bool cs_derivs =
+      ctx->stage == MESA_SHADER_COMPUTE && ctx->info->cs.derivative_group != DERIVATIVE_GROUP_NONE;
+   if (ctx->stage == MESA_SHADER_FRAGMENT || cs_derivs) {
+      /* Prevent texture instructions with implicit derivatives from being
+       * sunk into branches. 
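 *
 * (Illustrative note, not part of this commit: implicit derivatives read
 * neighbouring lanes of a 2x2 quad, so pseudocode like
 *
 *    if (divergent_cond)
 *       color = sample_implicit_lod(tex, smp, uv);   /* needs the whole quad */
 *
 * must not have the sample sunk past the branch; AC_FUNC_ATTR_CONVERGENT
 * forbids that motion.)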
*/ + switch (instr->op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_lod: + args->attributes |= AC_FUNC_ATTR_CONVERGENT; + break; + default: + break; + } + } + + return ac_build_image_opcode(&ctx->ac, args); } static LLVMValueRef visit_vulkan_resource_reindex(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - LLVMValueRef ptr = get_src(ctx, instr->src[0]); - LLVMValueRef index = get_src(ctx, instr->src[1]); + LLVMValueRef ptr = get_src(ctx, instr->src[0]); + LLVMValueRef index = get_src(ctx, instr->src[1]); - LLVMValueRef result = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); - LLVMSetMetadata(result, ctx->ac.uniform_md_kind, ctx->ac.empty_md); - return result; + LLVMValueRef result = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); + LLVMSetMetadata(result, ctx->ac.uniform_md_kind, ctx->ac.empty_md); + return result; } -static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) +static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - LLVMValueRef ptr, addr; - LLVMValueRef src0 = get_src(ctx, instr->src[0]); - unsigned index = nir_intrinsic_base(instr); - - addr = LLVMConstInt(ctx->ac.i32, index, 0); - addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, ""); - - /* Load constant values from user SGPRS when possible, otherwise - * fallback to the default path that loads directly from memory. - */ - if (LLVMIsConstant(src0) && - instr->dest.ssa.bit_size == 32) { - unsigned count = instr->dest.ssa.num_components; - unsigned offset = index; - - offset += LLVMConstIntGetZExtValue(src0); - offset /= 4; - - offset -= ctx->args->base_inline_push_consts; - - unsigned num_inline_push_consts = ctx->args->num_inline_push_consts; - if (offset + count <= num_inline_push_consts) { - LLVMValueRef push_constants[num_inline_push_consts]; - for (unsigned i = 0; i < num_inline_push_consts; i++) - push_constants[i] = ac_get_arg(&ctx->ac, - ctx->args->inline_push_consts[i]); - return ac_build_gather_values(&ctx->ac, - push_constants + offset, - count); - } - } - - ptr = LLVMBuildGEP(ctx->ac.builder, - ac_get_arg(&ctx->ac, ctx->args->push_constants), &addr, 1, ""); - - if (instr->dest.ssa.bit_size == 8) { - unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 
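/* Illustrative sketch, not part of this commit: llvm.amdgcn.alignbyte, used a
 * few lines below, extracts an unaligned dword from two aligned ones, roughly
 * (assuming only the low two bits of the byte offset matter):
 *
 *    #include <stdint.h>
 *    static uint32_t alignbyte(uint32_t hi, uint32_t lo, uint32_t byte_off)
 *    {
 *       uint64_t pair = ((uint64_t)hi << 32) | lo;
 *       return (uint32_t)(pair >> (8 * (byte_off & 3)));
 *    }
 */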
2 : 1;
-		LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i8, 4 * load_dwords);
-		ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
-		LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
-
-		LLVMValueRef params[3];
-		if (load_dwords > 1) {
-			LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.v2i32, "");
-			params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 1, false), "");
-			params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, LLVMConstInt(ctx->ac.i32, 0, false), "");
-		} else {
-			res = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.i32, "");
-			params[0] = ctx->ac.i32_0;
-			params[1] = res;
-		}
-		params[2] = addr;
-		res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.alignbyte", ctx->ac.i32, params, 3, 0);
-
-		res = LLVMBuildTrunc(ctx->ac.builder, res, LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), "");
-		if (instr->dest.ssa.num_components > 1)
-			res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i8, instr->dest.ssa.num_components), "");
-		return res;
-	} else if (instr->dest.ssa.bit_size == 16) {
-		unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1;
-		LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i16, 2 * load_dwords);
-		ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
-		LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
-		res = LLVMBuildBitCast(ctx->ac.builder, res, vec_type, "");
-		LLVMValueRef cond = LLVMBuildLShr(ctx->ac.builder, addr, ctx->ac.i32_1, "");
-		cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
-		LLVMValueRef mask[] = { LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false),
-					LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false),
-					LLVMConstInt(ctx->ac.i32, 4, false)};
-		LLVMValueRef swizzle_aligned = LLVMConstVector(&mask[0], instr->dest.ssa.num_components);
-		LLVMValueRef swizzle_unaligned = LLVMConstVector(&mask[1], instr->dest.ssa.num_components);
-		LLVMValueRef shuffle_aligned = LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_aligned, "");
-		LLVMValueRef shuffle_unaligned = LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_unaligned, "");
-		res = LLVMBuildSelect(ctx->ac.builder, cond, shuffle_unaligned, shuffle_aligned, "");
-		return LLVMBuildBitCast(ctx->ac.builder, res, get_def_type(ctx, &instr->dest.ssa), "");
-	}
-
-	ptr = ac_cast_ptr(&ctx->ac, ptr, get_def_type(ctx, &instr->dest.ssa));
-
-	return LLVMBuildLoad(ctx->ac.builder, ptr, "");
+   LLVMValueRef ptr, addr;
+   LLVMValueRef src0 = get_src(ctx, instr->src[0]);
+   unsigned index = nir_intrinsic_base(instr);
+
+   addr = LLVMConstInt(ctx->ac.i32, index, 0);
+   addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, "");
+
+   /* Load constant values from user SGPRs when possible, otherwise
+    * fall back to the default path that loads directly from memory. 
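 *
 * (Illustrative sketch, not part of this commit: the fast path below reduces
 * to dword-index arithmetic,
 *
 *    /* returns -1 when the range is not fully inlined */
 *    static int inline_slot(unsigned base, unsigned const_off, unsigned count,
 *                           unsigned first_inline_dw, unsigned num_inline_dw)
 *    {
 *       unsigned dw = (base + const_off) / 4 - first_inline_dw;
 *       return dw + count <= num_inline_dw ? (int)dw : -1;
 *    }
 *
 * so fully-constant 32-bit ranges are gathered straight from SGPR args.)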
+ */ + if (LLVMIsConstant(src0) && instr->dest.ssa.bit_size == 32) { + unsigned count = instr->dest.ssa.num_components; + unsigned offset = index; + + offset += LLVMConstIntGetZExtValue(src0); + offset /= 4; + + offset -= ctx->args->base_inline_push_consts; + + unsigned num_inline_push_consts = ctx->args->num_inline_push_consts; + if (offset + count <= num_inline_push_consts) { + LLVMValueRef push_constants[num_inline_push_consts]; + for (unsigned i = 0; i < num_inline_push_consts; i++) + push_constants[i] = ac_get_arg(&ctx->ac, ctx->args->inline_push_consts[i]); + return ac_build_gather_values(&ctx->ac, push_constants + offset, count); + } + } + + ptr = + LLVMBuildGEP(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->push_constants), &addr, 1, ""); + + if (instr->dest.ssa.bit_size == 8) { + unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1; + LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i8, 4 * load_dwords); + ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type); + LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + + LLVMValueRef params[3]; + if (load_dwords > 1) { + LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.v2i32, ""); + params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, + LLVMConstInt(ctx->ac.i32, 1, false), ""); + params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec, + LLVMConstInt(ctx->ac.i32, 0, false), ""); + } else { + res = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.i32, ""); + params[0] = ctx->ac.i32_0; + params[1] = res; + } + params[2] = addr; + res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.alignbyte", ctx->ac.i32, params, 3, 0); + + res = LLVMBuildTrunc( + ctx->ac.builder, res, + LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), ""); + if (instr->dest.ssa.num_components > 1) + res = LLVMBuildBitCast(ctx->ac.builder, res, + LLVMVectorType(ctx->ac.i8, instr->dest.ssa.num_components), ""); + return res; + } else if (instr->dest.ssa.bit_size == 16) { + unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1; + LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i16, 2 * load_dwords); + ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type); + LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + res = LLVMBuildBitCast(ctx->ac.builder, res, vec_type, ""); + LLVMValueRef cond = LLVMBuildLShr(ctx->ac.builder, addr, ctx->ac.i32_1, ""); + cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, ""); + LLVMValueRef mask[] = { + LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false), + LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), + LLVMConstInt(ctx->ac.i32, 4, false)}; + LLVMValueRef swizzle_aligned = LLVMConstVector(&mask[0], instr->dest.ssa.num_components); + LLVMValueRef swizzle_unaligned = LLVMConstVector(&mask[1], instr->dest.ssa.num_components); + LLVMValueRef shuffle_aligned = + LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_aligned, ""); + LLVMValueRef shuffle_unaligned = + LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_unaligned, ""); + res = LLVMBuildSelect(ctx->ac.builder, cond, shuffle_unaligned, shuffle_aligned, ""); + return LLVMBuildBitCast(ctx->ac.builder, res, get_def_type(ctx, &instr->dest.ssa), ""); + } + + ptr = ac_cast_ptr(&ctx->ac, ptr, get_def_type(ctx, &instr->dest.ssa)); + + return LLVMBuildLoad(ctx->ac.builder, ptr, ""); } static LLVMValueRef visit_get_buffer_size(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { - LLVMValueRef index = get_src(ctx, instr->src[0]); + LLVMValueRef index = 
get_src(ctx, instr->src[0]); - return get_buffer_size(ctx, ctx->abi->load_ssbo(ctx->abi, index, false), false); + return get_buffer_size(ctx, ctx->abi->load_ssbo(ctx->abi, index, false), false); } static uint32_t widen_mask(uint32_t mask, unsigned multiplier) { - uint32_t new_mask = 0; - for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) - if (mask & (1u << i)) - new_mask |= ((1u << multiplier) - 1u) << (i * multiplier); - return new_mask; + uint32_t new_mask = 0; + for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) + if (mask & (1u << i)) + new_mask |= ((1u << multiplier) - 1u) << (i * multiplier); + return new_mask; } static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned start, unsigned count) { - LLVMValueRef mask[] = { - ctx->i32_0, ctx->i32_1, - LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false) }; - - unsigned src_elements = ac_get_llvm_num_components(src); - - if (count == src_elements) { - assert(start == 0); - return src; - } else if (count == 1) { - assert(start < src_elements); - return LLVMBuildExtractElement(ctx->builder, src, mask[start], ""); - } else { - assert(start + count <= src_elements); - assert(count <= 4); - LLVMValueRef swizzle = LLVMConstVector(&mask[start], count); - return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, ""); - } + LLVMValueRef mask[] = {ctx->i32_0, ctx->i32_1, LLVMConstInt(ctx->i32, 2, false), + LLVMConstInt(ctx->i32, 3, false)}; + + unsigned src_elements = ac_get_llvm_num_components(src); + + if (count == src_elements) { + assert(start == 0); + return src; + } else if (count == 1) { + assert(start < src_elements); + return LLVMBuildExtractElement(ctx->builder, src, mask[start], ""); + } else { + assert(start + count <= src_elements); + assert(count <= 4); + LLVMValueRef swizzle = LLVMConstVector(&mask[start], count); + return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, ""); + } } -static unsigned get_cache_policy(struct ac_nir_context *ctx, - enum gl_access_qualifier access, - bool may_store_unaligned, - bool writeonly_memory) +static unsigned get_cache_policy(struct ac_nir_context *ctx, enum gl_access_qualifier access, + bool may_store_unaligned, bool writeonly_memory) { - unsigned cache_policy = 0; - - /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All - * store opcodes not aligned to a dword are affected. The only way to - * get unaligned stores is through shader images. - */ - if (((may_store_unaligned && ctx->ac.chip_class == GFX6) || - /* If this is write-only, don't keep data in L1 to prevent - * evicting L1 cache lines that may be needed by other - * instructions. - */ - writeonly_memory || - access & (ACCESS_COHERENT | ACCESS_VOLATILE))) { - cache_policy |= ac_glc; - } - - if (access & ACCESS_STREAM_CACHE_POLICY) - cache_policy |= ac_slc | ac_glc; - - return cache_policy; + unsigned cache_policy = 0; + + /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All + * store opcodes not aligned to a dword are affected. The only way to + * get unaligned stores is through shader images. + */ + if (((may_store_unaligned && ctx->ac.chip_class == GFX6) || + /* If this is write-only, don't keep data in L1 to prevent + * evicting L1 cache lines that may be needed by other + * instructions. 
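 *
 * (Illustrative sketch, not part of this commit: the whole function boils
 * down to two policy bits,
 *
 *    unsigned p = 0;
 *    if (unaligned_store_on_gfx6 || write_only || coherent_or_volatile)
 *       p |= ac_glc;               /* bypass L1 */
 *    if (stream_cache_policy)
 *       p |= ac_slc | ac_glc;      /* additionally mark as streaming */
 *    return p;
 * )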
+ */ + writeonly_memory || access & (ACCESS_COHERENT | ACCESS_VOLATILE))) { + cache_policy |= ac_glc; + } + + if (access & ACCESS_STREAM_CACHE_POLICY) + cache_policy |= ac_slc | ac_glc; + + return cache_policy; } -static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx, - struct waterfall_context *wctx, - const nir_intrinsic_instr *instr, - nir_src src) +static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx, struct waterfall_context *wctx, + const nir_intrinsic_instr *instr, nir_src src) { - return enter_waterfall(ctx, wctx, get_src(ctx, src), - nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM); + return enter_waterfall(ctx, wctx, get_src(ctx, src), + nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM); } -static void visit_store_ssbo(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) +static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - if (ctx->ac.postponed_kill) { - LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, - ctx->ac.postponed_kill, ""); - ac_build_ifcc(&ctx->ac, cond, 7000); - } - - LLVMValueRef src_data = get_src(ctx, instr->src[0]); - int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8; - unsigned writemask = nir_intrinsic_write_mask(instr); - enum gl_access_qualifier access = nir_intrinsic_access(instr); - bool writeonly_memory = access & ACCESS_NON_READABLE; - unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory); - - struct waterfall_context wctx; - LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]); - - LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true); - LLVMValueRef base_data = src_data; - base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components); - LLVMValueRef base_offset = get_src(ctx, instr->src[2]); - - while (writemask) { - int start, count; - LLVMValueRef data, offset; - LLVMTypeRef data_type; - - u_bit_scan_consecutive_range(&writemask, &start, &count); - - /* Due to an LLVM limitation with LLVM < 9, split 3-element - * writes into a 2-element and a 1-element write. */ - if (count == 3 && - (elem_size_bytes != 4 || !ac_has_vec3_support(ctx->ac.chip_class, false))) { - writemask |= 1 << (start + 2); - count = 2; - } - int num_bytes = count * elem_size_bytes; /* count in bytes */ - - /* we can only store 4 DWords at the same time. - * can only happen for 64 Bit vectors. */ - if (num_bytes > 16) { - writemask |= ((1u << (count - 2)) - 1u) << (start + 2); - count = 2; - num_bytes = 16; - } - - /* check alignment of 16 Bit stores */ - if (elem_size_bytes == 2 && num_bytes > 2 && (start % 2) == 1) { - writemask |= ((1u << (count - 1)) - 1u) << (start + 1); - count = 1; - num_bytes = 2; - } - - /* Due to alignment issues, split stores of 8-bit/16-bit - * vectors. 
- */ - if (ctx->ac.chip_class == GFX6 && count > 1 && elem_size_bytes < 4) { - writemask |= ((1u << (count - 1)) - 1u) << (start + 1); - count = 1; - num_bytes = elem_size_bytes; - } - - data = extract_vector_range(&ctx->ac, base_data, start, count); - - offset = LLVMBuildAdd(ctx->ac.builder, base_offset, - LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), ""); - - if (num_bytes == 1) { - ac_build_tbuffer_store_byte(&ctx->ac, rsrc, data, - offset, ctx->ac.i32_0, - cache_policy); - } else if (num_bytes == 2) { - ac_build_tbuffer_store_short(&ctx->ac, rsrc, data, - offset, ctx->ac.i32_0, - cache_policy); - } else { - int num_channels = num_bytes / 4; - - switch (num_bytes) { - case 16: /* v4f32 */ - data_type = ctx->ac.v4f32; - break; - case 12: /* v3f32 */ - data_type = ctx->ac.v3f32; - break; - case 8: /* v2f32 */ - data_type = ctx->ac.v2f32; - break; - case 4: /* f32 */ - data_type = ctx->ac.f32; - break; - default: - unreachable("Malformed vector store."); - } - data = LLVMBuildBitCast(ctx->ac.builder, data, data_type, ""); - - ac_build_buffer_store_dword(&ctx->ac, rsrc, data, - num_channels, offset, - ctx->ac.i32_0, 0, - cache_policy); - } - } - - exit_waterfall(ctx, &wctx, NULL); - - if (ctx->ac.postponed_kill) - ac_build_endif(&ctx->ac, 7000); + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7000); + } + + LLVMValueRef src_data = get_src(ctx, instr->src[0]); + int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8; + unsigned writemask = nir_intrinsic_write_mask(instr); + enum gl_access_qualifier access = nir_intrinsic_access(instr); + bool writeonly_memory = access & ACCESS_NON_READABLE; + unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory); + + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]); + + LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true); + LLVMValueRef base_data = src_data; + base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components); + LLVMValueRef base_offset = get_src(ctx, instr->src[2]); + + while (writemask) { + int start, count; + LLVMValueRef data, offset; + LLVMTypeRef data_type; + + u_bit_scan_consecutive_range(&writemask, &start, &count); + + /* Due to an LLVM limitation with LLVM < 9, split 3-element + * writes into a 2-element and a 1-element write. */ + if (count == 3 && (elem_size_bytes != 4 || !ac_has_vec3_support(ctx->ac.chip_class, false))) { + writemask |= 1 << (start + 2); + count = 2; + } + int num_bytes = count * elem_size_bytes; /* count in bytes */ + + /* we can only store 4 DWords at the same time. + * can only happen for 64 Bit vectors. */ + if (num_bytes > 16) { + writemask |= ((1u << (count - 2)) - 1u) << (start + 2); + count = 2; + num_bytes = 16; + } + + /* check alignment of 16 Bit stores */ + if (elem_size_bytes == 2 && num_bytes > 2 && (start % 2) == 1) { + writemask |= ((1u << (count - 1)) - 1u) << (start + 1); + count = 1; + num_bytes = 2; + } + + /* Due to alignment issues, split stores of 8-bit/16-bit + * vectors. 
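 *
 * (Illustrative note, not part of this commit: the writemask surgery in this
 * loop leans on widen_mask() defined above, e.g.
 *
 *    widen_mask(0x5, 2) == 0x33   /* 0b0101 -> 0b00110011 */
 *
 * so a mask over elements can be re-expressed over smaller units before a
 * split range is pushed back into 'writemask'.)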
+ */ + if (ctx->ac.chip_class == GFX6 && count > 1 && elem_size_bytes < 4) { + writemask |= ((1u << (count - 1)) - 1u) << (start + 1); + count = 1; + num_bytes = elem_size_bytes; + } + + data = extract_vector_range(&ctx->ac, base_data, start, count); + + offset = LLVMBuildAdd(ctx->ac.builder, base_offset, + LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), ""); + + if (num_bytes == 1) { + ac_build_tbuffer_store_byte(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy); + } else if (num_bytes == 2) { + ac_build_tbuffer_store_short(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy); + } else { + int num_channels = num_bytes / 4; + + switch (num_bytes) { + case 16: /* v4f32 */ + data_type = ctx->ac.v4f32; + break; + case 12: /* v3f32 */ + data_type = ctx->ac.v3f32; + break; + case 8: /* v2f32 */ + data_type = ctx->ac.v2f32; + break; + case 4: /* f32 */ + data_type = ctx->ac.f32; + break; + default: + unreachable("Malformed vector store."); + } + data = LLVMBuildBitCast(ctx->ac.builder, data, data_type, ""); + + ac_build_buffer_store_dword(&ctx->ac, rsrc, data, num_channels, offset, ctx->ac.i32_0, 0, + cache_policy); + } + } + + exit_waterfall(ctx, &wctx, NULL); + + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7000); } -static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx, - LLVMValueRef descriptor, - LLVMValueRef offset, - LLVMValueRef compare, - LLVMValueRef exchange) +static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx, LLVMValueRef descriptor, + LLVMValueRef offset, LLVMValueRef compare, + LLVMValueRef exchange) { - LLVMBasicBlockRef start_block = NULL, then_block = NULL; - if (ctx->abi->robust_buffer_access) { - LLVMValueRef size = ac_llvm_extract_elem(&ctx->ac, descriptor, 2); - - LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, offset, size, ""); - start_block = LLVMGetInsertBlock(ctx->ac.builder); - - ac_build_ifcc(&ctx->ac, cond, -1); - - then_block = LLVMGetInsertBlock(ctx->ac.builder); - } - - LLVMValueRef ptr_parts[2] = { - ac_llvm_extract_elem(&ctx->ac, descriptor, 0), - LLVMBuildAnd(ctx->ac.builder, - ac_llvm_extract_elem(&ctx->ac, descriptor, 1), - LLVMConstInt(ctx->ac.i32, 65535, 0), "") - }; - - ptr_parts[1] = LLVMBuildTrunc(ctx->ac.builder, ptr_parts[1], ctx->ac.i16, ""); - ptr_parts[1] = LLVMBuildSExt(ctx->ac.builder, ptr_parts[1], ctx->ac.i32, ""); - - offset = LLVMBuildZExt(ctx->ac.builder, offset, ctx->ac.i64, ""); - - LLVMValueRef ptr = ac_build_gather_values(&ctx->ac, ptr_parts, 2); - ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, ""); - ptr = LLVMBuildAdd(ctx->ac.builder, ptr, offset, ""); - ptr = LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.i64, AC_ADDR_SPACE_GLOBAL), ""); - - LLVMValueRef result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, compare, exchange, "singlethread-one-as"); - result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); - - if (ctx->abi->robust_buffer_access) { - ac_build_endif(&ctx->ac, -1); - - LLVMBasicBlockRef incoming_blocks[2] = { - start_block, - then_block, - }; - - LLVMValueRef incoming_values[2] = { - LLVMConstInt(ctx->ac.i64, 0, 0), - result, - }; - LLVMValueRef ret = LLVMBuildPhi(ctx->ac.builder, ctx->ac.i64, ""); - LLVMAddIncoming(ret, incoming_values, incoming_blocks, 2); - return ret; - } else { - return result; - } + LLVMBasicBlockRef start_block = NULL, then_block = NULL; + if (ctx->abi->robust_buffer_access) { + LLVMValueRef size = ac_llvm_extract_elem(&ctx->ac, descriptor, 2); + + LLVMValueRef cond = 
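/* Illustrative sketch, not part of this commit: the pointer assembled below
 * combines the V#'s 48-bit base address with the byte offset, roughly:
 *
 *    #include <stdint.h>
 *    static uint64_t ssbo_addr(uint32_t desc0, uint32_t desc1, uint64_t off)
 *    {
 *       int32_t hi = (int16_t)(desc1 & 0xffff);    /* sign-extend bits 47:32 */
 *       return (((uint64_t)(uint32_t)hi << 32) | desc0) + off;
 *    }
 *
 * guarded, when robust_buffer_access is set, by the bounds check started on
 * this line. */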
LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, offset, size, ""); + start_block = LLVMGetInsertBlock(ctx->ac.builder); + + ac_build_ifcc(&ctx->ac, cond, -1); + + then_block = LLVMGetInsertBlock(ctx->ac.builder); + } + + LLVMValueRef ptr_parts[2] = { + ac_llvm_extract_elem(&ctx->ac, descriptor, 0), + LLVMBuildAnd(ctx->ac.builder, ac_llvm_extract_elem(&ctx->ac, descriptor, 1), + LLVMConstInt(ctx->ac.i32, 65535, 0), "")}; + + ptr_parts[1] = LLVMBuildTrunc(ctx->ac.builder, ptr_parts[1], ctx->ac.i16, ""); + ptr_parts[1] = LLVMBuildSExt(ctx->ac.builder, ptr_parts[1], ctx->ac.i32, ""); + + offset = LLVMBuildZExt(ctx->ac.builder, offset, ctx->ac.i64, ""); + + LLVMValueRef ptr = ac_build_gather_values(&ctx->ac, ptr_parts, 2); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, ""); + ptr = LLVMBuildAdd(ctx->ac.builder, ptr, offset, ""); + ptr = LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.i64, AC_ADDR_SPACE_GLOBAL), + ""); + + LLVMValueRef result = + ac_build_atomic_cmp_xchg(&ctx->ac, ptr, compare, exchange, "singlethread-one-as"); + result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); + + if (ctx->abi->robust_buffer_access) { + ac_build_endif(&ctx->ac, -1); + + LLVMBasicBlockRef incoming_blocks[2] = { + start_block, + then_block, + }; + + LLVMValueRef incoming_values[2] = { + LLVMConstInt(ctx->ac.i64, 0, 0), + result, + }; + LLVMValueRef ret = LLVMBuildPhi(ctx->ac.builder, ctx->ac.i64, ""); + LLVMAddIncoming(ret, incoming_values, incoming_blocks, 2); + return ret; + } else { + return result; + } } -static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) +static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - if (ctx->ac.postponed_kill) { - LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, - ctx->ac.postponed_kill, ""); - ac_build_ifcc(&ctx->ac, cond, 7001); - } - - LLVMTypeRef return_type = LLVMTypeOf(get_src(ctx, instr->src[2])); - const char *op; - char name[64], type[8]; - LLVMValueRef params[6], descriptor; - LLVMValueRef result; - int arg_count = 0; - - struct waterfall_context wctx; - LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]); - - switch (instr->intrinsic) { - case nir_intrinsic_ssbo_atomic_add: - op = "add"; - break; - case nir_intrinsic_ssbo_atomic_imin: - op = "smin"; - break; - case nir_intrinsic_ssbo_atomic_umin: - op = "umin"; - break; - case nir_intrinsic_ssbo_atomic_imax: - op = "smax"; - break; - case nir_intrinsic_ssbo_atomic_umax: - op = "umax"; - break; - case nir_intrinsic_ssbo_atomic_and: - op = "and"; - break; - case nir_intrinsic_ssbo_atomic_or: - op = "or"; - break; - case nir_intrinsic_ssbo_atomic_xor: - op = "xor"; - break; - case nir_intrinsic_ssbo_atomic_exchange: - op = "swap"; - break; - case nir_intrinsic_ssbo_atomic_comp_swap: - op = "cmpswap"; - break; - default: - abort(); - } - - descriptor = ctx->abi->load_ssbo(ctx->abi, - rsrc_base, - true); - - if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap && - return_type == ctx->ac.i64) { - result = emit_ssbo_comp_swap_64(ctx, descriptor, - get_src(ctx, instr->src[1]), - get_src(ctx, instr->src[2]), - get_src(ctx, instr->src[3])); - } else { - if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) { - params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0); - } - params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); - params[arg_count++] = descriptor; - - if (LLVM_VERSION_MAJOR >= 9) { - /* 
XXX: The new raw/struct atomic intrinsics are buggy with - * LLVM 8, see r358579. - */ - params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ - params[arg_count++] = ctx->ac.i32_0; /* soffset */ - params[arg_count++] = ctx->ac.i32_0; /* slc */ - - ac_build_type_name_for_intr(return_type, type, sizeof(type)); - snprintf(name, sizeof(name), - "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type); - } else { - params[arg_count++] = ctx->ac.i32_0; /* vindex */ - params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ - params[arg_count++] = ctx->ac.i1false; /* slc */ - - assert(return_type == ctx->ac.i32); - snprintf(name, sizeof(name), - "llvm.amdgcn.buffer.atomic.%s", op); - } - - result = ac_build_intrinsic(&ctx->ac, name, return_type, params, - arg_count, 0); - } - - result = exit_waterfall(ctx, &wctx, result); - if (ctx->ac.postponed_kill) - ac_build_endif(&ctx->ac, 7001); - return result; + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7001); + } + + LLVMTypeRef return_type = LLVMTypeOf(get_src(ctx, instr->src[2])); + const char *op; + char name[64], type[8]; + LLVMValueRef params[6], descriptor; + LLVMValueRef result; + int arg_count = 0; + + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]); + + switch (instr->intrinsic) { + case nir_intrinsic_ssbo_atomic_add: + op = "add"; + break; + case nir_intrinsic_ssbo_atomic_imin: + op = "smin"; + break; + case nir_intrinsic_ssbo_atomic_umin: + op = "umin"; + break; + case nir_intrinsic_ssbo_atomic_imax: + op = "smax"; + break; + case nir_intrinsic_ssbo_atomic_umax: + op = "umax"; + break; + case nir_intrinsic_ssbo_atomic_and: + op = "and"; + break; + case nir_intrinsic_ssbo_atomic_or: + op = "or"; + break; + case nir_intrinsic_ssbo_atomic_xor: + op = "xor"; + break; + case nir_intrinsic_ssbo_atomic_exchange: + op = "swap"; + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + op = "cmpswap"; + break; + default: + abort(); + } + + descriptor = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true); + + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap && return_type == ctx->ac.i64) { + result = emit_ssbo_comp_swap_64(ctx, descriptor, get_src(ctx, instr->src[1]), + get_src(ctx, instr->src[2]), get_src(ctx, instr->src[3])); + } else { + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) { + params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0); + } + params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); + params[arg_count++] = descriptor; + + if (LLVM_VERSION_MAJOR >= 9) { + /* XXX: The new raw/struct atomic intrinsics are buggy with + * LLVM 8, see r358579. 
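 *
 * (Illustrative note, not part of this commit: with the raw path the final
 * name is assembled as, e.g.,
 *
 *    snprintf(name, sizeof(name), "llvm.amdgcn.raw.buffer.atomic.%s.%s",
 *             "add", "i32");    /* -> llvm.amdgcn.raw.buffer.atomic.add.i32 */
 * )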
+ */ + params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ + params[arg_count++] = ctx->ac.i32_0; /* soffset */ + params[arg_count++] = ctx->ac.i32_0; /* slc */ + + ac_build_type_name_for_intr(return_type, type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type); + } else { + params[arg_count++] = ctx->ac.i32_0; /* vindex */ + params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ + params[arg_count++] = ctx->ac.i1false; /* slc */ + + assert(return_type == ctx->ac.i32); + snprintf(name, sizeof(name), "llvm.amdgcn.buffer.atomic.%s", op); + } + + result = ac_build_intrinsic(&ctx->ac, name, return_type, params, arg_count, 0); + } + + result = exit_waterfall(ctx, &wctx, result); + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7001); + return result; } -static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) +static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - struct waterfall_context wctx; - LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]); - - int elem_size_bytes = instr->dest.ssa.bit_size / 8; - int num_components = instr->num_components; - enum gl_access_qualifier access = nir_intrinsic_access(instr); - unsigned cache_policy = get_cache_policy(ctx, access, false, false); - - LLVMValueRef offset = get_src(ctx, instr->src[1]); - LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, false); - LLVMValueRef vindex = ctx->ac.i32_0; - - LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa); - LLVMTypeRef def_elem_type = num_components > 1 ? LLVMGetElementType(def_type) : def_type; - - LLVMValueRef results[4]; - for (int i = 0; i < num_components;) { - int num_elems = num_components - i; - if (elem_size_bytes < 4 && nir_intrinsic_align(instr) % 4 != 0) - num_elems = 1; - if (num_elems * elem_size_bytes > 16) - num_elems = 16 / elem_size_bytes; - int load_bytes = num_elems * elem_size_bytes; - - LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * elem_size_bytes, false); - - LLVMValueRef ret; - - if (load_bytes == 1) { - ret = ac_build_tbuffer_load_byte(&ctx->ac, - rsrc, - offset, - ctx->ac.i32_0, - immoffset, - cache_policy); - } else if (load_bytes == 2) { - ret = ac_build_tbuffer_load_short(&ctx->ac, - rsrc, - offset, - ctx->ac.i32_0, - immoffset, - cache_policy); - } else { - int num_channels = util_next_power_of_two(load_bytes) / 4; - bool can_speculate = access & ACCESS_CAN_REORDER; - - ret = ac_build_buffer_load(&ctx->ac, rsrc, num_channels, - vindex, offset, immoffset, 0, - cache_policy, can_speculate, false); - } - - LLVMTypeRef byte_vec = LLVMVectorType(ctx->ac.i8, ac_get_type_size(LLVMTypeOf(ret))); - ret = LLVMBuildBitCast(ctx->ac.builder, ret, byte_vec, ""); - ret = ac_trim_vector(&ctx->ac, ret, load_bytes); - - LLVMTypeRef ret_type = LLVMVectorType(def_elem_type, num_elems); - ret = LLVMBuildBitCast(ctx->ac.builder, ret, ret_type, ""); - - for (unsigned j = 0; j < num_elems; j++) { - results[i + j] = LLVMBuildExtractElement(ctx->ac.builder, ret, LLVMConstInt(ctx->ac.i32, j, false), ""); - } - i += num_elems; - } - - LLVMValueRef ret = ac_build_gather_values(&ctx->ac, results, num_components); - return exit_waterfall(ctx, &wctx, ret); + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]); + + int elem_size_bytes = instr->dest.ssa.bit_size / 8; + int num_components = instr->num_components; + enum gl_access_qualifier access 
= nir_intrinsic_access(instr); + unsigned cache_policy = get_cache_policy(ctx, access, false, false); + + LLVMValueRef offset = get_src(ctx, instr->src[1]); + LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, false); + LLVMValueRef vindex = ctx->ac.i32_0; + + LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa); + LLVMTypeRef def_elem_type = num_components > 1 ? LLVMGetElementType(def_type) : def_type; + + LLVMValueRef results[4]; + for (int i = 0; i < num_components;) { + int num_elems = num_components - i; + if (elem_size_bytes < 4 && nir_intrinsic_align(instr) % 4 != 0) + num_elems = 1; + if (num_elems * elem_size_bytes > 16) + num_elems = 16 / elem_size_bytes; + int load_bytes = num_elems * elem_size_bytes; + + LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * elem_size_bytes, false); + + LLVMValueRef ret; + + if (load_bytes == 1) { + ret = ac_build_tbuffer_load_byte(&ctx->ac, rsrc, offset, ctx->ac.i32_0, immoffset, + cache_policy); + } else if (load_bytes == 2) { + ret = ac_build_tbuffer_load_short(&ctx->ac, rsrc, offset, ctx->ac.i32_0, immoffset, + cache_policy); + } else { + int num_channels = util_next_power_of_two(load_bytes) / 4; + bool can_speculate = access & ACCESS_CAN_REORDER; + + ret = ac_build_buffer_load(&ctx->ac, rsrc, num_channels, vindex, offset, immoffset, 0, + cache_policy, can_speculate, false); + } + + LLVMTypeRef byte_vec = LLVMVectorType(ctx->ac.i8, ac_get_type_size(LLVMTypeOf(ret))); + ret = LLVMBuildBitCast(ctx->ac.builder, ret, byte_vec, ""); + ret = ac_trim_vector(&ctx->ac, ret, load_bytes); + + LLVMTypeRef ret_type = LLVMVectorType(def_elem_type, num_elems); + ret = LLVMBuildBitCast(ctx->ac.builder, ret, ret_type, ""); + + for (unsigned j = 0; j < num_elems; j++) { + results[i + j] = + LLVMBuildExtractElement(ctx->ac.builder, ret, LLVMConstInt(ctx->ac.i32, j, false), ""); + } + i += num_elems; + } + + LLVMValueRef ret = ac_build_gather_values(&ctx->ac, results, num_components); + return exit_waterfall(ctx, &wctx, ret); } -static LLVMValueRef enter_waterfall_ubo(struct ac_nir_context *ctx, - struct waterfall_context *wctx, - const nir_intrinsic_instr *instr) +static LLVMValueRef enter_waterfall_ubo(struct ac_nir_context *ctx, struct waterfall_context *wctx, + const nir_intrinsic_instr *instr) { - return enter_waterfall(ctx, wctx, get_src(ctx, instr->src[0]), - nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM); + return enter_waterfall(ctx, wctx, get_src(ctx, instr->src[0]), + nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM); } -static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) +static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - struct waterfall_context wctx; - LLVMValueRef rsrc_base = enter_waterfall_ubo(ctx, &wctx, instr); - - LLVMValueRef ret; - LLVMValueRef rsrc = rsrc_base; - LLVMValueRef offset = get_src(ctx, instr->src[1]); - int num_components = instr->num_components; - - if (ctx->abi->load_ubo) - rsrc = ctx->abi->load_ubo(ctx->abi, rsrc); - - if (instr->dest.ssa.bit_size == 64) - num_components *= 2; - - if (instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 8) { - unsigned load_bytes = instr->dest.ssa.bit_size / 8; - LLVMValueRef results[num_components]; - for (unsigned i = 0; i < num_components; ++i) { - LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, - load_bytes * i, 0); - - if (load_bytes == 1) { - results[i] = ac_build_tbuffer_load_byte(&ctx->ac, - rsrc, - offset, - ctx->ac.i32_0, - immoffset, - 0); - } else { 
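/* Illustrative note, not part of this commit: for 8- and 16-bit results each
 * component is fetched on its own at an immediate offset of
 *
 *    immoffset = i * load_bytes;   /* i = component index, load_bytes = 1 or 2 */
 *
 * and the pieces are then recombined with ac_build_gather_values(). */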
- assert(load_bytes == 2); - results[i] = ac_build_tbuffer_load_short(&ctx->ac, - rsrc, - offset, - ctx->ac.i32_0, - immoffset, - 0); - } - } - ret = ac_build_gather_values(&ctx->ac, results, num_components); - } else { - ret = ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset, - NULL, 0, 0, true, true); - - ret = ac_trim_vector(&ctx->ac, ret, num_components); - } - - ret = LLVMBuildBitCast(ctx->ac.builder, ret, - get_def_type(ctx, &instr->dest.ssa), ""); - - return exit_waterfall(ctx, &wctx, ret); + struct waterfall_context wctx; + LLVMValueRef rsrc_base = enter_waterfall_ubo(ctx, &wctx, instr); + + LLVMValueRef ret; + LLVMValueRef rsrc = rsrc_base; + LLVMValueRef offset = get_src(ctx, instr->src[1]); + int num_components = instr->num_components; + + if (ctx->abi->load_ubo) + rsrc = ctx->abi->load_ubo(ctx->abi, rsrc); + + if (instr->dest.ssa.bit_size == 64) + num_components *= 2; + + if (instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 8) { + unsigned load_bytes = instr->dest.ssa.bit_size / 8; + LLVMValueRef results[num_components]; + for (unsigned i = 0; i < num_components; ++i) { + LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, load_bytes * i, 0); + + if (load_bytes == 1) { + results[i] = + ac_build_tbuffer_load_byte(&ctx->ac, rsrc, offset, ctx->ac.i32_0, immoffset, 0); + } else { + assert(load_bytes == 2); + results[i] = + ac_build_tbuffer_load_short(&ctx->ac, rsrc, offset, ctx->ac.i32_0, immoffset, 0); + } + } + ret = ac_build_gather_values(&ctx->ac, results, num_components); + } else { + ret = + ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset, NULL, 0, 0, true, true); + + ret = ac_trim_vector(&ctx->ac, ret, num_components); + } + + ret = LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); + + return exit_waterfall(ctx, &wctx, ret); } -static void -get_deref_offset(struct ac_nir_context *ctx, nir_deref_instr *instr, - bool vs_in, unsigned *vertex_index_out, - LLVMValueRef *vertex_index_ref, - unsigned *const_out, LLVMValueRef *indir_out) +static void get_deref_offset(struct ac_nir_context *ctx, nir_deref_instr *instr, bool vs_in, + unsigned *vertex_index_out, LLVMValueRef *vertex_index_ref, + unsigned *const_out, LLVMValueRef *indir_out) { - nir_variable *var = nir_deref_instr_get_variable(instr); - nir_deref_path path; - unsigned idx_lvl = 1; - - nir_deref_path_init(&path, instr, NULL); - - if (vertex_index_out != NULL || vertex_index_ref != NULL) { - if (vertex_index_ref) { - *vertex_index_ref = get_src(ctx, path.path[idx_lvl]->arr.index); - if (vertex_index_out) - *vertex_index_out = 0; - } else { - *vertex_index_out = nir_src_as_uint(path.path[idx_lvl]->arr.index); - } - ++idx_lvl; - } - - uint32_t const_offset = 0; - LLVMValueRef offset = NULL; - - if (var->data.compact) { - assert(instr->deref_type == nir_deref_type_array); - const_offset = nir_src_as_uint(instr->arr.index); - goto out; - } - - for (; path.path[idx_lvl]; ++idx_lvl) { - const struct glsl_type *parent_type = path.path[idx_lvl - 1]->type; - if (path.path[idx_lvl]->deref_type == nir_deref_type_struct) { - unsigned index = path.path[idx_lvl]->strct.index; - - for (unsigned i = 0; i < index; i++) { - const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); - const_offset += glsl_count_attribute_slots(ft, vs_in); - } - } else if(path.path[idx_lvl]->deref_type == nir_deref_type_array) { - unsigned size = glsl_count_attribute_slots(path.path[idx_lvl]->type, vs_in); - if (nir_src_is_const(path.path[idx_lvl]->arr.index)) { - 
const_offset += size *
- nir_src_as_uint(path.path[idx_lvl]->arr.index);
- } else {
- LLVMValueRef array_off = LLVMBuildMul(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, size, 0),
- get_src(ctx, path.path[idx_lvl]->arr.index), "");
- if (offset)
- offset = LLVMBuildAdd(ctx->ac.builder, offset, array_off, "");
- else
- offset = array_off;
- }
- } else
- unreachable("Unhandled deref type in get_deref_instr_offset");
- }

out:
- nir_deref_path_finish(&path);

- if (const_offset && offset)
- offset = LLVMBuildAdd(ctx->ac.builder, offset,
- LLVMConstInt(ctx->ac.i32, const_offset, 0),
- "");

- *const_out = const_offset;
- *indir_out = offset;
+ nir_variable *var = nir_deref_instr_get_variable(instr);
+ nir_deref_path path;
+ unsigned idx_lvl = 1;
+
+ nir_deref_path_init(&path, instr, NULL);
+
+ if (vertex_index_out != NULL || vertex_index_ref != NULL) {
+ if (vertex_index_ref) {
+ *vertex_index_ref = get_src(ctx, path.path[idx_lvl]->arr.index);
+ if (vertex_index_out)
+ *vertex_index_out = 0;
+ } else {
+ *vertex_index_out = nir_src_as_uint(path.path[idx_lvl]->arr.index);
+ }
+ ++idx_lvl;
+ }
+
+ uint32_t const_offset = 0;
+ LLVMValueRef offset = NULL;
+
+ if (var->data.compact) {
+ assert(instr->deref_type == nir_deref_type_array);
+ const_offset = nir_src_as_uint(instr->arr.index);
+ goto out;
+ }
+
+ for (; path.path[idx_lvl]; ++idx_lvl) {
+ const struct glsl_type *parent_type = path.path[idx_lvl - 1]->type;
+ if (path.path[idx_lvl]->deref_type == nir_deref_type_struct) {
+ unsigned index = path.path[idx_lvl]->strct.index;
+
+ for (unsigned i = 0; i < index; i++) {
+ const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
+ const_offset += glsl_count_attribute_slots(ft, vs_in);
+ }
+ } else if (path.path[idx_lvl]->deref_type == nir_deref_type_array) {
+ unsigned size = glsl_count_attribute_slots(path.path[idx_lvl]->type, vs_in);
+ if (nir_src_is_const(path.path[idx_lvl]->arr.index)) {
+ const_offset += size * nir_src_as_uint(path.path[idx_lvl]->arr.index);
+ } else {
+ LLVMValueRef array_off =
+ LLVMBuildMul(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, size, 0),
+ get_src(ctx, path.path[idx_lvl]->arr.index), "");
+ if (offset)
+ offset = LLVMBuildAdd(ctx->ac.builder, offset, array_off, "");
+ else
+ offset = array_off;
+ }
+ } else
+ unreachable("Unhandled deref type in get_deref_instr_offset");
+ }

out:
+ nir_deref_path_finish(&path);

+ if (const_offset && offset)
+ offset =
+ LLVMBuildAdd(ctx->ac.builder, offset, LLVMConstInt(ctx->ac.i32, const_offset, 0), "");

+ *const_out = const_offset;
+ *indir_out = offset;
 }

-static LLVMValueRef load_tess_varyings(struct ac_nir_context *ctx,
- nir_intrinsic_instr *instr,
- bool load_inputs)
+static LLVMValueRef load_tess_varyings(struct ac_nir_context *ctx, nir_intrinsic_instr *instr,
+ bool load_inputs)
 {
- LLVMValueRef result;
- LLVMValueRef vertex_index = NULL;
- LLVMValueRef indir_index = NULL;
- unsigned const_index = 0;
-
- nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
-
- unsigned location = var->data.location;
- unsigned driver_location = var->data.driver_location;
- const bool is_patch = var->data.patch ||
- var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
- var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
- const bool is_compact = var->data.compact;
-
- get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
- false, NULL, is_patch ? 
NULL : &vertex_index, - &const_index, &indir_index); - - LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa); - - LLVMTypeRef src_component_type; - if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind) - src_component_type = LLVMGetElementType(dest_type); - else - src_component_type = dest_type; - - result = ctx->abi->load_tess_varyings(ctx->abi, src_component_type, - vertex_index, indir_index, - const_index, location, driver_location, - var->data.location_frac, - instr->num_components, - is_patch, is_compact, load_inputs); - if (instr->dest.ssa.bit_size == 16) { - result = ac_to_integer(&ctx->ac, result); - result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, ""); - } - return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); + LLVMValueRef result; + LLVMValueRef vertex_index = NULL; + LLVMValueRef indir_index = NULL; + unsigned const_index = 0; + + nir_variable *var = + nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + + unsigned location = var->data.location; + unsigned driver_location = var->data.driver_location; + const bool is_patch = var->data.patch || var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || + var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER; + const bool is_compact = var->data.compact; + + get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), false, NULL, + is_patch ? NULL : &vertex_index, &const_index, &indir_index); + + LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa); + + LLVMTypeRef src_component_type; + if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind) + src_component_type = LLVMGetElementType(dest_type); + else + src_component_type = dest_type; + + result = + ctx->abi->load_tess_varyings(ctx->abi, src_component_type, vertex_index, indir_index, + const_index, location, driver_location, var->data.location_frac, + instr->num_components, is_patch, is_compact, load_inputs); + if (instr->dest.ssa.bit_size == 16) { + result = ac_to_integer(&ctx->ac, result); + result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, ""); + } + return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); } -static unsigned -type_scalar_size_bytes(const struct glsl_type *type) +static unsigned type_scalar_size_bytes(const struct glsl_type *type) { - assert(glsl_type_is_vector_or_scalar(type) || - glsl_type_is_matrix(type)); + assert(glsl_type_is_vector_or_scalar(type) || glsl_type_is_matrix(type)); return glsl_type_is_boolean(type) ? 
4 : glsl_get_bit_size(type) / 8; } -static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) +static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); - nir_variable *var = nir_deref_instr_get_variable(deref); - - LLVMValueRef values[8]; - int idx = 0; - int ve = instr->dest.ssa.num_components; - unsigned comp = 0; - LLVMValueRef indir_index; - LLVMValueRef ret; - unsigned const_index; - unsigned stride = 4; - int mode = deref->mode; - - if (var) { - bool vs_in = ctx->stage == MESA_SHADER_VERTEX && - var->data.mode == nir_var_shader_in; - idx = var->data.driver_location; - comp = var->data.location_frac; - mode = var->data.mode; - - get_deref_offset(ctx, deref, vs_in, NULL, NULL, - &const_index, &indir_index); - - if (var->data.compact) { - stride = 1; - const_index += comp; - comp = 0; - } - } - - if (instr->dest.ssa.bit_size == 64 && - (deref->mode == nir_var_shader_in || - deref->mode == nir_var_shader_out || - deref->mode == nir_var_function_temp)) - ve *= 2; - - switch (mode) { - case nir_var_shader_in: - /* TODO: remove this after RADV switches to lowered IO */ - if (ctx->stage == MESA_SHADER_TESS_CTRL || - ctx->stage == MESA_SHADER_TESS_EVAL) { - return load_tess_varyings(ctx, instr, true); - } - - if (ctx->stage == MESA_SHADER_GEOMETRY) { - LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); - LLVMValueRef indir_index; - unsigned const_index, vertex_index; - get_deref_offset(ctx, deref, false, &vertex_index, NULL, - &const_index, &indir_index); - assert(indir_index == NULL); - - return ctx->abi->load_inputs(ctx->abi, var->data.location, - var->data.driver_location, - var->data.location_frac, - instr->num_components, vertex_index, const_index, type); - } - - for (unsigned chan = comp; chan < ve + comp; chan++) { - if (indir_index) { - unsigned count = glsl_count_attribute_slots( - var->type, - ctx->stage == MESA_SHADER_VERTEX); - count -= chan / 4; - LLVMValueRef tmp_vec = ac_build_gather_values_extended( - &ctx->ac, ctx->abi->inputs + idx + chan, count, - stride, false, true); - - values[chan] = LLVMBuildExtractElement(ctx->ac.builder, - tmp_vec, - indir_index, ""); - } else - values[chan] = ctx->abi->inputs[idx + chan + const_index * stride]; - } - break; - case nir_var_function_temp: - for (unsigned chan = 0; chan < ve; chan++) { - if (indir_index) { - unsigned count = glsl_count_attribute_slots( - var->type, false); - count -= chan / 4; - LLVMValueRef tmp_vec = ac_build_gather_values_extended( - &ctx->ac, ctx->locals + idx + chan, count, - stride, true, true); - - values[chan] = LLVMBuildExtractElement(ctx->ac.builder, - tmp_vec, - indir_index, ""); - } else { - values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->locals[idx + chan + const_index * stride], ""); - } - } - break; - case nir_var_shader_out: - /* TODO: remove this after RADV switches to lowered IO */ - if (ctx->stage == MESA_SHADER_TESS_CTRL) { - return load_tess_varyings(ctx, instr, false); - } - - if (ctx->stage == MESA_SHADER_FRAGMENT && - var->data.fb_fetch_output && - ctx->abi->emit_fbfetch) - return ctx->abi->emit_fbfetch(ctx->abi); - - for (unsigned chan = comp; chan < ve + comp; chan++) { - if (indir_index) { - unsigned count = glsl_count_attribute_slots( - var->type, false); - count -= chan / 4; - LLVMValueRef tmp_vec = ac_build_gather_values_extended( - &ctx->ac, ctx->abi->outputs + idx + chan, count, - stride, true, true); - - 
values[chan] = LLVMBuildExtractElement(ctx->ac.builder, - tmp_vec, - indir_index, ""); - } else { - values[chan] = LLVMBuildLoad(ctx->ac.builder, - ctx->abi->outputs[idx + chan + const_index * stride], - ""); - } - } - break; - case nir_var_mem_global: { - LLVMValueRef address = get_src(ctx, instr->src[0]); - LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa); - unsigned explicit_stride = glsl_get_explicit_stride(deref->type); - unsigned natural_stride = type_scalar_size_bytes(deref->type); - unsigned stride = explicit_stride ? explicit_stride : natural_stride; - int elem_size_bytes = ac_get_elem_bits(&ctx->ac, result_type) / 8; - bool split_loads = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4; - - if (stride != natural_stride || split_loads) { - if (LLVMGetTypeKind(result_type) == LLVMVectorTypeKind) - result_type = LLVMGetElementType(result_type); - - LLVMTypeRef ptr_type = LLVMPointerType(result_type, - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); - address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); - - for (unsigned i = 0; i < instr->dest.ssa.num_components; ++i) { - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * stride / natural_stride, 0); - values[i] = LLVMBuildLoad(ctx->ac.builder, - ac_build_gep_ptr(&ctx->ac, address, offset), ""); - - if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) - LLVMSetOrdering(values[i], LLVMAtomicOrderingMonotonic); - } - return ac_build_gather_values(&ctx->ac, values, instr->dest.ssa.num_components); - } else { - LLVMTypeRef ptr_type = LLVMPointerType(result_type, - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); - address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); - LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, ""); - - if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) - LLVMSetOrdering(val, LLVMAtomicOrderingMonotonic); - return val; - } - } - default: - unreachable("unhandle variable mode"); - } - ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp); - return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + + LLVMValueRef values[8]; + int idx = 0; + int ve = instr->dest.ssa.num_components; + unsigned comp = 0; + LLVMValueRef indir_index; + LLVMValueRef ret; + unsigned const_index; + unsigned stride = 4; + int mode = deref->mode; + + if (var) { + bool vs_in = ctx->stage == MESA_SHADER_VERTEX && var->data.mode == nir_var_shader_in; + idx = var->data.driver_location; + comp = var->data.location_frac; + mode = var->data.mode; + + get_deref_offset(ctx, deref, vs_in, NULL, NULL, &const_index, &indir_index); + + if (var->data.compact) { + stride = 1; + const_index += comp; + comp = 0; + } + } + + if (instr->dest.ssa.bit_size == 64 && + (deref->mode == nir_var_shader_in || deref->mode == nir_var_shader_out || + deref->mode == nir_var_function_temp)) + ve *= 2; + + switch (mode) { + case nir_var_shader_in: + /* TODO: remove this after RADV switches to lowered IO */ + if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) { + return load_tess_varyings(ctx, instr, true); + } + + if (ctx->stage == MESA_SHADER_GEOMETRY) { + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + LLVMValueRef indir_index; + unsigned const_index, vertex_index; + get_deref_offset(ctx, deref, false, &vertex_index, NULL, 
&const_index, &indir_index); + assert(indir_index == NULL); + + return ctx->abi->load_inputs(ctx->abi, var->data.location, var->data.driver_location, + var->data.location_frac, instr->num_components, vertex_index, + const_index, type); + } + + for (unsigned chan = comp; chan < ve + comp; chan++) { + if (indir_index) { + unsigned count = + glsl_count_attribute_slots(var->type, ctx->stage == MESA_SHADER_VERTEX); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->abi->inputs + idx + chan, count, stride, false, true); + + values[chan] = LLVMBuildExtractElement(ctx->ac.builder, tmp_vec, indir_index, ""); + } else + values[chan] = ctx->abi->inputs[idx + chan + const_index * stride]; + } + break; + case nir_var_function_temp: + for (unsigned chan = 0; chan < ve; chan++) { + if (indir_index) { + unsigned count = glsl_count_attribute_slots(var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->locals + idx + chan, count, stride, true, true); + + values[chan] = LLVMBuildExtractElement(ctx->ac.builder, tmp_vec, indir_index, ""); + } else { + values[chan] = + LLVMBuildLoad(ctx->ac.builder, ctx->locals[idx + chan + const_index * stride], ""); + } + } + break; + case nir_var_shader_out: + /* TODO: remove this after RADV switches to lowered IO */ + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + return load_tess_varyings(ctx, instr, false); + } + + if (ctx->stage == MESA_SHADER_FRAGMENT && var->data.fb_fetch_output && ctx->abi->emit_fbfetch) + return ctx->abi->emit_fbfetch(ctx->abi); + + for (unsigned chan = comp; chan < ve + comp; chan++) { + if (indir_index) { + unsigned count = glsl_count_attribute_slots(var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->abi->outputs + idx + chan, count, stride, true, true); + + values[chan] = LLVMBuildExtractElement(ctx->ac.builder, tmp_vec, indir_index, ""); + } else { + values[chan] = LLVMBuildLoad(ctx->ac.builder, + ctx->abi->outputs[idx + chan + const_index * stride], ""); + } + } + break; + case nir_var_mem_global: { + LLVMValueRef address = get_src(ctx, instr->src[0]); + LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa); + unsigned explicit_stride = glsl_get_explicit_stride(deref->type); + unsigned natural_stride = type_scalar_size_bytes(deref->type); + unsigned stride = explicit_stride ? 
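/* [Editor's note: illustrative aside, not part of the upstream patch. The
 * stride logic below distinguishes the GLSL-declared (explicit) stride from
 * the natural one, i.e. the scalar size reported by type_scalar_size_bytes().
 * For example, for a global float array laid out with an explicit stride of
 * 8 bytes, natural_stride == 4, so the per-component path below loads
 * element i from GEP offset i * stride / natural_stride == i * 2:
 *
 *    LLVMValueRef off = LLVMConstInt(ctx->ac.i32, i * 8 / 4, 0);
 *    values[i] = LLVMBuildLoad(ctx->ac.builder,
 *                              ac_build_gep_ptr(&ctx->ac, address, off), "");
 *
 * Only when the strides coincide (and no GFX6 sub-dword split is required)
 * is the whole vector fetched with a single LLVMBuildLoad.] */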
explicit_stride : natural_stride;
+ int elem_size_bytes = ac_get_elem_bits(&ctx->ac, result_type) / 8;
+ bool split_loads = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4;
+
+ if (stride != natural_stride || split_loads) {
+ if (LLVMGetTypeKind(result_type) == LLVMVectorTypeKind)
+ result_type = LLVMGetElementType(result_type);
+
+ LLVMTypeRef ptr_type =
+ LLVMPointerType(result_type, LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
+ address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type, "");
+
+ for (unsigned i = 0; i < instr->dest.ssa.num_components; ++i) {
+ LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * stride / natural_stride, 0);
+ values[i] =
+ LLVMBuildLoad(ctx->ac.builder, ac_build_gep_ptr(&ctx->ac, address, offset), "");
+
+ if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
+ LLVMSetOrdering(values[i], LLVMAtomicOrderingMonotonic);
+ }
+ return ac_build_gather_values(&ctx->ac, values, instr->dest.ssa.num_components);
+ } else {
+ LLVMTypeRef ptr_type =
+ LLVMPointerType(result_type, LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
+ address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type, "");
+ LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, "");
+
+ if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
+ LLVMSetOrdering(val, LLVMAtomicOrderingMonotonic);
+ return val;
+ }
+ }
+ default:
+ unreachable("unhandled variable mode");
+ }
+ ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp);
+ return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), "");
 }

-static void
-visit_store_var(struct ac_nir_context *ctx,
- nir_intrinsic_instr *instr)
+static void visit_store_var(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
 {
- if (ctx->ac.postponed_kill) {
- LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder,
- ctx->ac.postponed_kill, "");
- ac_build_ifcc(&ctx->ac, cond, 7002);
- }
-
- nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
- nir_variable *var = nir_deref_instr_get_variable(deref);
-
- LLVMValueRef temp_ptr, value;
- int idx = 0;
- unsigned comp = 0;
- LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1]));
- int writemask = instr->const_index[0];
- LLVMValueRef indir_index;
- unsigned const_index;
-
- if (var) {
- get_deref_offset(ctx, deref, false,
- NULL, NULL, &const_index, &indir_index);
- idx = var->data.driver_location;
- comp = var->data.location_frac;
-
- if (var->data.compact) {
- const_index += comp;
- comp = 0;
- }
- }
-
- if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64 &&
- (deref->mode == nir_var_shader_out ||
- deref->mode == nir_var_function_temp)) {
-
- src = LLVMBuildBitCast(ctx->ac.builder, src,
- LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2),
- "");
-
- writemask = widen_mask(writemask, 2);
- }
-
- writemask = writemask << comp;
-
- switch (deref->mode) {
- case nir_var_shader_out:
- /* TODO: remove this after RADV switches to lowered IO */
- if (ctx->stage == MESA_SHADER_TESS_CTRL) {
- LLVMValueRef vertex_index = NULL;
- LLVMValueRef indir_index = NULL;
- unsigned const_index = 0;
- const bool is_patch = var->data.patch ||
- var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
- var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
-
- get_deref_offset(ctx, deref, false, NULL,
- is_patch ? 
NULL : &vertex_index, - &const_index, &indir_index); - - ctx->abi->store_tcs_outputs(ctx->abi, var, - vertex_index, indir_index, - const_index, src, writemask, - var->data.location_frac, - var->data.driver_location); - break; - } - - for (unsigned chan = 0; chan < 8; chan++) { - int stride = 4; - if (!(writemask & (1 << chan))) - continue; - - value = ac_llvm_extract_elem(&ctx->ac, src, chan - comp); - - if (var->data.compact) - stride = 1; - if (indir_index) { - unsigned count = glsl_count_attribute_slots( - var->type, false); - count -= chan / 4; - LLVMValueRef tmp_vec = ac_build_gather_values_extended( - &ctx->ac, ctx->abi->outputs + idx + chan, count, - stride, true, true); - - tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, - value, indir_index, ""); - build_store_values_extended(&ctx->ac, ctx->abi->outputs + idx + chan, - count, stride, tmp_vec); - - } else { - temp_ptr = ctx->abi->outputs[idx + chan + const_index * stride]; - - LLVMBuildStore(ctx->ac.builder, value, temp_ptr); - } - } - break; - case nir_var_function_temp: - for (unsigned chan = 0; chan < 8; chan++) { - if (!(writemask & (1 << chan))) - continue; - - value = ac_llvm_extract_elem(&ctx->ac, src, chan); - if (indir_index) { - unsigned count = glsl_count_attribute_slots( - var->type, false); - count -= chan / 4; - LLVMValueRef tmp_vec = ac_build_gather_values_extended( - &ctx->ac, ctx->locals + idx + chan, count, - 4, true, true); - - tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, - value, indir_index, ""); - build_store_values_extended(&ctx->ac, ctx->locals + idx + chan, - count, 4, tmp_vec); - } else { - temp_ptr = ctx->locals[idx + chan + const_index * 4]; - - LLVMBuildStore(ctx->ac.builder, value, temp_ptr); - } - } - break; - - case nir_var_mem_global: { - int writemask = instr->const_index[0]; - LLVMValueRef address = get_src(ctx, instr->src[0]); - LLVMValueRef val = get_src(ctx, instr->src[1]); - - unsigned explicit_stride = glsl_get_explicit_stride(deref->type); - unsigned natural_stride = type_scalar_size_bytes(deref->type); - unsigned stride = explicit_stride ? 
explicit_stride : natural_stride; - int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(val)) / 8; - bool split_stores = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4; - - LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val), - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); - address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); - - if (writemask == (1u << ac_get_llvm_num_components(val)) - 1 && - stride == natural_stride && !split_stores) { - LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(val), - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); - address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); - - val = LLVMBuildBitCast(ctx->ac.builder, val, - LLVMGetElementType(LLVMTypeOf(address)), ""); - LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, val, address); - - if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) - LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic); - } else { - LLVMTypeRef val_type = LLVMTypeOf(val); - if (LLVMGetTypeKind(LLVMTypeOf(val)) == LLVMVectorTypeKind) - val_type = LLVMGetElementType(val_type); - - LLVMTypeRef ptr_type = LLVMPointerType(val_type, - LLVMGetPointerAddressSpace(LLVMTypeOf(address))); - address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type , ""); - for (unsigned chan = 0; chan < 4; chan++) { - if (!(writemask & (1 << chan))) - continue; - - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, chan * stride / natural_stride, 0); - - LLVMValueRef ptr = ac_build_gep_ptr(&ctx->ac, address, offset); - LLVMValueRef src = ac_llvm_extract_elem(&ctx->ac, val, - chan); - src = LLVMBuildBitCast(ctx->ac.builder, src, - LLVMGetElementType(LLVMTypeOf(ptr)), ""); - LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, src, ptr); - - if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) - LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic); - } - } - break; - } - default: - abort(); - break; - } - - if (ctx->ac.postponed_kill) - ac_build_endif(&ctx->ac, 7002); + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7002); + } + + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + + LLVMValueRef temp_ptr, value; + int idx = 0; + unsigned comp = 0; + LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1])); + int writemask = instr->const_index[0]; + LLVMValueRef indir_index; + unsigned const_index; + + if (var) { + get_deref_offset(ctx, deref, false, NULL, NULL, &const_index, &indir_index); + idx = var->data.driver_location; + comp = var->data.location_frac; + + if (var->data.compact) { + const_index += comp; + comp = 0; + } + } + + if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64 && + (deref->mode == nir_var_shader_out || deref->mode == nir_var_function_temp)) { + + src = LLVMBuildBitCast(ctx->ac.builder, src, + LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), ""); + + writemask = widen_mask(writemask, 2); + } + + writemask = writemask << comp; + + switch (deref->mode) { + case nir_var_shader_out: + /* TODO: remove this after RADV switches to lowered IO */ + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + LLVMValueRef vertex_index = NULL; + LLVMValueRef indir_index = NULL; + unsigned const_index = 0; + const bool is_patch = var->data.patch || + var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || + var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER; + + 
get_deref_offset(ctx, deref, false, NULL, is_patch ? NULL : &vertex_index, &const_index, + &indir_index); + + ctx->abi->store_tcs_outputs(ctx->abi, var, vertex_index, indir_index, const_index, src, + writemask, var->data.location_frac, var->data.driver_location); + break; + } + + for (unsigned chan = 0; chan < 8; chan++) { + int stride = 4; + if (!(writemask & (1 << chan))) + continue; + + value = ac_llvm_extract_elem(&ctx->ac, src, chan - comp); + + if (var->data.compact) + stride = 1; + if (indir_index) { + unsigned count = glsl_count_attribute_slots(var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->abi->outputs + idx + chan, count, stride, true, true); + + tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, value, indir_index, ""); + build_store_values_extended(&ctx->ac, ctx->abi->outputs + idx + chan, count, stride, + tmp_vec); + + } else { + temp_ptr = ctx->abi->outputs[idx + chan + const_index * stride]; + + LLVMBuildStore(ctx->ac.builder, value, temp_ptr); + } + } + break; + case nir_var_function_temp: + for (unsigned chan = 0; chan < 8; chan++) { + if (!(writemask & (1 << chan))) + continue; + + value = ac_llvm_extract_elem(&ctx->ac, src, chan); + if (indir_index) { + unsigned count = glsl_count_attribute_slots(var->type, false); + count -= chan / 4; + LLVMValueRef tmp_vec = ac_build_gather_values_extended( + &ctx->ac, ctx->locals + idx + chan, count, 4, true, true); + + tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, value, indir_index, ""); + build_store_values_extended(&ctx->ac, ctx->locals + idx + chan, count, 4, tmp_vec); + } else { + temp_ptr = ctx->locals[idx + chan + const_index * 4]; + + LLVMBuildStore(ctx->ac.builder, value, temp_ptr); + } + } + break; + + case nir_var_mem_global: { + int writemask = instr->const_index[0]; + LLVMValueRef address = get_src(ctx, instr->src[0]); + LLVMValueRef val = get_src(ctx, instr->src[1]); + + unsigned explicit_stride = glsl_get_explicit_stride(deref->type); + unsigned natural_stride = type_scalar_size_bytes(deref->type); + unsigned stride = explicit_stride ? 
explicit_stride : natural_stride; + int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(val)) / 8; + bool split_stores = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4; + + LLVMTypeRef ptr_type = + LLVMPointerType(LLVMTypeOf(val), LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type, ""); + + if (writemask == (1u << ac_get_llvm_num_components(val)) - 1 && stride == natural_stride && + !split_stores) { + LLVMTypeRef ptr_type = + LLVMPointerType(LLVMTypeOf(val), LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type, ""); + + val = LLVMBuildBitCast(ctx->ac.builder, val, LLVMGetElementType(LLVMTypeOf(address)), ""); + LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, val, address); + + if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) + LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic); + } else { + LLVMTypeRef val_type = LLVMTypeOf(val); + if (LLVMGetTypeKind(LLVMTypeOf(val)) == LLVMVectorTypeKind) + val_type = LLVMGetElementType(val_type); + + LLVMTypeRef ptr_type = + LLVMPointerType(val_type, LLVMGetPointerAddressSpace(LLVMTypeOf(address))); + address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type, ""); + for (unsigned chan = 0; chan < 4; chan++) { + if (!(writemask & (1 << chan))) + continue; + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, chan * stride / natural_stride, 0); + + LLVMValueRef ptr = ac_build_gep_ptr(&ctx->ac, address, offset); + LLVMValueRef src = ac_llvm_extract_elem(&ctx->ac, val, chan); + src = LLVMBuildBitCast(ctx->ac.builder, src, LLVMGetElementType(LLVMTypeOf(ptr)), ""); + LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, src, ptr); + + if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE)) + LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic); + } + } + break; + } + default: + abort(); + break; + } + + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7002); } -static void -visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) +static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - if (ctx->ac.postponed_kill) { - LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, - ctx->ac.postponed_kill, ""); - ac_build_ifcc(&ctx->ac, cond, 7002); - } - - unsigned base = nir_intrinsic_base(instr); - unsigned writemask = nir_intrinsic_write_mask(instr); - unsigned component = nir_intrinsic_component(instr); - LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0])); - nir_src offset = *nir_get_io_offset_src(instr); - LLVMValueRef indir_index = NULL; - - if (nir_src_is_const(offset)) - assert(nir_src_as_uint(offset) == 0); - else - indir_index = get_src(ctx, offset); - - switch (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src))) { - case 32: - break; - case 64: - writemask = widen_mask(writemask, 2); - src = LLVMBuildBitCast(ctx->ac.builder, src, - LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), - ""); - break; - default: - unreachable("unhandled store_output bit size"); - return; - } - - writemask <<= component; - - if (ctx->stage == MESA_SHADER_TESS_CTRL) { - nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); - LLVMValueRef vertex_index = - vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL; - - ctx->abi->store_tcs_outputs(ctx->abi, NULL, - vertex_index, indir_index, - 0, src, writemask, - component, base * 4); - return; - } - - /* No indirect indexing is allowed after this point. 
*/ - assert(!indir_index); - - for (unsigned chan = 0; chan < 8; chan++) { - if (!(writemask & (1 << chan))) - continue; - - LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); - LLVMBuildStore(ctx->ac.builder, value, - ctx->abi->outputs[base * 4 + chan]); - } - - if (ctx->ac.postponed_kill) - ac_build_endif(&ctx->ac, 7002); + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7002); + } + + unsigned base = nir_intrinsic_base(instr); + unsigned writemask = nir_intrinsic_write_mask(instr); + unsigned component = nir_intrinsic_component(instr); + LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0])); + nir_src offset = *nir_get_io_offset_src(instr); + LLVMValueRef indir_index = NULL; + + if (nir_src_is_const(offset)) + assert(nir_src_as_uint(offset) == 0); + else + indir_index = get_src(ctx, offset); + + switch (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src))) { + case 32: + break; + case 64: + writemask = widen_mask(writemask, 2); + src = LLVMBuildBitCast(ctx->ac.builder, src, + LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), ""); + break; + default: + unreachable("unhandled store_output bit size"); + return; + } + + writemask <<= component; + + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); + LLVMValueRef vertex_index = vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL; + + ctx->abi->store_tcs_outputs(ctx->abi, NULL, vertex_index, indir_index, 0, src, writemask, + component, base * 4); + return; + } + + /* No indirect indexing is allowed after this point. */ + assert(!indir_index); + + for (unsigned chan = 0; chan < 8; chan++) { + if (!(writemask & (1 << chan))) + continue; + + LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); + LLVMBuildStore(ctx->ac.builder, value, ctx->abi->outputs[base * 4 + chan]); + } + + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7002); } static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array) { - switch (dim) { - case GLSL_SAMPLER_DIM_BUF: - return 1; - case GLSL_SAMPLER_DIM_1D: - return array ? 2 : 1; - case GLSL_SAMPLER_DIM_2D: - return array ? 3 : 2; - case GLSL_SAMPLER_DIM_MS: - return array ? 4 : 3; - case GLSL_SAMPLER_DIM_3D: - case GLSL_SAMPLER_DIM_CUBE: - return 3; - case GLSL_SAMPLER_DIM_RECT: - case GLSL_SAMPLER_DIM_SUBPASS: - return 2; - case GLSL_SAMPLER_DIM_SUBPASS_MS: - return 3; - default: - break; - } - return 0; + switch (dim) { + case GLSL_SAMPLER_DIM_BUF: + return 1; + case GLSL_SAMPLER_DIM_1D: + return array ? 2 : 1; + case GLSL_SAMPLER_DIM_2D: + return array ? 3 : 2; + case GLSL_SAMPLER_DIM_MS: + return array ? 4 : 3; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + return 3; + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_SUBPASS: + return 2; + case GLSL_SAMPLER_DIM_SUBPASS_MS: + return 3; + default: + break; + } + return 0; } static LLVMValueRef adjust_sample_index_using_fmask(struct ac_llvm_context *ctx, - LLVMValueRef coord_x, LLVMValueRef coord_y, - LLVMValueRef coord_z, - LLVMValueRef sample_index, - LLVMValueRef fmask_desc_ptr) + LLVMValueRef coord_x, LLVMValueRef coord_y, + LLVMValueRef coord_z, LLVMValueRef sample_index, + LLVMValueRef fmask_desc_ptr) { - unsigned sample_chan = coord_z ? 3 : 2; - LLVMValueRef addr[4] = {coord_x, coord_y, coord_z}; - addr[sample_chan] = sample_index; + unsigned sample_chan = coord_z ? 
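/* [Editor's note: illustrative aside, not part of the upstream patch. For
 * multisampled images the FMASK texel stores a small per-pixel table that
 * maps logical sample indices to the physical fragment slots actually
 * holding the color data. Conceptually, ac_apply_fmask_to_sample() performs
 *
 *    physical = (fmask >> (4 * logical)) & 0xF;
 *
 * and the helper below substitutes that remapped value for the raw sample
 * index in the address vector.] */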
3 : 2; + LLVMValueRef addr[4] = {coord_x, coord_y, coord_z}; + addr[sample_chan] = sample_index; - ac_apply_fmask_to_sample(ctx, fmask_desc_ptr, addr, coord_z != NULL); - return addr[sample_chan]; + ac_apply_fmask_to_sample(ctx, fmask_desc_ptr, addr, coord_z != NULL); + return addr[sample_chan]; } static nir_deref_instr *get_image_deref(const nir_intrinsic_instr *instr) { - assert(instr->src[0].is_ssa); - return nir_instr_as_deref(instr->src[0].ssa->parent_instr); + assert(instr->src[0].is_ssa); + return nir_instr_as_deref(instr->src[0].ssa->parent_instr); } static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, LLVMValueRef dynamic_index, - enum ac_descriptor_type desc_type, - bool write) + enum ac_descriptor_type desc_type, bool write) { - nir_deref_instr *deref_instr = - instr->src[0].ssa->parent_instr->type == nir_instr_type_deref ? - nir_instr_as_deref(instr->src[0].ssa->parent_instr) : NULL; + nir_deref_instr *deref_instr = instr->src[0].ssa->parent_instr->type == nir_instr_type_deref + ? nir_instr_as_deref(instr->src[0].ssa->parent_instr) + : NULL; - return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, dynamic_index, true, write); + return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, dynamic_index, true, write); } -static void get_image_coords(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - LLVMValueRef dynamic_desc_index, - struct ac_image_args *args, - enum glsl_sampler_dim dim, - bool is_array) +static void get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, + LLVMValueRef dynamic_desc_index, struct ac_image_args *args, + enum glsl_sampler_dim dim, bool is_array) { - LLVMValueRef src0 = get_src(ctx, instr->src[1]); - LLVMValueRef masks[] = { - LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false), - LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), - }; - LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); - - int count; - ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || - dim == GLSL_SAMPLER_DIM_SUBPASS_MS); - bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || - dim == GLSL_SAMPLER_DIM_SUBPASS_MS); - bool gfx9_1d = ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D; - assert(!add_frag_pos && "Input attachments should be lowered by this point."); - count = image_type_to_components_count(dim, is_array); - - if (is_ms && (instr->intrinsic == nir_intrinsic_image_deref_load || - instr->intrinsic == nir_intrinsic_bindless_image_load)) { - LLVMValueRef fmask_load_address[3]; - - fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); - fmask_load_address[1] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[1], ""); - if (is_array) - fmask_load_address[2] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[2], ""); - else - fmask_load_address[2] = NULL; - - sample_index = adjust_sample_index_using_fmask(&ctx->ac, - fmask_load_address[0], - fmask_load_address[1], - fmask_load_address[2], - sample_index, - get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), - AC_DESC_FMASK, &instr->instr, dynamic_desc_index, true, false)); - } - if (count == 1 && !gfx9_1d) { - if (instr->src[1].ssa->num_components) - args->coords[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); - else - args->coords[0] = src0; - } else { - int chan; - if (is_ms) - count--; - for (chan = 0; chan < count; ++chan) { - 
args->coords[chan] = ac_llvm_extract_elem(&ctx->ac, src0, chan); - } - - if (gfx9_1d) { - if (is_array) { - args->coords[2] = args->coords[1]; - args->coords[1] = ctx->ac.i32_0; - } else - args->coords[1] = ctx->ac.i32_0; - count++; - } - if (ctx->ac.chip_class == GFX9 && - dim == GLSL_SAMPLER_DIM_2D && - !is_array) { - /* The hw can't bind a slice of a 3D image as a 2D - * image, because it ignores BASE_ARRAY if the target - * is 3D. The workaround is to read BASE_ARRAY and set - * it as the 3rd address operand for all 2D images. - */ - LLVMValueRef first_layer, const5, mask; - - const5 = LLVMConstInt(ctx->ac.i32, 5, 0); - mask = LLVMConstInt(ctx->ac.i32, S_008F24_BASE_ARRAY(~0), 0); - first_layer = LLVMBuildExtractElement(ctx->ac.builder, args->resource, const5, ""); - first_layer = LLVMBuildAnd(ctx->ac.builder, first_layer, mask, ""); - - args->coords[count] = first_layer; - count++; - } - - - if (is_ms) { - args->coords[count] = sample_index; - count++; - } - } + LLVMValueRef src0 = get_src(ctx, instr->src[1]); + LLVMValueRef masks[] = { + LLVMConstInt(ctx->ac.i32, 0, false), + LLVMConstInt(ctx->ac.i32, 1, false), + LLVMConstInt(ctx->ac.i32, 2, false), + LLVMConstInt(ctx->ac.i32, 3, false), + }; + LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); + + int count; + ASSERTED bool add_frag_pos = + (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + bool gfx9_1d = ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D; + assert(!add_frag_pos && "Input attachments should be lowered by this point."); + count = image_type_to_components_count(dim, is_array); + + if (is_ms && (instr->intrinsic == nir_intrinsic_image_deref_load || + instr->intrinsic == nir_intrinsic_bindless_image_load)) { + LLVMValueRef fmask_load_address[3]; + + fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); + fmask_load_address[1] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[1], ""); + if (is_array) + fmask_load_address[2] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[2], ""); + else + fmask_load_address[2] = NULL; + + sample_index = adjust_sample_index_using_fmask( + &ctx->ac, fmask_load_address[0], fmask_load_address[1], fmask_load_address[2], + sample_index, + get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), AC_DESC_FMASK, + &instr->instr, dynamic_desc_index, true, false)); + } + if (count == 1 && !gfx9_1d) { + if (instr->src[1].ssa->num_components) + args->coords[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); + else + args->coords[0] = src0; + } else { + int chan; + if (is_ms) + count--; + for (chan = 0; chan < count; ++chan) { + args->coords[chan] = ac_llvm_extract_elem(&ctx->ac, src0, chan); + } + + if (gfx9_1d) { + if (is_array) { + args->coords[2] = args->coords[1]; + args->coords[1] = ctx->ac.i32_0; + } else + args->coords[1] = ctx->ac.i32_0; + count++; + } + if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_2D && !is_array) { + /* The hw can't bind a slice of a 3D image as a 2D + * image, because it ignores BASE_ARRAY if the target + * is 3D. The workaround is to read BASE_ARRAY and set + * it as the 3rd address operand for all 2D images. 
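 * [Editor's note, not part of the upstream comment: BASE_ARRAY lives in
 * dword 5 of the image descriptor, which is why the code below extracts
 * element 5 of args->resource and masks it with S_008F24_BASE_ARRAY(~0)
 * before appending it as the extra address operand.]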
+ */ + LLVMValueRef first_layer, const5, mask; + + const5 = LLVMConstInt(ctx->ac.i32, 5, 0); + mask = LLVMConstInt(ctx->ac.i32, S_008F24_BASE_ARRAY(~0), 0); + first_layer = LLVMBuildExtractElement(ctx->ac.builder, args->resource, const5, ""); + first_layer = LLVMBuildAnd(ctx->ac.builder, first_layer, mask, ""); + + args->coords[count] = first_layer; + count++; + } + + if (is_ms) { + args->coords[count] = sample_index; + count++; + } + } } static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, - LLVMValueRef dynamic_index, - bool write, bool atomic) + LLVMValueRef dynamic_index, bool write, bool atomic) { - LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, write); - if (ctx->ac.chip_class == GFX9 && LLVM_VERSION_MAJOR < 9 && atomic) { - LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), ""); - stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, 0), ""); - - LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->ac.builder, - LLVMBuildICmp(ctx->ac.builder, LLVMIntUGT, elem_count, stride, ""), - elem_count, stride, ""); - - rsrc = LLVMBuildInsertElement(ctx->ac.builder, rsrc, new_elem_count, - LLVMConstInt(ctx->ac.i32, 2, 0), ""); - } - return rsrc; + LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, write); + if (ctx->ac.chip_class == GFX9 && LLVM_VERSION_MAJOR < 9 && atomic) { + LLVMValueRef elem_count = + LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + LLVMValueRef stride = + LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), ""); + stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, 0), ""); + + LLVMValueRef new_elem_count = LLVMBuildSelect( + ctx->ac.builder, LLVMBuildICmp(ctx->ac.builder, LLVMIntUGT, elem_count, stride, ""), + elem_count, stride, ""); + + rsrc = LLVMBuildInsertElement(ctx->ac.builder, rsrc, new_elem_count, + LLVMConstInt(ctx->ac.i32, 2, 0), ""); + } + return rsrc; } static LLVMValueRef enter_waterfall_image(struct ac_nir_context *ctx, - struct waterfall_context *wctx, - const nir_intrinsic_instr *instr) + struct waterfall_context *wctx, + const nir_intrinsic_instr *instr) { - nir_deref_instr *deref_instr = NULL; + nir_deref_instr *deref_instr = NULL; - if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref) - deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref) + deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr); - LLVMValueRef value = get_sampler_desc_index(ctx, deref_instr, &instr->instr, true); - return enter_waterfall(ctx, wctx, value, nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM); + LLVMValueRef value = get_sampler_desc_index(ctx, deref_instr, &instr->instr, true); + return enter_waterfall(ctx, wctx, value, nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM); } -static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - bool bindless) +static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, + bool bindless) { - LLVMValueRef res; - - enum glsl_sampler_dim dim; - enum gl_access_qualifier access = nir_intrinsic_access(instr); - bool is_array; - if (bindless) { - dim = 
nir_intrinsic_image_dim(instr); - is_array = nir_intrinsic_image_array(instr); - } else { - const nir_deref_instr *image_deref = get_image_deref(instr); - const struct glsl_type *type = image_deref->type; - const nir_variable *var = nir_deref_instr_get_variable(image_deref); - dim = glsl_get_sampler_dim(type); - access |= var->data.access; - is_array = glsl_sampler_type_is_array(type); - } - - struct waterfall_context wctx; - LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); - - struct ac_image_args args = {}; - - args.cache_policy = get_cache_policy(ctx, access, false, false); - - if (dim == GLSL_SAMPLER_DIM_BUF) { - unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); - unsigned num_channels = util_last_bit(mask); - LLVMValueRef rsrc, vindex; - - rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, false, false); - vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), - ctx->ac.i32_0, ""); - - assert(instr->dest.is_ssa); - bool can_speculate = access & ACCESS_CAN_REORDER; - res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, - ctx->ac.i32_0, num_channels, - args.cache_policy, - can_speculate, - instr->dest.ssa.bit_size == 16); - res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels); - - res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components); - res = ac_to_integer(&ctx->ac, res); - } else { - bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; - - args.opcode = level_zero ? ac_image_load : ac_image_load_mip; - args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); - get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); - args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); - if (!level_zero) - args.lod = get_src(ctx, instr->src[3]); - args.dmask = 15; - args.attributes = AC_FUNC_ATTR_READONLY; - - assert(instr->dest.is_ssa); - args.d16 = instr->dest.ssa.bit_size == 16; - - res = ac_build_image_opcode(&ctx->ac, &args); - } - return exit_waterfall(ctx, &wctx, res); + LLVMValueRef res; + + enum glsl_sampler_dim dim; + enum gl_access_qualifier access = nir_intrinsic_access(instr); + bool is_array; + if (bindless) { + dim = nir_intrinsic_image_dim(instr); + is_array = nir_intrinsic_image_array(instr); + } else { + const nir_deref_instr *image_deref = get_image_deref(instr); + const struct glsl_type *type = image_deref->type; + const nir_variable *var = nir_deref_instr_get_variable(image_deref); + dim = glsl_get_sampler_dim(type); + access |= var->data.access; + is_array = glsl_sampler_type_is_array(type); + } + + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + + struct ac_image_args args = {}; + + args.cache_policy = get_cache_policy(ctx, access, false, false); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); + unsigned num_channels = util_last_bit(mask); + LLVMValueRef rsrc, vindex; + + rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, false, false); + vindex = + LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, ""); + + assert(instr->dest.is_ssa); + bool can_speculate = access & ACCESS_CAN_REORDER; + res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, ctx->ac.i32_0, num_channels, + args.cache_policy, can_speculate, + instr->dest.ssa.bit_size == 16); + res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels); + + res = ac_trim_vector(&ctx->ac, res, 
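/* [Editor's note: illustrative aside, not part of the upstream patch. The
 * buffer path sizes the load from the components that are actually read:
 * if a shader reads only .x and .z, nir_ssa_def_components_read() returns
 * 0b0101 and util_last_bit() gives num_channels == 3. The raw result is
 * padded to a vec4 above and trimmed back to the destination width here.] */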
instr->dest.ssa.num_components); + res = ac_to_integer(&ctx->ac, res); + } else { + bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; + + args.opcode = level_zero ? ac_image_load : ac_image_load_mip; + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); + get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + if (!level_zero) + args.lod = get_src(ctx, instr->src[3]); + args.dmask = 15; + args.attributes = AC_FUNC_ATTR_READONLY; + + assert(instr->dest.is_ssa); + args.d16 = instr->dest.ssa.bit_size == 16; + + res = ac_build_image_opcode(&ctx->ac, &args); + } + return exit_waterfall(ctx, &wctx, res); } -static void visit_image_store(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - bool bindless) +static void visit_image_store(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, + bool bindless) { - if (ctx->ac.postponed_kill) { - LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, - ctx->ac.postponed_kill, ""); - ac_build_ifcc(&ctx->ac, cond, 7003); - } - - enum glsl_sampler_dim dim; - enum gl_access_qualifier access = nir_intrinsic_access(instr); - bool is_array; - - if (bindless) { - dim = nir_intrinsic_image_dim(instr); - is_array = nir_intrinsic_image_array(instr); - } else { - const nir_deref_instr *image_deref = get_image_deref(instr); - const struct glsl_type *type = image_deref->type; - const nir_variable *var = nir_deref_instr_get_variable(image_deref); - dim = glsl_get_sampler_dim(type); - access |= var->data.access; - is_array = glsl_sampler_type_is_array(type); - } - - struct waterfall_context wctx; - LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); - - bool writeonly_memory = access & ACCESS_NON_READABLE; - struct ac_image_args args = {}; - - args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory); - - if (dim == GLSL_SAMPLER_DIM_BUF) { - LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, false); - LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); - unsigned src_channels = ac_get_llvm_num_components(src); - LLVMValueRef vindex; - - if (src_channels == 3) - src = ac_build_expand_to_vec4(&ctx->ac, src, 3); - - vindex = LLVMBuildExtractElement(ctx->ac.builder, - get_src(ctx, instr->src[1]), - ctx->ac.i32_0, ""); - - ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, - ctx->ac.i32_0, args.cache_policy); - } else { - bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; - - args.opcode = level_zero ? 
ac_image_store : ac_image_store_mip; - args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); - args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true); - get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); - args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); - if (!level_zero) - args.lod = get_src(ctx, instr->src[4]); - args.dmask = 15; - args.d16 = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.data[0])) == 16; - - ac_build_image_opcode(&ctx->ac, &args); - } - - exit_waterfall(ctx, &wctx, NULL); - if (ctx->ac.postponed_kill) - ac_build_endif(&ctx->ac, 7003); + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7003); + } + + enum glsl_sampler_dim dim; + enum gl_access_qualifier access = nir_intrinsic_access(instr); + bool is_array; + + if (bindless) { + dim = nir_intrinsic_image_dim(instr); + is_array = nir_intrinsic_image_array(instr); + } else { + const nir_deref_instr *image_deref = get_image_deref(instr); + const struct glsl_type *type = image_deref->type; + const nir_variable *var = nir_deref_instr_get_variable(image_deref); + dim = glsl_get_sampler_dim(type); + access |= var->data.access; + is_array = glsl_sampler_type_is_array(type); + } + + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + + bool writeonly_memory = access & ACCESS_NON_READABLE; + struct ac_image_args args = {}; + + args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, false); + LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); + unsigned src_channels = ac_get_llvm_num_components(src); + LLVMValueRef vindex; + + if (src_channels == 3) + src = ac_build_expand_to_vec4(&ctx->ac, src, 3); + + vindex = + LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, ""); + + ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, ctx->ac.i32_0, args.cache_policy); + } else { + bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; + + args.opcode = level_zero ? 
ac_image_store : ac_image_store_mip; + args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true); + get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + if (!level_zero) + args.lod = get_src(ctx, instr->src[4]); + args.dmask = 15; + args.d16 = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.data[0])) == 16; + + ac_build_image_opcode(&ctx->ac, &args); + } + + exit_waterfall(ctx, &wctx, NULL); + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7003); } -static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - bool bindless) +static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, + bool bindless) { - if (ctx->ac.postponed_kill) { - LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, - ctx->ac.postponed_kill, ""); - ac_build_ifcc(&ctx->ac, cond, 7004); - } - - LLVMValueRef params[7]; - int param_count = 0; - - bool cmpswap = instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap || - instr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap; - const char *atomic_name; - char intrinsic_name[64]; - enum ac_atomic_op atomic_subop; - ASSERTED int length; - - enum glsl_sampler_dim dim; - bool is_array; - if (bindless) { - if (instr->intrinsic == nir_intrinsic_bindless_image_atomic_imin || - instr->intrinsic == nir_intrinsic_bindless_image_atomic_umin || - instr->intrinsic == nir_intrinsic_bindless_image_atomic_imax || - instr->intrinsic == nir_intrinsic_bindless_image_atomic_umax) { - ASSERTED const GLenum format = nir_intrinsic_format(instr); - assert(format == GL_R32UI || format == GL_R32I); - } - dim = nir_intrinsic_image_dim(instr); - is_array = nir_intrinsic_image_array(instr); - } else { - const struct glsl_type *type = get_image_deref(instr)->type; - dim = glsl_get_sampler_dim(type); - is_array = glsl_sampler_type_is_array(type); - } - - struct waterfall_context wctx; - LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); - - switch (instr->intrinsic) { - case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_image_deref_atomic_add: - atomic_name = "add"; - atomic_subop = ac_atomic_add; - break; - case nir_intrinsic_bindless_image_atomic_imin: - case nir_intrinsic_image_deref_atomic_imin: - atomic_name = "smin"; - atomic_subop = ac_atomic_smin; - break; - case nir_intrinsic_bindless_image_atomic_umin: - case nir_intrinsic_image_deref_atomic_umin: - atomic_name = "umin"; - atomic_subop = ac_atomic_umin; - break; - case nir_intrinsic_bindless_image_atomic_imax: - case nir_intrinsic_image_deref_atomic_imax: - atomic_name = "smax"; - atomic_subop = ac_atomic_smax; - break; - case nir_intrinsic_bindless_image_atomic_umax: - case nir_intrinsic_image_deref_atomic_umax: - atomic_name = "umax"; - atomic_subop = ac_atomic_umax; - break; - case nir_intrinsic_bindless_image_atomic_and: - case nir_intrinsic_image_deref_atomic_and: - atomic_name = "and"; - atomic_subop = ac_atomic_and; - break; - case nir_intrinsic_bindless_image_atomic_or: - case nir_intrinsic_image_deref_atomic_or: - atomic_name = "or"; - atomic_subop = ac_atomic_or; - break; - case nir_intrinsic_bindless_image_atomic_xor: - case nir_intrinsic_image_deref_atomic_xor: - atomic_name = "xor"; - atomic_subop = ac_atomic_xor; - break; - case nir_intrinsic_bindless_image_atomic_exchange: - case 
nir_intrinsic_image_deref_atomic_exchange: - atomic_name = "swap"; - atomic_subop = ac_atomic_swap; - break; - case nir_intrinsic_bindless_image_atomic_comp_swap: - case nir_intrinsic_image_deref_atomic_comp_swap: - atomic_name = "cmpswap"; - atomic_subop = 0; /* not used */ - break; - case nir_intrinsic_bindless_image_atomic_inc_wrap: - case nir_intrinsic_image_deref_atomic_inc_wrap: { - atomic_name = "inc"; - atomic_subop = ac_atomic_inc_wrap; - break; - } - case nir_intrinsic_bindless_image_atomic_dec_wrap: - case nir_intrinsic_image_deref_atomic_dec_wrap: - atomic_name = "dec"; - atomic_subop = ac_atomic_dec_wrap; - break; - default: - abort(); - } - - if (cmpswap) - params[param_count++] = get_src(ctx, instr->src[4]); - params[param_count++] = get_src(ctx, instr->src[3]); - - LLVMValueRef result; - if (dim == GLSL_SAMPLER_DIM_BUF) { - params[param_count++] = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, true); - params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), - ctx->ac.i32_0, ""); /* vindex */ - params[param_count++] = ctx->ac.i32_0; /* voffset */ - if (LLVM_VERSION_MAJOR >= 9) { - /* XXX: The new raw/struct atomic intrinsics are buggy - * with LLVM 8, see r358579. - */ - params[param_count++] = ctx->ac.i32_0; /* soffset */ - params[param_count++] = ctx->ac.i32_0; /* slc */ - - length = snprintf(intrinsic_name, sizeof(intrinsic_name), - "llvm.amdgcn.struct.buffer.atomic.%s.i32", atomic_name); - } else { - params[param_count++] = ctx->ac.i1false; /* slc */ - - length = snprintf(intrinsic_name, sizeof(intrinsic_name), - "llvm.amdgcn.buffer.atomic.%s", atomic_name); - } - - assert(length < sizeof(intrinsic_name)); - result = ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32, - params, param_count, 0); - } else { - struct ac_image_args args = {}; - args.opcode = cmpswap ? 
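/* [Editor's note: illustrative aside, not part of the upstream patch. In the
 * buffer branch above, the snprintf() expands to intrinsic names such as
 *
 *    llvm.amdgcn.struct.buffer.atomic.add.i32   (LLVM >= 9)
 *    llvm.amdgcn.buffer.atomic.add              (older LLVM)
 *
 * for atomic_name == "add"; the assert on `length` only guards against
 * truncation of the 64-byte intrinsic_name buffer.] */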
ac_image_atomic_cmpswap : ac_image_atomic; - args.atomic = atomic_subop; - args.data[0] = params[0]; - if (cmpswap) - args.data[1] = params[1]; - args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true); - get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); - args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); - - result = ac_build_image_opcode(&ctx->ac, &args); - } - - result = exit_waterfall(ctx, &wctx, result); - if (ctx->ac.postponed_kill) - ac_build_endif(&ctx->ac, 7004); - return result; + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7004); + } + + LLVMValueRef params[7]; + int param_count = 0; + + bool cmpswap = instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap; + const char *atomic_name; + char intrinsic_name[64]; + enum ac_atomic_op atomic_subop; + ASSERTED int length; + + enum glsl_sampler_dim dim; + bool is_array; + if (bindless) { + if (instr->intrinsic == nir_intrinsic_bindless_image_atomic_imin || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_umin || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_imax || + instr->intrinsic == nir_intrinsic_bindless_image_atomic_umax) { + ASSERTED const GLenum format = nir_intrinsic_format(instr); + assert(format == GL_R32UI || format == GL_R32I); + } + dim = nir_intrinsic_image_dim(instr); + is_array = nir_intrinsic_image_array(instr); + } else { + const struct glsl_type *type = get_image_deref(instr)->type; + dim = glsl_get_sampler_dim(type); + is_array = glsl_sampler_type_is_array(type); + } + + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + + switch (instr->intrinsic) { + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_image_deref_atomic_add: + atomic_name = "add"; + atomic_subop = ac_atomic_add; + break; + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_image_deref_atomic_imin: + atomic_name = "smin"; + atomic_subop = ac_atomic_smin; + break; + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_image_deref_atomic_umin: + atomic_name = "umin"; + atomic_subop = ac_atomic_umin; + break; + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_image_deref_atomic_imax: + atomic_name = "smax"; + atomic_subop = ac_atomic_smax; + break; + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_image_deref_atomic_umax: + atomic_name = "umax"; + atomic_subop = ac_atomic_umax; + break; + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_image_deref_atomic_and: + atomic_name = "and"; + atomic_subop = ac_atomic_and; + break; + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_image_deref_atomic_or: + atomic_name = "or"; + atomic_subop = ac_atomic_or; + break; + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_image_deref_atomic_xor: + atomic_name = "xor"; + atomic_subop = ac_atomic_xor; + break; + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_image_deref_atomic_exchange: + atomic_name = "swap"; + atomic_subop = ac_atomic_swap; + break; + case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_comp_swap: + atomic_name = "cmpswap"; + atomic_subop = 0; /* not used */ + break; + case nir_intrinsic_bindless_image_atomic_inc_wrap: + case 
nir_intrinsic_image_deref_atomic_inc_wrap: { + atomic_name = "inc"; + atomic_subop = ac_atomic_inc_wrap; + break; + } + case nir_intrinsic_bindless_image_atomic_dec_wrap: + case nir_intrinsic_image_deref_atomic_dec_wrap: + atomic_name = "dec"; + atomic_subop = ac_atomic_dec_wrap; + break; + default: + abort(); + } + + if (cmpswap) + params[param_count++] = get_src(ctx, instr->src[4]); + params[param_count++] = get_src(ctx, instr->src[3]); + + LLVMValueRef result; + if (dim == GLSL_SAMPLER_DIM_BUF) { + params[param_count++] = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, true); + params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), + ctx->ac.i32_0, ""); /* vindex */ + params[param_count++] = ctx->ac.i32_0; /* voffset */ + if (LLVM_VERSION_MAJOR >= 9) { + /* XXX: The new raw/struct atomic intrinsics are buggy + * with LLVM 8, see r358579. + */ + params[param_count++] = ctx->ac.i32_0; /* soffset */ + params[param_count++] = ctx->ac.i32_0; /* slc */ + + length = snprintf(intrinsic_name, sizeof(intrinsic_name), + "llvm.amdgcn.struct.buffer.atomic.%s.i32", atomic_name); + } else { + params[param_count++] = ctx->ac.i1false; /* slc */ + + length = snprintf(intrinsic_name, sizeof(intrinsic_name), "llvm.amdgcn.buffer.atomic.%s", + atomic_name); + } + + assert(length < sizeof(intrinsic_name)); + result = ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32, params, param_count, 0); + } else { + struct ac_image_args args = {}; + args.opcode = cmpswap ? ac_image_atomic_cmpswap : ac_image_atomic; + args.atomic = atomic_subop; + args.data[0] = params[0]; + if (cmpswap) + args.data[1] = params[1]; + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true); + get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array); + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + + result = ac_build_image_opcode(&ctx->ac, &args); + } + + result = exit_waterfall(ctx, &wctx, result); + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7004); + return result; } -static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) +static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - struct waterfall_context wctx; - LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); - LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); - LLVMValueRef ret = ac_build_image_get_sample_count(&ctx->ac, rsrc); + LLVMValueRef ret = ac_build_image_get_sample_count(&ctx->ac, rsrc); - return exit_waterfall(ctx, &wctx, ret); + return exit_waterfall(ctx, &wctx, ret); } -static LLVMValueRef visit_image_size(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - bool bindless) +static LLVMValueRef visit_image_size(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, + bool bindless) { - LLVMValueRef res; - - enum glsl_sampler_dim dim; - bool is_array; - if (bindless) { - dim = nir_intrinsic_image_dim(instr); - is_array = nir_intrinsic_image_array(instr); - } else { - const struct glsl_type *type = get_image_deref(instr)->type; - dim = glsl_get_sampler_dim(type); - is_array = glsl_sampler_type_is_array(type); - } - - struct waterfall_context wctx; - LLVMValueRef dynamic_index = 
enter_waterfall_image(ctx, &wctx, instr); - - if (dim == GLSL_SAMPLER_DIM_BUF) { - res = get_buffer_size(ctx, get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, false), true); - } else { - - struct ac_image_args args = { 0 }; - - args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); - args.dmask = 0xf; - args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); - args.opcode = ac_image_get_resinfo; - assert(nir_src_as_uint(instr->src[1]) == 0); - args.lod = ctx->ac.i32_0; - args.attributes = AC_FUNC_ATTR_READNONE; - - res = ac_build_image_opcode(&ctx->ac, &args); - - LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); - - if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) { - LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); - LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); - z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); - res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, ""); - } - - if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) { - LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); - res = LLVMBuildInsertElement(ctx->ac.builder, res, layers, - ctx->ac.i32_1, ""); - } - } - return exit_waterfall(ctx, &wctx, res); + LLVMValueRef res; + + enum glsl_sampler_dim dim; + bool is_array; + if (bindless) { + dim = nir_intrinsic_image_dim(instr); + is_array = nir_intrinsic_image_array(instr); + } else { + const struct glsl_type *type = get_image_deref(instr)->type; + dim = glsl_get_sampler_dim(type); + is_array = glsl_sampler_type_is_array(type); + } + + struct waterfall_context wctx; + LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + res = get_buffer_size( + ctx, get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, false), true); + } else { + + struct ac_image_args args = {0}; + + args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array); + args.dmask = 0xf; + args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false); + args.opcode = ac_image_get_resinfo; + assert(nir_src_as_uint(instr->src[1]) == 0); + args.lod = ctx->ac.i32_0; + args.attributes = AC_FUNC_ATTR_READNONE; + + res = ac_build_image_opcode(&ctx->ac, &args); + + LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); + + if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) { + LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); + LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); + z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); + res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, ""); + } + + if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) { + LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, ""); + res = LLVMBuildInsertElement(ctx->ac.builder, res, layers, ctx->ac.i32_1, ""); + } + } + return exit_waterfall(ctx, &wctx, res); } -static void emit_membar(struct ac_llvm_context *ac, - const nir_intrinsic_instr *instr) +static void emit_membar(struct ac_llvm_context *ac, const nir_intrinsic_instr *instr) { - unsigned wait_flags = 0; - - switch (instr->intrinsic) { - case nir_intrinsic_memory_barrier: - case nir_intrinsic_group_memory_barrier: - wait_flags = AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE; - break; - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - wait_flags = AC_WAIT_VLOAD | AC_WAIT_VSTORE; - break; - case nir_intrinsic_memory_barrier_shared: - wait_flags = 
AC_WAIT_LGKM; - break; - default: - break; - } - - ac_build_waitcnt(ac, wait_flags); + unsigned wait_flags = 0; + + switch (instr->intrinsic) { + case nir_intrinsic_memory_barrier: + case nir_intrinsic_group_memory_barrier: + wait_flags = AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE; + break; + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + wait_flags = AC_WAIT_VLOAD | AC_WAIT_VSTORE; + break; + case nir_intrinsic_memory_barrier_shared: + wait_flags = AC_WAIT_LGKM; + break; + default: + break; + } + + ac_build_waitcnt(ac, wait_flags); } void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage) { - /* GFX6 only (thanks to a hw bug workaround): - * The real barrier instruction isn't needed, because an entire patch - * always fits into a single wave. - */ - if (ac->chip_class == GFX6 && stage == MESA_SHADER_TESS_CTRL) { - ac_build_waitcnt(ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); - return; - } - ac_build_s_barrier(ac); + /* GFX6 only (thanks to a hw bug workaround): + * The real barrier instruction isn't needed, because an entire patch + * always fits into a single wave. + */ + if (ac->chip_class == GFX6 && stage == MESA_SHADER_TESS_CTRL) { + ac_build_waitcnt(ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); + return; + } + ac_build_s_barrier(ac); } -static void emit_discard(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) +static void emit_discard(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { - LLVMValueRef cond; - - if (instr->intrinsic == nir_intrinsic_discard_if) { - cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - get_src(ctx, instr->src[0]), - ctx->ac.i32_0, ""); - } else { - assert(instr->intrinsic == nir_intrinsic_discard); - cond = ctx->ac.i1false; - } - - ac_build_kill_if_false(&ctx->ac, cond); + LLVMValueRef cond; + + if (instr->intrinsic == nir_intrinsic_discard_if) { + cond = + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, get_src(ctx, instr->src[0]), ctx->ac.i32_0, ""); + } else { + assert(instr->intrinsic == nir_intrinsic_discard); + cond = ctx->ac.i1false; + } + + ac_build_kill_if_false(&ctx->ac, cond); } -static void emit_demote(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) +static void emit_demote(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { - LLVMValueRef cond; - - if (instr->intrinsic == nir_intrinsic_demote_if) { - cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - get_src(ctx, instr->src[0]), - ctx->ac.i32_0, ""); - } else { - assert(instr->intrinsic == nir_intrinsic_demote); - cond = ctx->ac.i1false; - } - - /* Kill immediately while maintaining WQM. */ - ac_build_kill_if_false(&ctx->ac, ac_build_wqm_vote(&ctx->ac, cond)); - - LLVMValueRef mask = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, ""); - mask = LLVMBuildAnd(ctx->ac.builder, mask, cond, ""); - LLVMBuildStore(ctx->ac.builder, mask, ctx->ac.postponed_kill); - return; + LLVMValueRef cond; + + if (instr->intrinsic == nir_intrinsic_demote_if) { + cond = + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, get_src(ctx, instr->src[0]), ctx->ac.i32_0, ""); + } else { + assert(instr->intrinsic == nir_intrinsic_demote); + cond = ctx->ac.i1false; + } + + /* Kill immediately while maintaining WQM. 
*/ + ac_build_kill_if_false(&ctx->ac, ac_build_wqm_vote(&ctx->ac, cond)); + + LLVMValueRef mask = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, ""); + mask = LLVMBuildAnd(ctx->ac.builder, mask, cond, ""); + LLVMBuildStore(ctx->ac.builder, mask, ctx->ac.postponed_kill); + return; } -static LLVMValueRef -visit_load_local_invocation_index(struct ac_nir_context *ctx) +static LLVMValueRef visit_load_local_invocation_index(struct ac_nir_context *ctx) { - LLVMValueRef result; - LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac); - result = LLVMBuildAnd(ctx->ac.builder, - ac_get_arg(&ctx->ac, ctx->args->tg_size), - LLVMConstInt(ctx->ac.i32, 0xfc0, false), ""); + LLVMValueRef result; + LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac); + result = LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->tg_size), + LLVMConstInt(ctx->ac.i32, 0xfc0, false), ""); - if (ctx->ac.wave_size == 32) - result = LLVMBuildLShr(ctx->ac.builder, result, - LLVMConstInt(ctx->ac.i32, 1, false), ""); + if (ctx->ac.wave_size == 32) + result = LLVMBuildLShr(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 1, false), ""); - return LLVMBuildAdd(ctx->ac.builder, result, thread_id, ""); + return LLVMBuildAdd(ctx->ac.builder, result, thread_id, ""); } -static LLVMValueRef -visit_load_subgroup_id(struct ac_nir_context *ctx) +static LLVMValueRef visit_load_subgroup_id(struct ac_nir_context *ctx) { - if (ctx->stage == MESA_SHADER_COMPUTE) { - LLVMValueRef result; - result = LLVMBuildAnd(ctx->ac.builder, - ac_get_arg(&ctx->ac, ctx->args->tg_size), - LLVMConstInt(ctx->ac.i32, 0xfc0, false), ""); - return LLVMBuildLShr(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 6, false), ""); - } else { - return LLVMConstInt(ctx->ac.i32, 0, false); - } + if (ctx->stage == MESA_SHADER_COMPUTE) { + LLVMValueRef result; + result = LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->tg_size), + LLVMConstInt(ctx->ac.i32, 0xfc0, false), ""); + return LLVMBuildLShr(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 6, false), ""); + } else { + return LLVMConstInt(ctx->ac.i32, 0, false); + } } -static LLVMValueRef -visit_load_num_subgroups(struct ac_nir_context *ctx) +static LLVMValueRef visit_load_num_subgroups(struct ac_nir_context *ctx) { - if (ctx->stage == MESA_SHADER_COMPUTE) { - return LLVMBuildAnd(ctx->ac.builder, - ac_get_arg(&ctx->ac, ctx->args->tg_size), - LLVMConstInt(ctx->ac.i32, 0x3f, false), ""); - } else { - return LLVMConstInt(ctx->ac.i32, 1, false); - } + if (ctx->stage == MESA_SHADER_COMPUTE) { + return LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->tg_size), + LLVMConstInt(ctx->ac.i32, 0x3f, false), ""); + } else { + return LLVMConstInt(ctx->ac.i32, 1, false); + } } -static LLVMValueRef -visit_first_invocation(struct ac_nir_context *ctx) +static LLVMValueRef visit_first_invocation(struct ac_nir_context *ctx) { - LLVMValueRef active_set = ac_build_ballot(&ctx->ac, ctx->ac.i32_1); - const char *intr = ctx->ac.wave_size == 32 ? "llvm.cttz.i32" : "llvm.cttz.i64"; + LLVMValueRef active_set = ac_build_ballot(&ctx->ac, ctx->ac.i32_1); + const char *intr = ctx->ac.wave_size == 32 ? "llvm.cttz.i32" : "llvm.cttz.i64"; - /* The second argument is whether cttz(0) should be defined, but we do not care. */ - LLVMValueRef args[] = {active_set, ctx->ac.i1false}; - LLVMValueRef result = ac_build_intrinsic(&ctx->ac, intr, - ctx->ac.iN_wavemask, args, 2, - AC_FUNC_ATTR_NOUNWIND | - AC_FUNC_ATTR_READNONE); + /* The second argument is whether cttz(0) should be defined, but we do not care. 
*/ + LLVMValueRef args[] = {active_set, ctx->ac.i1false}; + LLVMValueRef result = ac_build_intrinsic(&ctx->ac, intr, ctx->ac.iN_wavemask, args, 2, + AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE); - return LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i32, ""); + return LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i32, ""); } -static LLVMValueRef -visit_load_shared(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) +static LLVMValueRef visit_load_shared(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { - LLVMValueRef values[4], derived_ptr, index, ret; + LLVMValueRef values[4], derived_ptr, index, ret; - LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], - instr->dest.ssa.bit_size); + LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], instr->dest.ssa.bit_size); - for (int chan = 0; chan < instr->num_components; chan++) { - index = LLVMConstInt(ctx->ac.i32, chan, 0); - derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); - values[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, ""); - } + for (int chan = 0; chan < instr->num_components; chan++) { + index = LLVMConstInt(ctx->ac.i32, chan, 0); + derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); + values[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, ""); + } - ret = ac_build_gather_values(&ctx->ac, values, instr->num_components); - return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); + ret = ac_build_gather_values(&ctx->ac, values, instr->num_components); + return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); } -static void -visit_store_shared(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr) +static void visit_store_shared(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { - LLVMValueRef derived_ptr, data,index; - LLVMBuilderRef builder = ctx->ac.builder; - - LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[1], - instr->src[0].ssa->bit_size); - LLVMValueRef src = get_src(ctx, instr->src[0]); - - int writemask = nir_intrinsic_write_mask(instr); - for (int chan = 0; chan < 4; chan++) { - if (!(writemask & (1 << chan))) { - continue; - } - data = ac_llvm_extract_elem(&ctx->ac, src, chan); - index = LLVMConstInt(ctx->ac.i32, chan, 0); - derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); - LLVMBuildStore(builder, data, derived_ptr); - } + LLVMValueRef derived_ptr, data, index; + LLVMBuilderRef builder = ctx->ac.builder; + + LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[1], instr->src[0].ssa->bit_size); + LLVMValueRef src = get_src(ctx, instr->src[0]); + + int writemask = nir_intrinsic_write_mask(instr); + for (int chan = 0; chan < 4; chan++) { + if (!(writemask & (1 << chan))) { + continue; + } + data = ac_llvm_extract_elem(&ctx->ac, src, chan); + index = LLVMConstInt(ctx->ac.i32, chan, 0); + derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); + LLVMBuildStore(builder, data, derived_ptr); + } } -static LLVMValueRef visit_var_atomic(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, - LLVMValueRef ptr, int src_idx) +static LLVMValueRef visit_var_atomic(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr, + LLVMValueRef ptr, int src_idx) { - if (ctx->ac.postponed_kill) { - LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, - ctx->ac.postponed_kill, ""); - ac_build_ifcc(&ctx->ac, cond, 7005); - } - - LLVMValueRef result; - LLVMValueRef src = get_src(ctx, instr->src[src_idx]); - - const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? 
"workgroup-one-as" : "workgroup"; - - if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref) { - nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); - if (deref->mode == nir_var_mem_global) { - /* use "singlethread" sync scope to implement relaxed ordering */ - sync_scope = LLVM_VERSION_MAJOR >= 9 ? "singlethread-one-as" : "singlethread"; - - LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(src), LLVMGetPointerAddressSpace(LLVMTypeOf(ptr))); - ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ptr_type , ""); - } - } - - if (instr->intrinsic == nir_intrinsic_shared_atomic_comp_swap || - instr->intrinsic == nir_intrinsic_deref_atomic_comp_swap) { - LLVMValueRef src1 = get_src(ctx, instr->src[src_idx + 1]); - result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, src, src1, sync_scope); - result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); - } else { - LLVMAtomicRMWBinOp op; - switch (instr->intrinsic) { - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_deref_atomic_add: - op = LLVMAtomicRMWBinOpAdd; - break; - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_deref_atomic_umin: - op = LLVMAtomicRMWBinOpUMin; - break; - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_deref_atomic_umax: - op = LLVMAtomicRMWBinOpUMax; - break; - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_deref_atomic_imin: - op = LLVMAtomicRMWBinOpMin; - break; - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_deref_atomic_imax: - op = LLVMAtomicRMWBinOpMax; - break; - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_deref_atomic_and: - op = LLVMAtomicRMWBinOpAnd; - break; - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_deref_atomic_or: - op = LLVMAtomicRMWBinOpOr; - break; - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_deref_atomic_xor: - op = LLVMAtomicRMWBinOpXor; - break; - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_deref_atomic_exchange: - op = LLVMAtomicRMWBinOpXchg; - break; + if (ctx->ac.postponed_kill) { + LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, ""); + ac_build_ifcc(&ctx->ac, cond, 7005); + } + + LLVMValueRef result; + LLVMValueRef src = get_src(ctx, instr->src[src_idx]); + + const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup"; + + if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref) { + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + if (deref->mode == nir_var_mem_global) { + /* use "singlethread" sync scope to implement relaxed ordering */ + sync_scope = LLVM_VERSION_MAJOR >= 9 ? 
"singlethread-one-as" : "singlethread"; + + LLVMTypeRef ptr_type = + LLVMPointerType(LLVMTypeOf(src), LLVMGetPointerAddressSpace(LLVMTypeOf(ptr))); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ptr_type, ""); + } + } + + if (instr->intrinsic == nir_intrinsic_shared_atomic_comp_swap || + instr->intrinsic == nir_intrinsic_deref_atomic_comp_swap) { + LLVMValueRef src1 = get_src(ctx, instr->src[src_idx + 1]); + result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, src, src1, sync_scope); + result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); + } else { + LLVMAtomicRMWBinOp op; + switch (instr->intrinsic) { + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_deref_atomic_add: + op = LLVMAtomicRMWBinOpAdd; + break; + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_deref_atomic_umin: + op = LLVMAtomicRMWBinOpUMin; + break; + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_deref_atomic_umax: + op = LLVMAtomicRMWBinOpUMax; + break; + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_deref_atomic_imin: + op = LLVMAtomicRMWBinOpMin; + break; + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_deref_atomic_imax: + op = LLVMAtomicRMWBinOpMax; + break; + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_deref_atomic_and: + op = LLVMAtomicRMWBinOpAnd; + break; + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_deref_atomic_or: + op = LLVMAtomicRMWBinOpOr; + break; + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_deref_atomic_xor: + op = LLVMAtomicRMWBinOpXor; + break; + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_deref_atomic_exchange: + op = LLVMAtomicRMWBinOpXchg; + break; #if LLVM_VERSION_MAJOR >= 10 - case nir_intrinsic_shared_atomic_fadd: - case nir_intrinsic_deref_atomic_fadd: - op = LLVMAtomicRMWBinOpFAdd; - break; + case nir_intrinsic_shared_atomic_fadd: + case nir_intrinsic_deref_atomic_fadd: + op = LLVMAtomicRMWBinOpFAdd; + break; #endif - default: - return NULL; - } + default: + return NULL; + } - LLVMValueRef val; + LLVMValueRef val; - if (instr->intrinsic == nir_intrinsic_shared_atomic_fadd || - instr->intrinsic == nir_intrinsic_deref_atomic_fadd) { - val = ac_to_float(&ctx->ac, src); - } else { - val = ac_to_integer(&ctx->ac, src); - } + if (instr->intrinsic == nir_intrinsic_shared_atomic_fadd || + instr->intrinsic == nir_intrinsic_deref_atomic_fadd) { + val = ac_to_float(&ctx->ac, src); + } else { + val = ac_to_integer(&ctx->ac, src); + } - result = ac_build_atomic_rmw(&ctx->ac, op, ptr, val, sync_scope); - } + result = ac_build_atomic_rmw(&ctx->ac, op, ptr, val, sync_scope); + } - if (ctx->ac.postponed_kill) - ac_build_endif(&ctx->ac, 7005); - return result; + if (ctx->ac.postponed_kill) + ac_build_endif(&ctx->ac, 7005); + return result; } static LLVMValueRef load_sample_pos(struct ac_nir_context *ctx) { - LLVMValueRef values[2]; - LLVMValueRef pos[2]; + LLVMValueRef values[2]; + LLVMValueRef pos[2]; - pos[0] = ac_to_float(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->args->frag_pos[0])); - pos[1] = ac_to_float(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->args->frag_pos[1])); + pos[0] = ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->frag_pos[0])); + pos[1] = ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->frag_pos[1])); - values[0] = ac_build_fract(&ctx->ac, pos[0], 32); - values[1] = ac_build_fract(&ctx->ac, pos[1], 32); - return ac_build_gather_values(&ctx->ac, values, 2); + values[0] = ac_build_fract(&ctx->ac, pos[0], 32); + values[1] = ac_build_fract(&ctx->ac, pos[1], 32); + 
return ac_build_gather_values(&ctx->ac, values, 2); } -static LLVMValueRef lookup_interp_param(struct ac_nir_context *ctx, - enum glsl_interp_mode interp, unsigned location) +static LLVMValueRef lookup_interp_param(struct ac_nir_context *ctx, enum glsl_interp_mode interp, + unsigned location) { - switch (interp) { - case INTERP_MODE_FLAT: - default: - return NULL; - case INTERP_MODE_SMOOTH: - case INTERP_MODE_NONE: - if (location == INTERP_CENTER) - return ac_get_arg(&ctx->ac, ctx->args->persp_center); - else if (location == INTERP_CENTROID) - return ctx->abi->persp_centroid; - else if (location == INTERP_SAMPLE) - return ac_get_arg(&ctx->ac, ctx->args->persp_sample); - break; - case INTERP_MODE_NOPERSPECTIVE: - if (location == INTERP_CENTER) - return ac_get_arg(&ctx->ac, ctx->args->linear_center); - else if (location == INTERP_CENTROID) - return ctx->abi->linear_centroid; - else if (location == INTERP_SAMPLE) - return ac_get_arg(&ctx->ac, ctx->args->linear_sample); - break; - } - return NULL; + switch (interp) { + case INTERP_MODE_FLAT: + default: + return NULL; + case INTERP_MODE_SMOOTH: + case INTERP_MODE_NONE: + if (location == INTERP_CENTER) + return ac_get_arg(&ctx->ac, ctx->args->persp_center); + else if (location == INTERP_CENTROID) + return ctx->abi->persp_centroid; + else if (location == INTERP_SAMPLE) + return ac_get_arg(&ctx->ac, ctx->args->persp_sample); + break; + case INTERP_MODE_NOPERSPECTIVE: + if (location == INTERP_CENTER) + return ac_get_arg(&ctx->ac, ctx->args->linear_center); + else if (location == INTERP_CENTROID) + return ctx->abi->linear_centroid; + else if (location == INTERP_SAMPLE) + return ac_get_arg(&ctx->ac, ctx->args->linear_sample); + break; + } + return NULL; } -static LLVMValueRef barycentric_center(struct ac_nir_context *ctx, - unsigned mode) +static LLVMValueRef barycentric_center(struct ac_nir_context *ctx, unsigned mode) { - LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER); - return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); } -static LLVMValueRef barycentric_offset(struct ac_nir_context *ctx, - unsigned mode, - LLVMValueRef offset) +static LLVMValueRef barycentric_offset(struct ac_nir_context *ctx, unsigned mode, + LLVMValueRef offset) { - LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER); - LLVMValueRef src_c0 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_0, "")); - LLVMValueRef src_c1 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_1, "")); - - LLVMValueRef ij_out[2]; - LLVMValueRef ddxy_out = ac_build_ddxy_interp(&ctx->ac, interp_param); - - /* - * take the I then J parameters, and the DDX/Y for it, and - * calculate the IJ inputs for the interpolator. 
- * temp1 = ddx * offset/sample.x + I; - * interp_param.I = ddy * offset/sample.y + temp1; - * temp1 = ddx * offset/sample.x + J; - * interp_param.J = ddy * offset/sample.y + temp1; - */ - for (unsigned i = 0; i < 2; i++) { - LLVMValueRef ix_ll = LLVMConstInt(ctx->ac.i32, i, false); - LLVMValueRef iy_ll = LLVMConstInt(ctx->ac.i32, i + 2, false); - LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, - ddxy_out, ix_ll, ""); - LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, - ddxy_out, iy_ll, ""); - LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, - interp_param, ix_ll, ""); - LLVMValueRef temp1, temp2; - - interp_el = LLVMBuildBitCast(ctx->ac.builder, interp_el, - ctx->ac.f32, ""); - - temp1 = ac_build_fmad(&ctx->ac, ddx_el, src_c0, interp_el); - temp2 = ac_build_fmad(&ctx->ac, ddy_el, src_c1, temp1); - - ij_out[i] = LLVMBuildBitCast(ctx->ac.builder, - temp2, ctx->ac.i32, ""); - } - interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2); - return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER); + LLVMValueRef src_c0 = + ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_0, "")); + LLVMValueRef src_c1 = + ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_1, "")); + + LLVMValueRef ij_out[2]; + LLVMValueRef ddxy_out = ac_build_ddxy_interp(&ctx->ac, interp_param); + + /* + * take the I then J parameters, and the DDX/Y for it, and + * calculate the IJ inputs for the interpolator. + * temp1 = ddx * offset/sample.x + I; + * interp_param.I = ddy * offset/sample.y + temp1; + * temp1 = ddx * offset/sample.x + J; + * interp_param.J = ddy * offset/sample.y + temp1; + */ + for (unsigned i = 0; i < 2; i++) { + LLVMValueRef ix_ll = LLVMConstInt(ctx->ac.i32, i, false); + LLVMValueRef iy_ll = LLVMConstInt(ctx->ac.i32, i + 2, false); + LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, ddxy_out, ix_ll, ""); + LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, ddxy_out, iy_ll, ""); + LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ix_ll, ""); + LLVMValueRef temp1, temp2; + + interp_el = LLVMBuildBitCast(ctx->ac.builder, interp_el, ctx->ac.f32, ""); + + temp1 = ac_build_fmad(&ctx->ac, ddx_el, src_c0, interp_el); + temp2 = ac_build_fmad(&ctx->ac, ddy_el, src_c1, temp1); + + ij_out[i] = LLVMBuildBitCast(ctx->ac.builder, temp2, ctx->ac.i32, ""); + } + interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); } -static LLVMValueRef barycentric_centroid(struct ac_nir_context *ctx, - unsigned mode) +static LLVMValueRef barycentric_centroid(struct ac_nir_context *ctx, unsigned mode) { - LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTROID); - return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTROID); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); } -static LLVMValueRef barycentric_at_sample(struct ac_nir_context *ctx, - unsigned mode, - LLVMValueRef sample_id) +static LLVMValueRef barycentric_at_sample(struct ac_nir_context *ctx, unsigned mode, + LLVMValueRef sample_id) { - if (ctx->abi->interp_at_sample_force_center) - return barycentric_center(ctx, mode); + if (ctx->abi->interp_at_sample_force_center) + return barycentric_center(ctx, 
mode); - LLVMValueRef halfval = LLVMConstReal(ctx->ac.f32, 0.5f); + LLVMValueRef halfval = LLVMConstReal(ctx->ac.f32, 0.5f); - /* fetch sample ID */ - LLVMValueRef sample_pos = ctx->abi->load_sample_position(ctx->abi, sample_id); + /* fetch sample ID */ + LLVMValueRef sample_pos = ctx->abi->load_sample_position(ctx->abi, sample_id); - LLVMValueRef src_c0 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_0, ""); - src_c0 = LLVMBuildFSub(ctx->ac.builder, src_c0, halfval, ""); - LLVMValueRef src_c1 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_1, ""); - src_c1 = LLVMBuildFSub(ctx->ac.builder, src_c1, halfval, ""); - LLVMValueRef coords[] = { src_c0, src_c1 }; - LLVMValueRef offset = ac_build_gather_values(&ctx->ac, coords, 2); + LLVMValueRef src_c0 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_0, ""); + src_c0 = LLVMBuildFSub(ctx->ac.builder, src_c0, halfval, ""); + LLVMValueRef src_c1 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_1, ""); + src_c1 = LLVMBuildFSub(ctx->ac.builder, src_c1, halfval, ""); + LLVMValueRef coords[] = {src_c0, src_c1}; + LLVMValueRef offset = ac_build_gather_values(&ctx->ac, coords, 2); - return barycentric_offset(ctx, mode, offset); + return barycentric_offset(ctx, mode, offset); } - -static LLVMValueRef barycentric_sample(struct ac_nir_context *ctx, - unsigned mode) +static LLVMValueRef barycentric_sample(struct ac_nir_context *ctx, unsigned mode) { - LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_SAMPLE); - return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); + LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_SAMPLE); + return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, ""); } static LLVMValueRef barycentric_model(struct ac_nir_context *ctx) { - return LLVMBuildBitCast(ctx->ac.builder, - ac_get_arg(&ctx->ac, ctx->args->pull_model), - ctx->ac.v3i32, ""); + return LLVMBuildBitCast(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->pull_model), + ctx->ac.v3i32, ""); } -static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx, - LLVMValueRef interp_param, - unsigned index, unsigned comp_start, - unsigned num_components, - unsigned bitsize) +static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx, LLVMValueRef interp_param, + unsigned index, unsigned comp_start, + unsigned num_components, unsigned bitsize) { - LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false); - LLVMValueRef interp_param_f; - - interp_param_f = LLVMBuildBitCast(ctx->ac.builder, - interp_param, ctx->ac.v2f32, ""); - LLVMValueRef i = LLVMBuildExtractElement( - ctx->ac.builder, interp_param_f, ctx->ac.i32_0, ""); - LLVMValueRef j = LLVMBuildExtractElement( - ctx->ac.builder, interp_param_f, ctx->ac.i32_1, ""); - - /* Workaround for issue 2647: kill threads with infinite interpolation coeffs */ - if (ctx->verified_interp && - !_mesa_hash_table_search(ctx->verified_interp, interp_param)) { - LLVMValueRef args[2]; - args[0] = i; - args[1] = LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, false); - LLVMValueRef cond = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1, - args, 2, AC_FUNC_ATTR_READNONE); - ac_build_kill_if_false(&ctx->ac, LLVMBuildNot(ctx->ac.builder, cond, "")); - _mesa_hash_table_insert(ctx->verified_interp, interp_param, interp_param); - } - - LLVMValueRef values[4]; - assert(bitsize == 16 || bitsize == 32); - for (unsigned comp = 0; comp < num_components; 
comp++) { - LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, comp_start + comp, false); - if (bitsize == 16) { - values[comp] = ac_build_fs_interp_f16(&ctx->ac, llvm_chan, attr_number, - ac_get_arg(&ctx->ac, ctx->args->prim_mask), i, j); - } else { - values[comp] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, - ac_get_arg(&ctx->ac, ctx->args->prim_mask), i, j); - } - } - - return ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, num_components)); + LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false); + LLVMValueRef interp_param_f; + + interp_param_f = LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2f32, ""); + LLVMValueRef i = LLVMBuildExtractElement(ctx->ac.builder, interp_param_f, ctx->ac.i32_0, ""); + LLVMValueRef j = LLVMBuildExtractElement(ctx->ac.builder, interp_param_f, ctx->ac.i32_1, ""); + + /* Workaround for issue 2647: kill threads with infinite interpolation coeffs */ + if (ctx->verified_interp && !_mesa_hash_table_search(ctx->verified_interp, interp_param)) { + LLVMValueRef args[2]; + args[0] = i; + args[1] = LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, false); + LLVMValueRef cond = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1, args, 2, + AC_FUNC_ATTR_READNONE); + ac_build_kill_if_false(&ctx->ac, LLVMBuildNot(ctx->ac.builder, cond, "")); + _mesa_hash_table_insert(ctx->verified_interp, interp_param, interp_param); + } + + LLVMValueRef values[4]; + assert(bitsize == 16 || bitsize == 32); + for (unsigned comp = 0; comp < num_components; comp++) { + LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, comp_start + comp, false); + if (bitsize == 16) { + values[comp] = ac_build_fs_interp_f16(&ctx->ac, llvm_chan, attr_number, + ac_get_arg(&ctx->ac, ctx->args->prim_mask), i, j); + } else { + values[comp] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, + ac_get_arg(&ctx->ac, ctx->args->prim_mask), i, j); + } + } + + return ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, num_components)); } -static LLVMValueRef visit_load(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr, bool is_output) +static LLVMValueRef visit_load(struct ac_nir_context *ctx, nir_intrinsic_instr *instr, + bool is_output) { - LLVMValueRef values[8]; - LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa); - LLVMTypeRef component_type; - unsigned base = nir_intrinsic_base(instr); - unsigned component = nir_intrinsic_component(instr); - unsigned count = instr->dest.ssa.num_components * - (instr->dest.ssa.bit_size == 64 ? 2 : 1); - nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); - LLVMValueRef vertex_index = - vertex_index_src ? 
get_src(ctx, *vertex_index_src) : NULL; - nir_src offset = *nir_get_io_offset_src(instr); - LLVMValueRef indir_index = NULL; - - if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind) - component_type = LLVMGetElementType(dest_type); - else - component_type = dest_type; - - if (nir_src_is_const(offset)) - assert(nir_src_as_uint(offset) == 0); - else - indir_index = get_src(ctx, offset); - - if (ctx->stage == MESA_SHADER_TESS_CTRL || - (ctx->stage == MESA_SHADER_TESS_EVAL && !is_output)) { - LLVMValueRef result = - ctx->abi->load_tess_varyings(ctx->abi, component_type, - vertex_index, indir_index, - 0, 0, base * 4, - component, - instr->num_components, - false, false, !is_output); - if (instr->dest.ssa.bit_size == 16) { - result = ac_to_integer(&ctx->ac, result); - result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, ""); - } - return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); - } - - /* No indirect indexing is allowed after this point. */ - assert(!indir_index); - - if (ctx->stage == MESA_SHADER_GEOMETRY) { - LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); - assert(nir_src_is_const(*vertex_index_src)); - - return ctx->abi->load_inputs(ctx->abi, 0, base * 4, component, - instr->num_components, - nir_src_as_uint(*vertex_index_src), - 0, type); - } - - if (ctx->stage == MESA_SHADER_FRAGMENT && is_output && - nir_intrinsic_io_semantics(instr).fb_fetch_output) - return ctx->abi->emit_fbfetch(ctx->abi); - - /* Other non-fragment cases have inputs and outputs in temporaries. */ - if (ctx->stage != MESA_SHADER_FRAGMENT) { - for (unsigned chan = component; chan < count + component; chan++) { - if (is_output) { - values[chan] = LLVMBuildLoad(ctx->ac.builder, - ctx->abi->outputs[base * 4 + chan], ""); - } else { - values[chan] = ctx->abi->inputs[base * 4 + chan]; - if (!values[chan]) - values[chan] = LLVMGetUndef(ctx->ac.i32); - } - } - LLVMValueRef result = ac_build_varying_gather_values(&ctx->ac, values, count, component); - return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); - } - - /* Fragment shader inputs. */ - unsigned vertex_id = 2; /* P0 */ - - if (instr->intrinsic == nir_intrinsic_load_input_vertex) { - nir_const_value *src0 = nir_src_as_const_value(instr->src[0]); - - switch (src0[0].i32) { - case 0: - vertex_id = 2; - break; - case 1: - vertex_id = 0; - break; - case 2: - vertex_id = 1; - break; - default: - unreachable("Invalid vertex index"); - } - } - - LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, base, false); - - for (unsigned chan = 0; chan < count; chan++) { - if (component + chan > 4) - attr_number = LLVMConstInt(ctx->ac.i32, base + 1, false); - LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (component + chan) % 4, false); - values[chan] = ac_build_fs_interp_mov(&ctx->ac, - LLVMConstInt(ctx->ac.i32, vertex_id, false), - llvm_chan, - attr_number, - ac_get_arg(&ctx->ac, ctx->args->prim_mask)); - values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i32, ""); - values[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, values[chan], - instr->dest.ssa.bit_size == 16 ? 
ctx->ac.i16 - : ctx->ac.i32, ""); - } - - LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, count); - return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); + LLVMValueRef values[8]; + LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa); + LLVMTypeRef component_type; + unsigned base = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + unsigned count = instr->dest.ssa.num_components * (instr->dest.ssa.bit_size == 64 ? 2 : 1); + nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); + LLVMValueRef vertex_index = vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL; + nir_src offset = *nir_get_io_offset_src(instr); + LLVMValueRef indir_index = NULL; + + if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind) + component_type = LLVMGetElementType(dest_type); + else + component_type = dest_type; + + if (nir_src_is_const(offset)) + assert(nir_src_as_uint(offset) == 0); + else + indir_index = get_src(ctx, offset); + + if (ctx->stage == MESA_SHADER_TESS_CTRL || (ctx->stage == MESA_SHADER_TESS_EVAL && !is_output)) { + LLVMValueRef result = ctx->abi->load_tess_varyings( + ctx->abi, component_type, vertex_index, indir_index, 0, 0, base * 4, component, + instr->num_components, false, false, !is_output); + if (instr->dest.ssa.bit_size == 16) { + result = ac_to_integer(&ctx->ac, result); + result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, ""); + } + return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); + } + + /* No indirect indexing is allowed after this point. */ + assert(!indir_index); + + if (ctx->stage == MESA_SHADER_GEOMETRY) { + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + assert(nir_src_is_const(*vertex_index_src)); + + return ctx->abi->load_inputs(ctx->abi, 0, base * 4, component, instr->num_components, + nir_src_as_uint(*vertex_index_src), 0, type); + } + + if (ctx->stage == MESA_SHADER_FRAGMENT && is_output && + nir_intrinsic_io_semantics(instr).fb_fetch_output) + return ctx->abi->emit_fbfetch(ctx->abi); + + /* Other non-fragment cases have inputs and outputs in temporaries. */ + if (ctx->stage != MESA_SHADER_FRAGMENT) { + for (unsigned chan = component; chan < count + component; chan++) { + if (is_output) { + values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->abi->outputs[base * 4 + chan], ""); + } else { + values[chan] = ctx->abi->inputs[base * 4 + chan]; + if (!values[chan]) + values[chan] = LLVMGetUndef(ctx->ac.i32); + } + } + LLVMValueRef result = ac_build_varying_gather_values(&ctx->ac, values, count, component); + return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); + } + + /* Fragment shader inputs. 
*/ + unsigned vertex_id = 2; /* P0 */ + + if (instr->intrinsic == nir_intrinsic_load_input_vertex) { + nir_const_value *src0 = nir_src_as_const_value(instr->src[0]); + + switch (src0[0].i32) { + case 0: + vertex_id = 2; + break; + case 1: + vertex_id = 0; + break; + case 2: + vertex_id = 1; + break; + default: + unreachable("Invalid vertex index"); + } + } + + LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, base, false); + + for (unsigned chan = 0; chan < count; chan++) { + if (component + chan > 4) + attr_number = LLVMConstInt(ctx->ac.i32, base + 1, false); + LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (component + chan) % 4, false); + values[chan] = + ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, vertex_id, false), llvm_chan, + attr_number, ac_get_arg(&ctx->ac, ctx->args->prim_mask)); + values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i32, ""); + values[chan] = + LLVMBuildTruncOrBitCast(ctx->ac.builder, values[chan], + instr->dest.ssa.bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32, ""); + } + + LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, count); + return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); } -static void visit_intrinsic(struct ac_nir_context *ctx, - nir_intrinsic_instr *instr) +static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - LLVMValueRef result = NULL; - - switch (instr->intrinsic) { - case nir_intrinsic_ballot: - result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0])); - if (ctx->ac.ballot_mask_bits > ctx->ac.wave_size) - result = LLVMBuildZExt(ctx->ac.builder, result, ctx->ac.iN_ballotmask, ""); - break; - case nir_intrinsic_read_invocation: - result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), - get_src(ctx, instr->src[1])); - break; - case nir_intrinsic_read_first_invocation: - result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), NULL); - break; - case nir_intrinsic_load_subgroup_invocation: - result = ac_get_thread_id(&ctx->ac); - break; - case nir_intrinsic_load_work_group_id: { - LLVMValueRef values[3]; - - for (int i = 0; i < 3; i++) { - values[i] = ctx->args->workgroup_ids[i].used ? 
- ac_get_arg(&ctx->ac, ctx->args->workgroup_ids[i]) : ctx->ac.i32_0; - } - - result = ac_build_gather_values(&ctx->ac, values, 3); - break; - } - case nir_intrinsic_load_base_vertex: - case nir_intrinsic_load_first_vertex: - result = ctx->abi->load_base_vertex(ctx->abi); - break; - case nir_intrinsic_load_local_group_size: - result = ctx->abi->load_local_group_size(ctx->abi); - break; - case nir_intrinsic_load_vertex_id: - result = LLVMBuildAdd(ctx->ac.builder, - ac_get_arg(&ctx->ac, ctx->args->vertex_id), - ac_get_arg(&ctx->ac, ctx->args->base_vertex), ""); - break; - case nir_intrinsic_load_vertex_id_zero_base: { - result = ctx->abi->vertex_id; - break; - } - case nir_intrinsic_load_local_invocation_id: { - result = ac_get_arg(&ctx->ac, ctx->args->local_invocation_ids); - break; - } - case nir_intrinsic_load_base_instance: - result = ac_get_arg(&ctx->ac, ctx->args->start_instance); - break; - case nir_intrinsic_load_draw_id: - result = ac_get_arg(&ctx->ac, ctx->args->draw_id); - break; - case nir_intrinsic_load_view_index: - result = ac_get_arg(&ctx->ac, ctx->args->view_index); - break; - case nir_intrinsic_load_invocation_id: - if (ctx->stage == MESA_SHADER_TESS_CTRL) { - result = ac_unpack_param(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->args->tcs_rel_ids), - 8, 5); - } else { - if (ctx->ac.chip_class >= GFX10) { - result = LLVMBuildAnd(ctx->ac.builder, - ac_get_arg(&ctx->ac, ctx->args->gs_invocation_id), - LLVMConstInt(ctx->ac.i32, 127, 0), ""); - } else { - result = ac_get_arg(&ctx->ac, ctx->args->gs_invocation_id); - } - } - break; - case nir_intrinsic_load_primitive_id: - if (ctx->stage == MESA_SHADER_GEOMETRY) { - result = ac_get_arg(&ctx->ac, ctx->args->gs_prim_id); - } else if (ctx->stage == MESA_SHADER_TESS_CTRL) { - result = ac_get_arg(&ctx->ac, ctx->args->tcs_patch_id); - } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - result = ac_get_arg(&ctx->ac, ctx->args->tes_patch_id); - } else - fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage); - break; - case nir_intrinsic_load_sample_id: - result = ac_unpack_param(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->args->ancillary), - 8, 4); - break; - case nir_intrinsic_load_sample_pos: - result = load_sample_pos(ctx); - break; - case nir_intrinsic_load_sample_mask_in: - result = ctx->abi->load_sample_mask_in(ctx->abi); - break; - case nir_intrinsic_load_frag_coord: { - LLVMValueRef values[4] = { - ac_get_arg(&ctx->ac, ctx->args->frag_pos[0]), - ac_get_arg(&ctx->ac, ctx->args->frag_pos[1]), - ac_get_arg(&ctx->ac, ctx->args->frag_pos[2]), - ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, - ac_get_arg(&ctx->ac, ctx->args->frag_pos[3])) - }; - result = ac_to_integer(&ctx->ac, - ac_build_gather_values(&ctx->ac, values, 4)); - break; - } - case nir_intrinsic_load_layer_id: - result = ctx->abi->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; - break; - case nir_intrinsic_load_front_face: - result = ac_get_arg(&ctx->ac, ctx->args->front_face); - break; - case nir_intrinsic_load_helper_invocation: - result = ac_build_load_helper_invocation(&ctx->ac); - break; - case nir_intrinsic_is_helper_invocation: - result = ac_build_is_helper_invocation(&ctx->ac); - break; - case nir_intrinsic_load_color0: - result = ctx->abi->color0; - break; - case nir_intrinsic_load_color1: - result = ctx->abi->color1; - break; - case nir_intrinsic_load_user_data_amd: - assert(LLVMTypeOf(ctx->abi->user_data) == ctx->ac.v4i32); - result = ctx->abi->user_data; - break; - case nir_intrinsic_load_instance_id: - result = ctx->abi->instance_id; - break; - case 
nir_intrinsic_load_num_work_groups: - result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups); - break; - case nir_intrinsic_load_local_invocation_index: - result = visit_load_local_invocation_index(ctx); - break; - case nir_intrinsic_load_subgroup_id: - result = visit_load_subgroup_id(ctx); - break; - case nir_intrinsic_load_num_subgroups: - result = visit_load_num_subgroups(ctx); - break; - case nir_intrinsic_first_invocation: - result = visit_first_invocation(ctx); - break; - case nir_intrinsic_load_push_constant: - result = visit_load_push_constant(ctx, instr); - break; - case nir_intrinsic_vulkan_resource_index: { - LLVMValueRef index = get_src(ctx, instr->src[0]); - unsigned desc_set = nir_intrinsic_desc_set(instr); - unsigned binding = nir_intrinsic_binding(instr); - - result = ctx->abi->load_resource(ctx->abi, index, desc_set, - binding); - break; - } - case nir_intrinsic_vulkan_resource_reindex: - result = visit_vulkan_resource_reindex(ctx, instr); - break; - case nir_intrinsic_store_ssbo: - visit_store_ssbo(ctx, instr); - break; - case nir_intrinsic_load_ssbo: - result = visit_load_buffer(ctx, instr); - break; - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - result = visit_atomic_ssbo(ctx, instr); - break; - case nir_intrinsic_load_ubo: - result = visit_load_ubo_buffer(ctx, instr); - break; - case nir_intrinsic_get_buffer_size: - result = visit_get_buffer_size(ctx, instr); - break; - case nir_intrinsic_load_deref: - result = visit_load_var(ctx, instr); - break; - case nir_intrinsic_store_deref: - visit_store_var(ctx, instr); - break; - case nir_intrinsic_load_input: - case nir_intrinsic_load_input_vertex: - case nir_intrinsic_load_per_vertex_input: - result = visit_load(ctx, instr, false); - break; - case nir_intrinsic_load_output: - case nir_intrinsic_load_per_vertex_output: - result = visit_load(ctx, instr, true); - break; - case nir_intrinsic_store_output: - case nir_intrinsic_store_per_vertex_output: - visit_store_output(ctx, instr); - break; - case nir_intrinsic_load_shared: - result = visit_load_shared(ctx, instr); - break; - case nir_intrinsic_store_shared: - visit_store_shared(ctx, instr); - break; - case nir_intrinsic_bindless_image_samples: - case nir_intrinsic_image_deref_samples: - result = visit_image_samples(ctx, instr); - break; - case nir_intrinsic_bindless_image_load: - result = visit_image_load(ctx, instr, true); - break; - case nir_intrinsic_image_deref_load: - result = visit_image_load(ctx, instr, false); - break; - case nir_intrinsic_bindless_image_store: - visit_image_store(ctx, instr, true); - break; - case nir_intrinsic_image_deref_store: - visit_image_store(ctx, instr, false); - break; - case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_imin: - case nir_intrinsic_bindless_image_atomic_umin: - case nir_intrinsic_bindless_image_atomic_imax: - case nir_intrinsic_bindless_image_atomic_umax: - case nir_intrinsic_bindless_image_atomic_and: - case nir_intrinsic_bindless_image_atomic_or: - case nir_intrinsic_bindless_image_atomic_xor: - case nir_intrinsic_bindless_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_comp_swap: - case 
nir_intrinsic_bindless_image_atomic_inc_wrap: - case nir_intrinsic_bindless_image_atomic_dec_wrap: - result = visit_image_atomic(ctx, instr, true); - break; - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_imin: - case nir_intrinsic_image_deref_atomic_umin: - case nir_intrinsic_image_deref_atomic_imax: - case nir_intrinsic_image_deref_atomic_umax: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - case nir_intrinsic_image_deref_atomic_inc_wrap: - case nir_intrinsic_image_deref_atomic_dec_wrap: - result = visit_image_atomic(ctx, instr, false); - break; - case nir_intrinsic_bindless_image_size: - result = visit_image_size(ctx, instr, true); - break; - case nir_intrinsic_image_deref_size: - result = visit_image_size(ctx, instr, false); - break; - case nir_intrinsic_shader_clock: - result = ac_build_shader_clock(&ctx->ac, - nir_intrinsic_memory_scope(instr)); - break; - case nir_intrinsic_discard: - case nir_intrinsic_discard_if: - emit_discard(ctx, instr); - break; - case nir_intrinsic_demote: - case nir_intrinsic_demote_if: - emit_demote(ctx, instr); - break; - case nir_intrinsic_memory_barrier: - case nir_intrinsic_group_memory_barrier: - case nir_intrinsic_memory_barrier_buffer: - case nir_intrinsic_memory_barrier_image: - case nir_intrinsic_memory_barrier_shared: - emit_membar(&ctx->ac, instr); - break; - case nir_intrinsic_scoped_barrier: { - assert(!(nir_intrinsic_memory_semantics(instr) & - (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE))); - - nir_variable_mode modes = nir_intrinsic_memory_modes(instr); - - unsigned wait_flags = 0; - if (modes & (nir_var_mem_global | nir_var_mem_ssbo)) - wait_flags |= AC_WAIT_VLOAD | AC_WAIT_VSTORE; - if (modes & nir_var_mem_shared) - wait_flags |= AC_WAIT_LGKM; - - if (wait_flags) - ac_build_waitcnt(&ctx->ac, wait_flags); - - if (nir_intrinsic_execution_scope(instr) == NIR_SCOPE_WORKGROUP) - ac_emit_barrier(&ctx->ac, ctx->stage); - break; - } - case nir_intrinsic_memory_barrier_tcs_patch: - break; - case nir_intrinsic_control_barrier: - ac_emit_barrier(&ctx->ac, ctx->stage); - break; - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - case nir_intrinsic_shared_atomic_fadd: { - LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], - instr->src[1].ssa->bit_size); - result = visit_var_atomic(ctx, instr, ptr, 1); - break; - } - case nir_intrinsic_deref_atomic_add: - case nir_intrinsic_deref_atomic_imin: - case nir_intrinsic_deref_atomic_umin: - case nir_intrinsic_deref_atomic_imax: - case nir_intrinsic_deref_atomic_umax: - case nir_intrinsic_deref_atomic_and: - case nir_intrinsic_deref_atomic_or: - case nir_intrinsic_deref_atomic_xor: - case nir_intrinsic_deref_atomic_exchange: - case nir_intrinsic_deref_atomic_comp_swap: - case nir_intrinsic_deref_atomic_fadd: { - LLVMValueRef ptr = get_src(ctx, instr->src[0]); - result = visit_var_atomic(ctx, instr, ptr, 1); - break; - } - case nir_intrinsic_load_barycentric_pixel: - result = barycentric_center(ctx, 
nir_intrinsic_interp_mode(instr)); - break; - case nir_intrinsic_load_barycentric_centroid: - result = barycentric_centroid(ctx, nir_intrinsic_interp_mode(instr)); - break; - case nir_intrinsic_load_barycentric_sample: - result = barycentric_sample(ctx, nir_intrinsic_interp_mode(instr)); - break; - case nir_intrinsic_load_barycentric_model: - result = barycentric_model(ctx); - break; - case nir_intrinsic_load_barycentric_at_offset: { - LLVMValueRef offset = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0])); - result = barycentric_offset(ctx, nir_intrinsic_interp_mode(instr), offset); - break; - } - case nir_intrinsic_load_barycentric_at_sample: { - LLVMValueRef sample_id = get_src(ctx, instr->src[0]); - result = barycentric_at_sample(ctx, nir_intrinsic_interp_mode(instr), sample_id); - break; - } - case nir_intrinsic_load_interpolated_input: { - /* We assume any indirect loads have been lowered away */ - ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[1]); - assert(offset); - assert(offset[0].i32 == 0); - - LLVMValueRef interp_param = get_src(ctx, instr->src[0]); - unsigned index = nir_intrinsic_base(instr); - unsigned component = nir_intrinsic_component(instr); - result = load_interpolated_input(ctx, interp_param, index, - component, - instr->dest.ssa.num_components, - instr->dest.ssa.bit_size); - break; - } - case nir_intrinsic_emit_vertex: - ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->abi->outputs); - break; - case nir_intrinsic_emit_vertex_with_counter: { - unsigned stream = nir_intrinsic_stream_id(instr); - LLVMValueRef next_vertex = get_src(ctx, instr->src[0]); - ctx->abi->emit_vertex_with_counter(ctx->abi, stream, - next_vertex, - ctx->abi->outputs); - break; - } - case nir_intrinsic_end_primitive: - case nir_intrinsic_end_primitive_with_counter: - ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr)); - break; - case nir_intrinsic_load_tess_coord: - result = ctx->abi->load_tess_coord(ctx->abi); - break; - case nir_intrinsic_load_tess_level_outer: - result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, false); - break; - case nir_intrinsic_load_tess_level_inner: - result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, false); - break; - case nir_intrinsic_load_tess_level_outer_default: - result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, true); - break; - case nir_intrinsic_load_tess_level_inner_default: - result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, true); - break; - case nir_intrinsic_load_patch_vertices_in: - result = ctx->abi->load_patch_vertices_in(ctx->abi); - break; - case nir_intrinsic_vote_all: { - LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, get_src(ctx, instr->src[0])); - result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); - break; - } - case nir_intrinsic_vote_any: { - LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, get_src(ctx, instr->src[0])); - result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); - break; - } - case nir_intrinsic_shuffle: - if (ctx->ac.chip_class == GFX8 || - ctx->ac.chip_class == GFX9 || - (ctx->ac.chip_class >= GFX10 && ctx->ac.wave_size == 32)) { - result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]), - get_src(ctx, instr->src[1])); - } else { - LLVMValueRef src = get_src(ctx, instr->src[0]); - LLVMValueRef index = get_src(ctx, instr->src[1]); - LLVMTypeRef type = LLVMTypeOf(src); - struct waterfall_context wctx; - LLVMValueRef index_val; - - index_val = 
enter_waterfall(ctx, &wctx, index, true); - - src = LLVMBuildZExt(ctx->ac.builder, src, - ctx->ac.i32, ""); - - result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.readlane", - ctx->ac.i32, - (LLVMValueRef []) { src, index_val }, 2, - AC_FUNC_ATTR_READNONE | - AC_FUNC_ATTR_CONVERGENT); - - result = LLVMBuildTrunc(ctx->ac.builder, result, type, ""); - - result = exit_waterfall(ctx, &wctx, result); - } - break; - case nir_intrinsic_reduce: - result = ac_build_reduce(&ctx->ac, - get_src(ctx, instr->src[0]), - instr->const_index[0], - instr->const_index[1]); - break; - case nir_intrinsic_inclusive_scan: - result = ac_build_inclusive_scan(&ctx->ac, - get_src(ctx, instr->src[0]), - instr->const_index[0]); - break; - case nir_intrinsic_exclusive_scan: - result = ac_build_exclusive_scan(&ctx->ac, - get_src(ctx, instr->src[0]), - instr->const_index[0]); - break; - case nir_intrinsic_quad_broadcast: { - unsigned lane = nir_src_as_uint(instr->src[1]); - result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), - lane, lane, lane, lane); - break; - } - case nir_intrinsic_quad_swap_horizontal: - result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 1, 0, 3 ,2); - break; - case nir_intrinsic_quad_swap_vertical: - result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 2, 3, 0 ,1); - break; - case nir_intrinsic_quad_swap_diagonal: - result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 3, 2, 1 ,0); - break; - case nir_intrinsic_quad_swizzle_amd: { - uint32_t mask = nir_intrinsic_swizzle_mask(instr); - result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), - mask & 0x3, (mask >> 2) & 0x3, - (mask >> 4) & 0x3, (mask >> 6) & 0x3); - break; - } - case nir_intrinsic_masked_swizzle_amd: { - uint32_t mask = nir_intrinsic_swizzle_mask(instr); - result = ac_build_ds_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), mask); - break; - } - case nir_intrinsic_write_invocation_amd: - result = ac_build_writelane(&ctx->ac, get_src(ctx, instr->src[0]), - get_src(ctx, instr->src[1]), - get_src(ctx, instr->src[2])); - break; - case nir_intrinsic_mbcnt_amd: - result = ac_build_mbcnt(&ctx->ac, get_src(ctx, instr->src[0])); - break; - case nir_intrinsic_load_scratch: { - LLVMValueRef offset = get_src(ctx, instr->src[0]); - LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch, - offset); - LLVMTypeRef comp_type = - LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); - LLVMTypeRef vec_type = - instr->dest.ssa.num_components == 1 ? 
comp_type : - LLVMVectorType(comp_type, instr->dest.ssa.num_components); - unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); - ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, - LLVMPointerType(vec_type, addr_space), ""); - result = LLVMBuildLoad(ctx->ac.builder, ptr, ""); - break; - } - case nir_intrinsic_store_scratch: { - LLVMValueRef offset = get_src(ctx, instr->src[1]); - LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch, - offset); - LLVMTypeRef comp_type = - LLVMIntTypeInContext(ctx->ac.context, instr->src[0].ssa->bit_size); - unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); - ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, - LLVMPointerType(comp_type, addr_space), ""); - LLVMValueRef src = get_src(ctx, instr->src[0]); - unsigned wrmask = nir_intrinsic_write_mask(instr); - while (wrmask) { - int start, count; - u_bit_scan_consecutive_range(&wrmask, &start, &count); - - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, start, false); - LLVMValueRef offset_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &offset, 1, ""); - LLVMTypeRef vec_type = - count == 1 ? comp_type : LLVMVectorType(comp_type, count); - offset_ptr = LLVMBuildBitCast(ctx->ac.builder, - offset_ptr, - LLVMPointerType(vec_type, addr_space), - ""); - LLVMValueRef offset_src = - ac_extract_components(&ctx->ac, src, start, count); - LLVMBuildStore(ctx->ac.builder, offset_src, offset_ptr); - } - break; - } - case nir_intrinsic_load_constant: { - unsigned base = nir_intrinsic_base(instr); - unsigned range = nir_intrinsic_range(instr); - - LLVMValueRef offset = get_src(ctx, instr->src[0]); - offset = LLVMBuildAdd(ctx->ac.builder, offset, - LLVMConstInt(ctx->ac.i32, base, false), ""); - - /* Clamp the offset to avoid out-of-bound access because global - * instructions can't handle them. - */ - LLVMValueRef size = LLVMConstInt(ctx->ac.i32, base + range, false); - LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, - offset, size, ""); - offset = LLVMBuildSelect(ctx->ac.builder, cond, offset, size, ""); - - LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->constant_data, - offset); - LLVMTypeRef comp_type = - LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); - LLVMTypeRef vec_type = - instr->dest.ssa.num_components == 1 ? 
comp_type : - LLVMVectorType(comp_type, instr->dest.ssa.num_components); - unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); - ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, - LLVMPointerType(vec_type, addr_space), ""); - result = LLVMBuildLoad(ctx->ac.builder, ptr, ""); - break; - } - default: - fprintf(stderr, "Unknown intrinsic: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - break; - } - if (result) { - ctx->ssa_defs[instr->dest.ssa.index] = result; - } + LLVMValueRef result = NULL; + + switch (instr->intrinsic) { + case nir_intrinsic_ballot: + result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0])); + if (ctx->ac.ballot_mask_bits > ctx->ac.wave_size) + result = LLVMBuildZExt(ctx->ac.builder, result, ctx->ac.iN_ballotmask, ""); + break; + case nir_intrinsic_read_invocation: + result = + ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), get_src(ctx, instr->src[1])); + break; + case nir_intrinsic_read_first_invocation: + result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), NULL); + break; + case nir_intrinsic_load_subgroup_invocation: + result = ac_get_thread_id(&ctx->ac); + break; + case nir_intrinsic_load_work_group_id: { + LLVMValueRef values[3]; + + for (int i = 0; i < 3; i++) { + values[i] = ctx->args->workgroup_ids[i].used + ? ac_get_arg(&ctx->ac, ctx->args->workgroup_ids[i]) + : ctx->ac.i32_0; + } + + result = ac_build_gather_values(&ctx->ac, values, 3); + break; + } + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_first_vertex: + result = ctx->abi->load_base_vertex(ctx->abi); + break; + case nir_intrinsic_load_local_group_size: + result = ctx->abi->load_local_group_size(ctx->abi); + break; + case nir_intrinsic_load_vertex_id: + result = LLVMBuildAdd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->vertex_id), + ac_get_arg(&ctx->ac, ctx->args->base_vertex), ""); + break; + case nir_intrinsic_load_vertex_id_zero_base: { + result = ctx->abi->vertex_id; + break; + } + case nir_intrinsic_load_local_invocation_id: { + result = ac_get_arg(&ctx->ac, ctx->args->local_invocation_ids); + break; + } + case nir_intrinsic_load_base_instance: + result = ac_get_arg(&ctx->ac, ctx->args->start_instance); + break; + case nir_intrinsic_load_draw_id: + result = ac_get_arg(&ctx->ac, ctx->args->draw_id); + break; + case nir_intrinsic_load_view_index: + result = ac_get_arg(&ctx->ac, ctx->args->view_index); + break; + case nir_intrinsic_load_invocation_id: + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->tcs_rel_ids), 8, 5); + } else { + if (ctx->ac.chip_class >= GFX10) { + result = + LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->gs_invocation_id), + LLVMConstInt(ctx->ac.i32, 127, 0), ""); + } else { + result = ac_get_arg(&ctx->ac, ctx->args->gs_invocation_id); + } + } + break; + case nir_intrinsic_load_primitive_id: + if (ctx->stage == MESA_SHADER_GEOMETRY) { + result = ac_get_arg(&ctx->ac, ctx->args->gs_prim_id); + } else if (ctx->stage == MESA_SHADER_TESS_CTRL) { + result = ac_get_arg(&ctx->ac, ctx->args->tcs_patch_id); + } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { + result = ac_get_arg(&ctx->ac, ctx->args->tes_patch_id); + } else + fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage); + break; + case nir_intrinsic_load_sample_id: + result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ancillary), 8, 4); + break; + case nir_intrinsic_load_sample_pos: + result = load_sample_pos(ctx); + break; + case 
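
Several of the cases above extract small bit-fields from packed hardware inputs: invocation_id takes bits [8,13) of tcs_rel_ids, sample_id takes bits [8,12) of the ancillary register, and the GFX10 path masks gs_invocation_id with 127. A minimal standalone sketch of that kind of extraction follows, assuming ac_unpack_param(value, offset, width) returns the width-bit field starting at bit `offset`; the register value in the demo is hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Model of the bit-field extraction; only the shift/mask logic is meant
 * to mirror the code above. */
static uint32_t unpack_param(uint32_t value, unsigned offset, unsigned width)
{
   return (value >> offset) & ((1u << width) - 1u);
}

int main(void)
{
   uint32_t ancillary = 0x00000a00; /* hypothetical VGPR contents */
   /* sample_id lives in bits [8,12) of the ancillary input */
   printf("sample_id = %u\n", (unsigned)unpack_param(ancillary, 8, 4)); /* 10 */
   return 0;
}
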
nir_intrinsic_load_sample_mask_in: + result = ctx->abi->load_sample_mask_in(ctx->abi); + break; + case nir_intrinsic_load_frag_coord: { + LLVMValueRef values[4] = { + ac_get_arg(&ctx->ac, ctx->args->frag_pos[0]), ac_get_arg(&ctx->ac, ctx->args->frag_pos[1]), + ac_get_arg(&ctx->ac, ctx->args->frag_pos[2]), + ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, ac_get_arg(&ctx->ac, ctx->args->frag_pos[3]))}; + result = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4)); + break; + } + case nir_intrinsic_load_layer_id: + result = ctx->abi->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; + break; + case nir_intrinsic_load_front_face: + result = ac_get_arg(&ctx->ac, ctx->args->front_face); + break; + case nir_intrinsic_load_helper_invocation: + result = ac_build_load_helper_invocation(&ctx->ac); + break; + case nir_intrinsic_is_helper_invocation: + result = ac_build_is_helper_invocation(&ctx->ac); + break; + case nir_intrinsic_load_color0: + result = ctx->abi->color0; + break; + case nir_intrinsic_load_color1: + result = ctx->abi->color1; + break; + case nir_intrinsic_load_user_data_amd: + assert(LLVMTypeOf(ctx->abi->user_data) == ctx->ac.v4i32); + result = ctx->abi->user_data; + break; + case nir_intrinsic_load_instance_id: + result = ctx->abi->instance_id; + break; + case nir_intrinsic_load_num_work_groups: + result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups); + break; + case nir_intrinsic_load_local_invocation_index: + result = visit_load_local_invocation_index(ctx); + break; + case nir_intrinsic_load_subgroup_id: + result = visit_load_subgroup_id(ctx); + break; + case nir_intrinsic_load_num_subgroups: + result = visit_load_num_subgroups(ctx); + break; + case nir_intrinsic_first_invocation: + result = visit_first_invocation(ctx); + break; + case nir_intrinsic_load_push_constant: + result = visit_load_push_constant(ctx, instr); + break; + case nir_intrinsic_vulkan_resource_index: { + LLVMValueRef index = get_src(ctx, instr->src[0]); + unsigned desc_set = nir_intrinsic_desc_set(instr); + unsigned binding = nir_intrinsic_binding(instr); + + result = ctx->abi->load_resource(ctx->abi, index, desc_set, binding); + break; + } + case nir_intrinsic_vulkan_resource_reindex: + result = visit_vulkan_resource_reindex(ctx, instr); + break; + case nir_intrinsic_store_ssbo: + visit_store_ssbo(ctx, instr); + break; + case nir_intrinsic_load_ssbo: + result = visit_load_buffer(ctx, instr); + break; + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + result = visit_atomic_ssbo(ctx, instr); + break; + case nir_intrinsic_load_ubo: + result = visit_load_ubo_buffer(ctx, instr); + break; + case nir_intrinsic_get_buffer_size: + result = visit_get_buffer_size(ctx, instr); + break; + case nir_intrinsic_load_deref: + result = visit_load_var(ctx, instr); + break; + case nir_intrinsic_store_deref: + visit_store_var(ctx, instr); + break; + case nir_intrinsic_load_input: + case nir_intrinsic_load_input_vertex: + case nir_intrinsic_load_per_vertex_input: + result = visit_load(ctx, instr, false); + break; + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: + result = visit_load(ctx, instr, true); + break; + case nir_intrinsic_store_output: + 
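
The load_frag_coord case above assembles gl_FragCoord as (x, y, z, 1/w): the first three position inputs are used directly and the fourth component is computed as 1.0 / frag_pos[3] via ac_build_fdiv, matching GLSL's definition of gl_FragCoord.w as the reciprocal of clip-space w. A standalone sketch of the same assembly, with invented input values:

#include <stdio.h>

struct frag_coord {
   float x, y, z, w;
};

/* pos[0..2] are window-space x, y, z; pos[3] is the interpolated w,
 * so gl_FragCoord.w is its reciprocal. */
static struct frag_coord make_frag_coord(const float pos[4])
{
   return (struct frag_coord){pos[0], pos[1], pos[2], 1.0f / pos[3]};
}

int main(void)
{
   float pos[4] = {64.5f, 32.5f, 0.25f, 2.0f}; /* hypothetical inputs */
   struct frag_coord fc = make_frag_coord(pos);
   printf("w = %f\n", fc.w); /* 0.5 */
   return 0;
}
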
case nir_intrinsic_store_per_vertex_output:
+      visit_store_output(ctx, instr);
+      break;
+   case nir_intrinsic_load_shared:
+      result = visit_load_shared(ctx, instr);
+      break;
+   case nir_intrinsic_store_shared:
+      visit_store_shared(ctx, instr);
+      break;
+   case nir_intrinsic_bindless_image_samples:
+   case nir_intrinsic_image_deref_samples:
+      result = visit_image_samples(ctx, instr);
+      break;
+   case nir_intrinsic_bindless_image_load:
+      result = visit_image_load(ctx, instr, true);
+      break;
+   case nir_intrinsic_image_deref_load:
+      result = visit_image_load(ctx, instr, false);
+      break;
+   case nir_intrinsic_bindless_image_store:
+      visit_image_store(ctx, instr, true);
+      break;
+   case nir_intrinsic_image_deref_store:
+      visit_image_store(ctx, instr, false);
+      break;
+   case nir_intrinsic_bindless_image_atomic_add:
+   case nir_intrinsic_bindless_image_atomic_imin:
+   case nir_intrinsic_bindless_image_atomic_umin:
+   case nir_intrinsic_bindless_image_atomic_imax:
+   case nir_intrinsic_bindless_image_atomic_umax:
+   case nir_intrinsic_bindless_image_atomic_and:
+   case nir_intrinsic_bindless_image_atomic_or:
+   case nir_intrinsic_bindless_image_atomic_xor:
+   case nir_intrinsic_bindless_image_atomic_exchange:
+   case nir_intrinsic_bindless_image_atomic_comp_swap:
+   case nir_intrinsic_bindless_image_atomic_inc_wrap:
+   case nir_intrinsic_bindless_image_atomic_dec_wrap:
+      result = visit_image_atomic(ctx, instr, true);
+      break;
+   case nir_intrinsic_image_deref_atomic_add:
+   case nir_intrinsic_image_deref_atomic_imin:
+   case nir_intrinsic_image_deref_atomic_umin:
+   case nir_intrinsic_image_deref_atomic_imax:
+   case nir_intrinsic_image_deref_atomic_umax:
+   case nir_intrinsic_image_deref_atomic_and:
+   case nir_intrinsic_image_deref_atomic_or:
+   case nir_intrinsic_image_deref_atomic_xor:
+   case nir_intrinsic_image_deref_atomic_exchange:
+   case nir_intrinsic_image_deref_atomic_comp_swap:
+   case nir_intrinsic_image_deref_atomic_inc_wrap:
+   case nir_intrinsic_image_deref_atomic_dec_wrap:
+      result = visit_image_atomic(ctx, instr, false);
+      break;
+   case nir_intrinsic_bindless_image_size:
+      result = visit_image_size(ctx, instr, true);
+      break;
+   case nir_intrinsic_image_deref_size:
+      result = visit_image_size(ctx, instr, false);
+      break;
+   case nir_intrinsic_shader_clock:
+      result = ac_build_shader_clock(&ctx->ac, nir_intrinsic_memory_scope(instr));
+      break;
+   case nir_intrinsic_discard:
+   case nir_intrinsic_discard_if:
+      emit_discard(ctx, instr);
+      break;
+   case nir_intrinsic_demote:
+   case nir_intrinsic_demote_if:
+      emit_demote(ctx, instr);
+      break;
+   case nir_intrinsic_memory_barrier:
+   case nir_intrinsic_group_memory_barrier:
+   case nir_intrinsic_memory_barrier_buffer:
+   case nir_intrinsic_memory_barrier_image:
+   case nir_intrinsic_memory_barrier_shared:
+      emit_membar(&ctx->ac, instr);
+      break;
+   case nir_intrinsic_scoped_barrier: {
+      assert(!(nir_intrinsic_memory_semantics(instr) &
+               (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
+
+      nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
+
+      unsigned wait_flags = 0;
+      if (modes & (nir_var_mem_global | nir_var_mem_ssbo))
+         wait_flags |= AC_WAIT_VLOAD | AC_WAIT_VSTORE;
+      if (modes & nir_var_mem_shared)
+         wait_flags |= AC_WAIT_LGKM;
+
+      if (wait_flags)
+         ac_build_waitcnt(&ctx->ac, wait_flags);
+
+      if (nir_intrinsic_execution_scope(instr) == NIR_SCOPE_WORKGROUP)
+         ac_emit_barrier(&ctx->ac, ctx->stage);
+      break;
+   }
+   case nir_intrinsic_memory_barrier_tcs_patch:
+      break;
+   case nir_intrinsic_control_barrier:
+      ac_emit_barrier(&ctx->ac, ctx->stage);
+      break;
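
The scoped_barrier case above reduces the NIR memory modes to a waitcnt mask (vector-memory waits for global/SSBO traffic, LGKM for LDS) before optionally emitting a workgroup execution barrier. The standalone sketch below mirrors only that mapping; the numeric values of the mode and wait flags are invented for the demo and do not match the real nir_var_* / AC_WAIT_* constants.

#include <stdio.h>

/* Invented bit values; only the mapping logic follows the code above. */
#define VAR_MEM_GLOBAL (1u << 0)
#define VAR_MEM_SSBO   (1u << 1)
#define VAR_MEM_SHARED (1u << 2)

#define WAIT_VLOAD  (1u << 0) /* vector-memory loads */
#define WAIT_VSTORE (1u << 1) /* vector-memory stores */
#define WAIT_LGKM   (1u << 2) /* LDS/GDS/constant traffic */

static unsigned wait_flags_for_modes(unsigned modes)
{
   unsigned wait_flags = 0;
   if (modes & (VAR_MEM_GLOBAL | VAR_MEM_SSBO))
      wait_flags |= WAIT_VLOAD | WAIT_VSTORE;
   if (modes & VAR_MEM_SHARED)
      wait_flags |= WAIT_LGKM;
   return wait_flags;
}

int main(void)
{
   /* SSBO + shared traffic needs all three waits: prints 0x7 */
   printf("0x%x\n", wait_flags_for_modes(VAR_MEM_SSBO | VAR_MEM_SHARED));
   return 0;
}

+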
case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_shared_atomic_fadd: { + LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], instr->src[1].ssa->bit_size); + result = visit_var_atomic(ctx, instr, ptr, 1); + break; + } + case nir_intrinsic_deref_atomic_add: + case nir_intrinsic_deref_atomic_imin: + case nir_intrinsic_deref_atomic_umin: + case nir_intrinsic_deref_atomic_imax: + case nir_intrinsic_deref_atomic_umax: + case nir_intrinsic_deref_atomic_and: + case nir_intrinsic_deref_atomic_or: + case nir_intrinsic_deref_atomic_xor: + case nir_intrinsic_deref_atomic_exchange: + case nir_intrinsic_deref_atomic_comp_swap: + case nir_intrinsic_deref_atomic_fadd: { + LLVMValueRef ptr = get_src(ctx, instr->src[0]); + result = visit_var_atomic(ctx, instr, ptr, 1); + break; + } + case nir_intrinsic_load_barycentric_pixel: + result = barycentric_center(ctx, nir_intrinsic_interp_mode(instr)); + break; + case nir_intrinsic_load_barycentric_centroid: + result = barycentric_centroid(ctx, nir_intrinsic_interp_mode(instr)); + break; + case nir_intrinsic_load_barycentric_sample: + result = barycentric_sample(ctx, nir_intrinsic_interp_mode(instr)); + break; + case nir_intrinsic_load_barycentric_model: + result = barycentric_model(ctx); + break; + case nir_intrinsic_load_barycentric_at_offset: { + LLVMValueRef offset = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0])); + result = barycentric_offset(ctx, nir_intrinsic_interp_mode(instr), offset); + break; + } + case nir_intrinsic_load_barycentric_at_sample: { + LLVMValueRef sample_id = get_src(ctx, instr->src[0]); + result = barycentric_at_sample(ctx, nir_intrinsic_interp_mode(instr), sample_id); + break; + } + case nir_intrinsic_load_interpolated_input: { + /* We assume any indirect loads have been lowered away */ + ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[1]); + assert(offset); + assert(offset[0].i32 == 0); + + LLVMValueRef interp_param = get_src(ctx, instr->src[0]); + unsigned index = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + result = load_interpolated_input(ctx, interp_param, index, component, + instr->dest.ssa.num_components, instr->dest.ssa.bit_size); + break; + } + case nir_intrinsic_emit_vertex: + ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->abi->outputs); + break; + case nir_intrinsic_emit_vertex_with_counter: { + unsigned stream = nir_intrinsic_stream_id(instr); + LLVMValueRef next_vertex = get_src(ctx, instr->src[0]); + ctx->abi->emit_vertex_with_counter(ctx->abi, stream, next_vertex, ctx->abi->outputs); + break; + } + case nir_intrinsic_end_primitive: + case nir_intrinsic_end_primitive_with_counter: + ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr)); + break; + case nir_intrinsic_load_tess_coord: + result = ctx->abi->load_tess_coord(ctx->abi); + break; + case nir_intrinsic_load_tess_level_outer: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, false); + break; + case nir_intrinsic_load_tess_level_inner: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, false); + break; + case 
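
The load_barycentric_* cases above produce an (i, j) coordinate pair, and load_interpolated_input then evaluates a vertex attribute with it. Conceptually (this is a model, not the driver's code) that evaluation is the classic two-step hardware interpolation result = p0 + i*(p1 - p0) + j*(p2 - p0):

#include <stdio.h>

/* p0, p1, p2 are the attribute's per-vertex values; (i, j) are the
 * barycentric weights supplied by the rasterizer. */
static float interp_attr(float p0, float p1, float p2, float i, float j)
{
   float p10 = p1 - p0;
   float p20 = p2 - p0;
   return p0 + i * p10 + j * p20; /* roughly v_interp_p1 then v_interp_p2 */
}

int main(void)
{
   /* at (i, j) = (0, 0) the result is simply p0 */
   printf("%f\n", interp_attr(1.0f, 2.0f, 4.0f, 0.25f, 0.5f)); /* 2.75 */
   return 0;
}
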
nir_intrinsic_load_tess_level_outer_default: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, true); + break; + case nir_intrinsic_load_tess_level_inner_default: + result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, true); + break; + case nir_intrinsic_load_patch_vertices_in: + result = ctx->abi->load_patch_vertices_in(ctx->abi); + break; + case nir_intrinsic_vote_all: { + LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, get_src(ctx, instr->src[0])); + result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + case nir_intrinsic_vote_any: { + LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, get_src(ctx, instr->src[0])); + result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, ""); + break; + } + case nir_intrinsic_shuffle: + if (ctx->ac.chip_class == GFX8 || ctx->ac.chip_class == GFX9 || + (ctx->ac.chip_class >= GFX10 && ctx->ac.wave_size == 32)) { + result = + ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]), get_src(ctx, instr->src[1])); + } else { + LLVMValueRef src = get_src(ctx, instr->src[0]); + LLVMValueRef index = get_src(ctx, instr->src[1]); + LLVMTypeRef type = LLVMTypeOf(src); + struct waterfall_context wctx; + LLVMValueRef index_val; + + index_val = enter_waterfall(ctx, &wctx, index, true); + + src = LLVMBuildZExt(ctx->ac.builder, src, ctx->ac.i32, ""); + + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.readlane", ctx->ac.i32, + (LLVMValueRef[]){src, index_val}, 2, + AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + + result = LLVMBuildTrunc(ctx->ac.builder, result, type, ""); + + result = exit_waterfall(ctx, &wctx, result); + } + break; + case nir_intrinsic_reduce: + result = ac_build_reduce(&ctx->ac, get_src(ctx, instr->src[0]), instr->const_index[0], + instr->const_index[1]); + break; + case nir_intrinsic_inclusive_scan: + result = + ac_build_inclusive_scan(&ctx->ac, get_src(ctx, instr->src[0]), instr->const_index[0]); + break; + case nir_intrinsic_exclusive_scan: + result = + ac_build_exclusive_scan(&ctx->ac, get_src(ctx, instr->src[0]), instr->const_index[0]); + break; + case nir_intrinsic_quad_broadcast: { + unsigned lane = nir_src_as_uint(instr->src[1]); + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), lane, lane, lane, lane); + break; + } + case nir_intrinsic_quad_swap_horizontal: + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 1, 0, 3, 2); + break; + case nir_intrinsic_quad_swap_vertical: + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 2, 3, 0, 1); + break; + case nir_intrinsic_quad_swap_diagonal: + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 3, 2, 1, 0); + break; + case nir_intrinsic_quad_swizzle_amd: { + uint32_t mask = nir_intrinsic_swizzle_mask(instr); + result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), mask & 0x3, + (mask >> 2) & 0x3, (mask >> 4) & 0x3, (mask >> 6) & 0x3); + break; + } + case nir_intrinsic_masked_swizzle_amd: { + uint32_t mask = nir_intrinsic_swizzle_mask(instr); + result = ac_build_ds_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), mask); + break; + } + case nir_intrinsic_write_invocation_amd: + result = ac_build_writelane(&ctx->ac, get_src(ctx, instr->src[0]), + get_src(ctx, instr->src[1]), get_src(ctx, instr->src[2])); + break; + case nir_intrinsic_mbcnt_amd: + result = ac_build_mbcnt(&ctx->ac, get_src(ctx, instr->src[0])); + break; + case nir_intrinsic_load_scratch: { + LLVMValueRef offset = get_src(ctx, instr->src[0]); + LLVMValueRef ptr = 
ac_build_gep0(&ctx->ac, ctx->scratch, offset); + LLVMTypeRef comp_type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + LLVMTypeRef vec_type = instr->dest.ssa.num_components == 1 + ? comp_type + : LLVMVectorType(comp_type, instr->dest.ssa.num_components); + unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(vec_type, addr_space), ""); + result = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + break; + } + case nir_intrinsic_store_scratch: { + LLVMValueRef offset = get_src(ctx, instr->src[1]); + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch, offset); + LLVMTypeRef comp_type = LLVMIntTypeInContext(ctx->ac.context, instr->src[0].ssa->bit_size); + unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(comp_type, addr_space), ""); + LLVMValueRef src = get_src(ctx, instr->src[0]); + unsigned wrmask = nir_intrinsic_write_mask(instr); + while (wrmask) { + int start, count; + u_bit_scan_consecutive_range(&wrmask, &start, &count); + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, start, false); + LLVMValueRef offset_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &offset, 1, ""); + LLVMTypeRef vec_type = count == 1 ? comp_type : LLVMVectorType(comp_type, count); + offset_ptr = LLVMBuildBitCast(ctx->ac.builder, offset_ptr, + LLVMPointerType(vec_type, addr_space), ""); + LLVMValueRef offset_src = ac_extract_components(&ctx->ac, src, start, count); + LLVMBuildStore(ctx->ac.builder, offset_src, offset_ptr); + } + break; + } + case nir_intrinsic_load_constant: { + unsigned base = nir_intrinsic_base(instr); + unsigned range = nir_intrinsic_range(instr); + + LLVMValueRef offset = get_src(ctx, instr->src[0]); + offset = LLVMBuildAdd(ctx->ac.builder, offset, LLVMConstInt(ctx->ac.i32, base, false), ""); + + /* Clamp the offset to avoid out-of-bound access because global + * instructions can't handle them. + */ + LLVMValueRef size = LLVMConstInt(ctx->ac.i32, base + range, false); + LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, offset, size, ""); + offset = LLVMBuildSelect(ctx->ac.builder, cond, offset, size, ""); + + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->constant_data, offset); + LLVMTypeRef comp_type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); + LLVMTypeRef vec_type = instr->dest.ssa.num_components == 1 + ? 
comp_type + : LLVMVectorType(comp_type, instr->dest.ssa.num_components); + unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(vec_type, addr_space), ""); + result = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + break; + } + default: + fprintf(stderr, "Unknown intrinsic: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + break; + } + if (result) { + ctx->ssa_defs[instr->dest.ssa.index] = result; + } } -static LLVMValueRef get_bindless_index_from_uniform(struct ac_nir_context *ctx, - unsigned base_index, - unsigned constant_index, - LLVMValueRef dynamic_index) +static LLVMValueRef get_bindless_index_from_uniform(struct ac_nir_context *ctx, unsigned base_index, + unsigned constant_index, + LLVMValueRef dynamic_index) { - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, base_index * 4, 0); - LLVMValueRef index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, - LLVMConstInt(ctx->ac.i32, constant_index, 0), ""); + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, base_index * 4, 0); + LLVMValueRef index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, + LLVMConstInt(ctx->ac.i32, constant_index, 0), ""); - /* Bindless uniforms are 64bit so multiple index by 8 */ - index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, 8, 0), ""); - offset = LLVMBuildAdd(ctx->ac.builder, offset, index, ""); + /* Bindless uniforms are 64bit so multiple index by 8 */ + index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, 8, 0), ""); + offset = LLVMBuildAdd(ctx->ac.builder, offset, index, ""); - LLVMValueRef ubo_index = ctx->abi->load_ubo(ctx->abi, ctx->ac.i32_0); + LLVMValueRef ubo_index = ctx->abi->load_ubo(ctx->abi, ctx->ac.i32_0); - LLVMValueRef ret = ac_build_buffer_load(&ctx->ac, ubo_index, 1, NULL, offset, - NULL, 0, 0, true, true); + LLVMValueRef ret = + ac_build_buffer_load(&ctx->ac, ubo_index, 1, NULL, offset, NULL, 0, 0, true, true); - return LLVMBuildBitCast(ctx->ac.builder, ret, ctx->ac.i32, ""); + return LLVMBuildBitCast(ctx->ac.builder, ret, ctx->ac.i32, ""); } struct sampler_desc_address { - unsigned descriptor_set; - unsigned base_index; /* binding in vulkan */ - unsigned constant_index; - LLVMValueRef dynamic_index; - bool image; - bool bindless; + unsigned descriptor_set; + unsigned base_index; /* binding in vulkan */ + unsigned constant_index; + LLVMValueRef dynamic_index; + bool image; + bool bindless; }; -static struct sampler_desc_address -get_sampler_desc_internal(struct ac_nir_context *ctx, - nir_deref_instr *deref_instr, - const nir_instr *instr, - bool image) +static struct sampler_desc_address get_sampler_desc_internal(struct ac_nir_context *ctx, + nir_deref_instr *deref_instr, + const nir_instr *instr, bool image) { - LLVMValueRef index = NULL; - unsigned constant_index = 0; - unsigned descriptor_set; - unsigned base_index; - bool bindless = false; - - if (!deref_instr) { - descriptor_set = 0; - if (image) { - nir_intrinsic_instr *img_instr = nir_instr_as_intrinsic(instr); - base_index = 0; - bindless = true; - index = get_src(ctx, img_instr->src[0]); - } else { - nir_tex_instr *tex_instr = nir_instr_as_tex(instr); - int sampSrcIdx = nir_tex_instr_src_index(tex_instr, - nir_tex_src_sampler_handle); - if (sampSrcIdx != -1) { - base_index = 0; - bindless = true; - index = get_src(ctx, tex_instr->src[sampSrcIdx].src); - } else { - assert(tex_instr && !image); - base_index = tex_instr->sampler_index; - } - } - } else { - while(deref_instr->deref_type != 
nir_deref_type_var) { - if (deref_instr->deref_type == nir_deref_type_array) { - unsigned array_size = glsl_get_aoa_size(deref_instr->type); - if (!array_size) - array_size = 1; - - if (nir_src_is_const(deref_instr->arr.index)) { - constant_index += array_size * nir_src_as_uint(deref_instr->arr.index); - } else { - LLVMValueRef indirect = get_src(ctx, deref_instr->arr.index); - - indirect = LLVMBuildMul(ctx->ac.builder, indirect, - LLVMConstInt(ctx->ac.i32, array_size, false), ""); - - if (!index) - index = indirect; - else - index = LLVMBuildAdd(ctx->ac.builder, index, indirect, ""); - } - - deref_instr = nir_src_as_deref(deref_instr->parent); - } else if (deref_instr->deref_type == nir_deref_type_struct) { - unsigned sidx = deref_instr->strct.index; - deref_instr = nir_src_as_deref(deref_instr->parent); - constant_index += glsl_get_struct_location_offset(deref_instr->type, sidx); - } else { - unreachable("Unsupported deref type"); - } - } - descriptor_set = deref_instr->var->data.descriptor_set; - - if (deref_instr->var->data.bindless) { - /* For now just assert on unhandled variable types */ - assert(deref_instr->var->data.mode == nir_var_uniform); - - base_index = deref_instr->var->data.driver_location; - bindless = true; - - index = index ? index : ctx->ac.i32_0; - index = get_bindless_index_from_uniform(ctx, base_index, - constant_index, index); - } else - base_index = deref_instr->var->data.binding; - } - return (struct sampler_desc_address) { - .descriptor_set = descriptor_set, - .base_index = base_index, - .constant_index = constant_index, - .dynamic_index = index, - .image = image, - .bindless = bindless, - }; + LLVMValueRef index = NULL; + unsigned constant_index = 0; + unsigned descriptor_set; + unsigned base_index; + bool bindless = false; + + if (!deref_instr) { + descriptor_set = 0; + if (image) { + nir_intrinsic_instr *img_instr = nir_instr_as_intrinsic(instr); + base_index = 0; + bindless = true; + index = get_src(ctx, img_instr->src[0]); + } else { + nir_tex_instr *tex_instr = nir_instr_as_tex(instr); + int sampSrcIdx = nir_tex_instr_src_index(tex_instr, nir_tex_src_sampler_handle); + if (sampSrcIdx != -1) { + base_index = 0; + bindless = true; + index = get_src(ctx, tex_instr->src[sampSrcIdx].src); + } else { + assert(tex_instr && !image); + base_index = tex_instr->sampler_index; + } + } + } else { + while (deref_instr->deref_type != nir_deref_type_var) { + if (deref_instr->deref_type == nir_deref_type_array) { + unsigned array_size = glsl_get_aoa_size(deref_instr->type); + if (!array_size) + array_size = 1; + + if (nir_src_is_const(deref_instr->arr.index)) { + constant_index += array_size * nir_src_as_uint(deref_instr->arr.index); + } else { + LLVMValueRef indirect = get_src(ctx, deref_instr->arr.index); + + indirect = LLVMBuildMul(ctx->ac.builder, indirect, + LLVMConstInt(ctx->ac.i32, array_size, false), ""); + + if (!index) + index = indirect; + else + index = LLVMBuildAdd(ctx->ac.builder, index, indirect, ""); + } + + deref_instr = nir_src_as_deref(deref_instr->parent); + } else if (deref_instr->deref_type == nir_deref_type_struct) { + unsigned sidx = deref_instr->strct.index; + deref_instr = nir_src_as_deref(deref_instr->parent); + constant_index += glsl_get_struct_location_offset(deref_instr->type, sidx); + } else { + unreachable("Unsupported deref type"); + } + } + descriptor_set = deref_instr->var->data.descriptor_set; + + if (deref_instr->var->data.bindless) { + /* For now just assert on unhandled variable types */ + assert(deref_instr->var->data.mode == 
nir_var_uniform); + + base_index = deref_instr->var->data.driver_location; + bindless = true; + + index = index ? index : ctx->ac.i32_0; + index = get_bindless_index_from_uniform(ctx, base_index, constant_index, index); + } else + base_index = deref_instr->var->data.binding; + } + return (struct sampler_desc_address){ + .descriptor_set = descriptor_set, + .base_index = base_index, + .constant_index = constant_index, + .dynamic_index = index, + .image = image, + .bindless = bindless, + }; } /* Extract any possibly divergent index into a separate value that can be fed * into get_sampler_desc with the same arguments. */ -static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, - nir_deref_instr *deref_instr, - const nir_instr *instr, - bool image) +static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, nir_deref_instr *deref_instr, + const nir_instr *instr, bool image) { - struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image); - return addr.dynamic_index; + struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image); + return addr.dynamic_index; } -static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, - nir_deref_instr *deref_instr, - enum ac_descriptor_type desc_type, - const nir_instr *instr, - LLVMValueRef index, - bool image, bool write) +static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, nir_deref_instr *deref_instr, + enum ac_descriptor_type desc_type, const nir_instr *instr, + LLVMValueRef index, bool image, bool write) { - struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image); - return ctx->abi->load_sampler_desc(ctx->abi, - addr.descriptor_set, - addr.base_index, - addr.constant_index, index, - desc_type, addr.image, write, addr.bindless); + struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image); + return ctx->abi->load_sampler_desc(ctx->abi, addr.descriptor_set, addr.base_index, + addr.constant_index, index, desc_type, addr.image, write, + addr.bindless); } /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL. @@ -4508,1232 +4182,1129 @@ static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, * GFX8: * The ANISO_OVERRIDE sampler field enables this fix in TA. 
*/ -static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx, - LLVMValueRef res, LLVMValueRef samp) +static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx, LLVMValueRef res, + LLVMValueRef samp) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef img7, samp0; - - if (ctx->ac.chip_class >= GFX8) - return samp; - - img7 = LLVMBuildExtractElement(builder, res, - LLVMConstInt(ctx->ac.i32, 7, 0), ""); - samp0 = LLVMBuildExtractElement(builder, samp, - LLVMConstInt(ctx->ac.i32, 0, 0), ""); - samp0 = LLVMBuildAnd(builder, samp0, img7, ""); - return LLVMBuildInsertElement(builder, samp, samp0, - LLVMConstInt(ctx->ac.i32, 0, 0), ""); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef img7, samp0; + + if (ctx->ac.chip_class >= GFX8) + return samp; + + img7 = LLVMBuildExtractElement(builder, res, LLVMConstInt(ctx->ac.i32, 7, 0), ""); + samp0 = LLVMBuildExtractElement(builder, samp, LLVMConstInt(ctx->ac.i32, 0, 0), ""); + samp0 = LLVMBuildAnd(builder, samp0, img7, ""); + return LLVMBuildInsertElement(builder, samp, samp0, LLVMConstInt(ctx->ac.i32, 0, 0), ""); } -static void tex_fetch_ptrs(struct ac_nir_context *ctx, - nir_tex_instr *instr, - struct waterfall_context *wctx, - LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, - LLVMValueRef *fmask_ptr) +static void tex_fetch_ptrs(struct ac_nir_context *ctx, nir_tex_instr *instr, + struct waterfall_context *wctx, LLVMValueRef *res_ptr, + LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr) { - nir_deref_instr *texture_deref_instr = NULL; - nir_deref_instr *sampler_deref_instr = NULL; - int plane = -1; - - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_texture_deref: - texture_deref_instr = nir_src_as_deref(instr->src[i].src); - break; - case nir_tex_src_sampler_deref: - sampler_deref_instr = nir_src_as_deref(instr->src[i].src); - break; - case nir_tex_src_plane: - plane = nir_src_as_int(instr->src[i].src); - break; - default: - break; - } - } - - LLVMValueRef texture_dynamic_index = get_sampler_desc_index(ctx, texture_deref_instr, - &instr->instr, false); - if (!sampler_deref_instr) - sampler_deref_instr = texture_deref_instr; - - LLVMValueRef sampler_dynamic_index = get_sampler_desc_index(ctx, sampler_deref_instr, - &instr->instr, false); - if (instr->texture_non_uniform) - texture_dynamic_index = enter_waterfall(ctx, wctx + 0, texture_dynamic_index, true); - - if (instr->sampler_non_uniform) - sampler_dynamic_index = enter_waterfall(ctx, wctx + 1, sampler_dynamic_index, true); - - enum ac_descriptor_type main_descriptor = instr->sampler_dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE; - - if (plane >= 0) { - assert(instr->op != nir_texop_txf_ms && - instr->op != nir_texop_samples_identical); - assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); - - main_descriptor = AC_DESC_PLANE_0 + plane; - } - - if (instr->op == nir_texop_fragment_mask_fetch) { - /* The fragment mask is fetched from the compressed - * multisampled surface. 
- */ - main_descriptor = AC_DESC_FMASK; - } - - *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr, - texture_dynamic_index, false, false); - - if (samp_ptr) { - *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr, - sampler_dynamic_index, false, false); - if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT) - *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr); - } - if (fmask_ptr && (instr->op == nir_texop_txf_ms || - instr->op == nir_texop_samples_identical)) - *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, - &instr->instr, texture_dynamic_index, false, false); + nir_deref_instr *texture_deref_instr = NULL; + nir_deref_instr *sampler_deref_instr = NULL; + int plane = -1; + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_texture_deref: + texture_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_sampler_deref: + sampler_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_plane: + plane = nir_src_as_int(instr->src[i].src); + break; + default: + break; + } + } + + LLVMValueRef texture_dynamic_index = + get_sampler_desc_index(ctx, texture_deref_instr, &instr->instr, false); + if (!sampler_deref_instr) + sampler_deref_instr = texture_deref_instr; + + LLVMValueRef sampler_dynamic_index = + get_sampler_desc_index(ctx, sampler_deref_instr, &instr->instr, false); + if (instr->texture_non_uniform) + texture_dynamic_index = enter_waterfall(ctx, wctx + 0, texture_dynamic_index, true); + + if (instr->sampler_non_uniform) + sampler_dynamic_index = enter_waterfall(ctx, wctx + 1, sampler_dynamic_index, true); + + enum ac_descriptor_type main_descriptor = + instr->sampler_dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE; + + if (plane >= 0) { + assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical); + assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); + + main_descriptor = AC_DESC_PLANE_0 + plane; + } + + if (instr->op == nir_texop_fragment_mask_fetch) { + /* The fragment mask is fetched from the compressed + * multisampled surface. 
+ */ + main_descriptor = AC_DESC_FMASK; + } + + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr, + texture_dynamic_index, false, false); + + if (samp_ptr) { + *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr, + sampler_dynamic_index, false, false); + if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT) + *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr); + } + if (fmask_ptr && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_samples_identical)) + *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, &instr->instr, + texture_dynamic_index, false, false); } -static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx, - LLVMValueRef coord) +static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx, LLVMValueRef coord) { - coord = ac_to_float(ctx, coord); - coord = ac_build_round(ctx, coord); - coord = ac_to_integer(ctx, coord); - return coord; + coord = ac_to_float(ctx, coord); + coord = ac_build_round(ctx, coord); + coord = ac_to_integer(ctx, coord); + return coord; } static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) { - LLVMValueRef result = NULL; - struct ac_image_args args = { 0 }; - LLVMValueRef fmask_ptr = NULL, sample_index = NULL; - LLVMValueRef ddx = NULL, ddy = NULL; - unsigned offset_src = 0; - struct waterfall_context wctx[2] = {{{0}}}; - - tex_fetch_ptrs(ctx, instr, wctx, &args.resource, &args.sampler, &fmask_ptr); - - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_coord: { - LLVMValueRef coord = get_src(ctx, instr->src[i].src); - for (unsigned chan = 0; chan < instr->coord_components; ++chan) - args.coords[chan] = ac_llvm_extract_elem(&ctx->ac, coord, chan); - break; - } - case nir_tex_src_projector: - break; - case nir_tex_src_comparator: - if (instr->is_shadow) { - args.compare = get_src(ctx, instr->src[i].src); - args.compare = ac_to_float(&ctx->ac, args.compare); - } - break; - case nir_tex_src_offset: - args.offset = get_src(ctx, instr->src[i].src); - offset_src = i; - break; - case nir_tex_src_bias: - args.bias = get_src(ctx, instr->src[i].src); - break; - case nir_tex_src_lod: { - if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) - args.level_zero = true; - else - args.lod = get_src(ctx, instr->src[i].src); - break; - } - case nir_tex_src_ms_index: - sample_index = get_src(ctx, instr->src[i].src); - break; - case nir_tex_src_ms_mcs: - break; - case nir_tex_src_ddx: - ddx = get_src(ctx, instr->src[i].src); - break; - case nir_tex_src_ddy: - ddy = get_src(ctx, instr->src[i].src); - break; - case nir_tex_src_min_lod: - args.min_lod = get_src(ctx, instr->src[i].src); - break; - case nir_tex_src_texture_offset: - case nir_tex_src_sampler_offset: - case nir_tex_src_plane: - default: - break; - } - } - - if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { - result = get_buffer_size(ctx, args.resource, true); - goto write_result; - } - - if (instr->op == nir_texop_texture_samples) { - LLVMValueRef res, samples, is_msaa; - LLVMValueRef default_sample; - - res = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, ""); - samples = LLVMBuildExtractElement(ctx->ac.builder, res, - LLVMConstInt(ctx->ac.i32, 3, false), ""); - is_msaa = LLVMBuildLShr(ctx->ac.builder, samples, - LLVMConstInt(ctx->ac.i32, 28, false), ""); - is_msaa = LLVMBuildAnd(ctx->ac.builder, is_msaa, - LLVMConstInt(ctx->ac.i32, 0xe, false), ""); - 
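
The nir_texop_texture_samples block that starts here reads the sample count straight out of the image descriptor. Below is a standalone model of that decode, assuming the usual GFX image-descriptor encoding: log2(samples) in bits [16,20) of dword 3 and the resource type in bits [28,32), where the two MSAA types are 14 and 15, which is why ((type & 0xe) == 0xe) identifies them.

#include <stdint.h>
#include <stdio.h>

static unsigned texture_samples(uint32_t dword3)
{
   unsigned type_msb = (dword3 >> 28) & 0xe;      /* top bits of resource type */
   unsigned log2_samples = (dword3 >> 16) & 0xf;  /* log2(sample count) */
   unsigned samples = 1u << log2_samples;
   return type_msb == 0xe ? samples : 1; /* non-MSAA views report 1 sample */
}

int main(void)
{
   /* hypothetical descriptor dword: MSAA type, log2(samples) = 3 */
   uint32_t dword3 = (0xeu << 28) | (3u << 16);
   printf("%u samples\n", texture_samples(dword3)); /* 8 samples */
   return 0;
}
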
is_msaa = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, is_msaa, - LLVMConstInt(ctx->ac.i32, 0xe, false), ""); - - samples = LLVMBuildLShr(ctx->ac.builder, samples, - LLVMConstInt(ctx->ac.i32, 16, false), ""); - samples = LLVMBuildAnd(ctx->ac.builder, samples, - LLVMConstInt(ctx->ac.i32, 0xf, false), ""); - samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1, - samples, ""); - - if (ctx->abi->robust_buffer_access) { - LLVMValueRef dword1, is_null_descriptor; - - /* Extract the second dword of the descriptor, if it's - * all zero, then it's a null descriptor. - */ - dword1 = LLVMBuildExtractElement(ctx->ac.builder, res, - LLVMConstInt(ctx->ac.i32, 1, false), ""); - is_null_descriptor = - LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, dword1, - LLVMConstInt(ctx->ac.i32, 0, false), ""); - default_sample = - LLVMBuildSelect(ctx->ac.builder, is_null_descriptor, - ctx->ac.i32_0, ctx->ac.i32_1, ""); - } else { - default_sample = ctx->ac.i32_1; - } - - samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples, - default_sample, ""); - result = samples; - goto write_result; - } - - if (args.offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) { - LLVMValueRef offset[3], pack; - for (unsigned chan = 0; chan < 3; ++chan) - offset[chan] = ctx->ac.i32_0; - - unsigned num_components = ac_get_llvm_num_components(args.offset); - for (unsigned chan = 0; chan < num_components; chan++) { - offset[chan] = ac_llvm_extract_elem(&ctx->ac, args.offset, chan); - offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan], - LLVMConstInt(ctx->ac.i32, 0x3f, false), ""); - if (chan) - offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan], - LLVMConstInt(ctx->ac.i32, chan * 8, false), ""); - } - pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], ""); - pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], ""); - args.offset = pack; - } - - /* Section 8.23.1 (Depth Texture Comparison Mode) of the - * OpenGL 4.5 spec says: - * - * "If the texture’s internal format indicates a fixed-point - * depth texture, then D_t and D_ref are clamped to the - * range [0, 1]; otherwise no clamping is performed." - * - * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT, - * so the depth comparison value isn't clamped for Z16 and - * Z24 anymore. Do it manually here for GFX8-9; GFX10 has - * an explicitly clamped 32-bit float format. 
- */ - if (args.compare && - ctx->ac.chip_class >= GFX8 && - ctx->ac.chip_class <= GFX9 && - ctx->abi->clamp_shadow_reference) { - LLVMValueRef upgraded, clamped; - - upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler, - LLVMConstInt(ctx->ac.i32, 3, false), ""); - upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded, - LLVMConstInt(ctx->ac.i32, 29, false), ""); - upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->ac.i1, ""); - clamped = ac_build_clamp(&ctx->ac, args.compare); - args.compare = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, - args.compare, ""); - } - - /* pack derivatives */ - if (ddx || ddy) { - int num_src_deriv_channels, num_dest_deriv_channels; - switch (instr->sampler_dim) { - case GLSL_SAMPLER_DIM_3D: - case GLSL_SAMPLER_DIM_CUBE: - num_src_deriv_channels = 3; - num_dest_deriv_channels = 3; - break; - case GLSL_SAMPLER_DIM_2D: - default: - num_src_deriv_channels = 2; - num_dest_deriv_channels = 2; - break; - case GLSL_SAMPLER_DIM_1D: - num_src_deriv_channels = 1; - if (ctx->ac.chip_class == GFX9) { - num_dest_deriv_channels = 2; - } else { - num_dest_deriv_channels = 1; - } - break; - } - - for (unsigned i = 0; i < num_src_deriv_channels; i++) { - args.derivs[i] = ac_to_float(&ctx->ac, - ac_llvm_extract_elem(&ctx->ac, ddx, i)); - args.derivs[num_dest_deriv_channels + i] = ac_to_float(&ctx->ac, - ac_llvm_extract_elem(&ctx->ac, ddy, i)); - } - for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) { - args.derivs[i] = ctx->ac.f32_0; - args.derivs[num_dest_deriv_channels + i] = ctx->ac.f32_0; - } - } - - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && args.coords[0]) { - for (unsigned chan = 0; chan < instr->coord_components; chan++) - args.coords[chan] = ac_to_float(&ctx->ac, args.coords[chan]); - if (instr->coord_components == 3) - args.coords[3] = LLVMGetUndef(ctx->ac.f32); - ac_prepare_cube_coords(&ctx->ac, - instr->op == nir_texop_txd, instr->is_array, - instr->op == nir_texop_lod, args.coords, args.derivs); - } - - /* Texture coordinates fixups */ - if (instr->coord_components > 1 && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && - instr->is_array && - instr->op != nir_texop_txf) { - args.coords[1] = apply_round_slice(&ctx->ac, args.coords[1]); - } - - if (instr->coord_components > 2 && - (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || - instr->sampler_dim == GLSL_SAMPLER_DIM_MS || - instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || - instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && - instr->is_array && - instr->op != nir_texop_txf && - instr->op != nir_texop_txf_ms && - instr->op != nir_texop_fragment_fetch && - instr->op != nir_texop_fragment_mask_fetch) { - args.coords[2] = apply_round_slice(&ctx->ac, args.coords[2]); - } - - if (ctx->ac.chip_class == GFX9 && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && - instr->op != nir_texop_lod) { - LLVMValueRef filler; - if (instr->op == nir_texop_txf) - filler = ctx->ac.i32_0; - else - filler = LLVMConstReal(ctx->ac.f32, 0.5); - - if (instr->is_array) - args.coords[2] = args.coords[1]; - args.coords[1] = filler; - } - - /* Pack sample index */ - if (sample_index && (instr->op == nir_texop_txf_ms || - instr->op == nir_texop_fragment_fetch)) - args.coords[instr->coord_components] = sample_index; - - if (instr->op == nir_texop_samples_identical) { - struct ac_image_args txf_args = { 0 }; - memcpy(txf_args.coords, args.coords, sizeof(txf_args.coords)); - - txf_args.dmask = 0xf; - txf_args.resource = fmask_ptr; - txf_args.dim = instr->is_array ? 
ac_image_2darray : ac_image_2d; - result = build_tex_intrinsic(ctx, instr, &txf_args); - - result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); - result = emit_int_cmp(&ctx->ac, LLVMIntEQ, result, ctx->ac.i32_0); - goto write_result; - } - - if ((instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS || - instr->sampler_dim == GLSL_SAMPLER_DIM_MS) && - instr->op != nir_texop_txs && - instr->op != nir_texop_fragment_fetch && - instr->op != nir_texop_fragment_mask_fetch) { - unsigned sample_chan = instr->is_array ? 3 : 2; - args.coords[sample_chan] = adjust_sample_index_using_fmask( - &ctx->ac, args.coords[0], args.coords[1], - instr->is_array ? args.coords[2] : NULL, - args.coords[sample_chan], fmask_ptr); - } - - if (args.offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) { - int num_offsets = instr->src[offset_src].src.ssa->num_components; - num_offsets = MIN2(num_offsets, instr->coord_components); - for (unsigned i = 0; i < num_offsets; ++i) { - args.coords[i] = LLVMBuildAdd( - ctx->ac.builder, args.coords[i], - LLVMConstInt(ctx->ac.i32, nir_src_comp_as_uint(instr->src[offset_src].src, i), false), ""); - } - args.offset = NULL; - } - - /* DMASK was repurposed for GATHER4. 4 components are always - * returned and DMASK works like a swizzle - it selects - * the component to fetch. The only valid DMASK values are - * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns - * (red,red,red,red) etc.) The ISA document doesn't mention - * this. - */ - args.dmask = 0xf; - if (instr->op == nir_texop_tg4) { - if (instr->is_shadow) - args.dmask = 1; - else - args.dmask = 1 << instr->component; - } - - if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) { - args.dim = ac_get_sampler_dim(ctx->ac.chip_class, instr->sampler_dim, instr->is_array); - args.unorm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT; - } - - /* Adjust the number of coordinates because we only need (x,y) for 2D - * multisampled images and (x,y,layer) for 2D multisampled layered - * images or for multisampled input attachments. 
- */ - if (instr->op == nir_texop_fragment_mask_fetch) { - if (args.dim == ac_image_2dmsaa) { - args.dim = ac_image_2d; - } else { - assert(args.dim == ac_image_2darraymsaa); - args.dim = ac_image_2darray; - } - } - - assert(instr->dest.is_ssa); - args.d16 = instr->dest.ssa.bit_size == 16; - - result = build_tex_intrinsic(ctx, instr, &args); - - if (instr->op == nir_texop_query_levels) - result = LLVMBuildExtractElement(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 3, false), ""); - else if (instr->is_shadow && instr->is_new_style_shadow && - instr->op != nir_texop_txs && instr->op != nir_texop_lod && - instr->op != nir_texop_tg4) - result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); - else if (instr->op == nir_texop_txs && - instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && - instr->is_array) { - LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); - LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); - LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, result, two, ""); - z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); - result = LLVMBuildInsertElement(ctx->ac.builder, result, z, two, ""); - } else if (ctx->ac.chip_class == GFX9 && - instr->op == nir_texop_txs && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && - instr->is_array) { - LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); - LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, result, two, ""); - result = LLVMBuildInsertElement(ctx->ac.builder, result, layers, - ctx->ac.i32_1, ""); - } else if (instr->dest.ssa.num_components != 4) - result = ac_trim_vector(&ctx->ac, result, instr->dest.ssa.num_components); + LLVMValueRef result = NULL; + struct ac_image_args args = {0}; + LLVMValueRef fmask_ptr = NULL, sample_index = NULL; + LLVMValueRef ddx = NULL, ddy = NULL; + unsigned offset_src = 0; + struct waterfall_context wctx[2] = {{{0}}}; + + tex_fetch_ptrs(ctx, instr, wctx, &args.resource, &args.sampler, &fmask_ptr); + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_coord: { + LLVMValueRef coord = get_src(ctx, instr->src[i].src); + for (unsigned chan = 0; chan < instr->coord_components; ++chan) + args.coords[chan] = ac_llvm_extract_elem(&ctx->ac, coord, chan); + break; + } + case nir_tex_src_projector: + break; + case nir_tex_src_comparator: + if (instr->is_shadow) { + args.compare = get_src(ctx, instr->src[i].src); + args.compare = ac_to_float(&ctx->ac, args.compare); + } + break; + case nir_tex_src_offset: + args.offset = get_src(ctx, instr->src[i].src); + offset_src = i; + break; + case nir_tex_src_bias: + args.bias = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_lod: { + if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) + args.level_zero = true; + else + args.lod = get_src(ctx, instr->src[i].src); + break; + } + case nir_tex_src_ms_index: + sample_index = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_ms_mcs: + break; + case nir_tex_src_ddx: + ddx = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_ddy: + ddy = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_min_lod: + args.min_lod = get_src(ctx, instr->src[i].src); + break; + case nir_tex_src_texture_offset: + case nir_tex_src_sampler_offset: + case nir_tex_src_plane: + default: + break; + } + } + + if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + result = get_buffer_size(ctx, args.resource, true); + goto write_result; + } + + if (instr->op == 
nir_texop_texture_samples) { + LLVMValueRef res, samples, is_msaa; + LLVMValueRef default_sample; + + res = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, ""); + samples = + LLVMBuildExtractElement(ctx->ac.builder, res, LLVMConstInt(ctx->ac.i32, 3, false), ""); + is_msaa = LLVMBuildLShr(ctx->ac.builder, samples, LLVMConstInt(ctx->ac.i32, 28, false), ""); + is_msaa = LLVMBuildAnd(ctx->ac.builder, is_msaa, LLVMConstInt(ctx->ac.i32, 0xe, false), ""); + is_msaa = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, is_msaa, + LLVMConstInt(ctx->ac.i32, 0xe, false), ""); + + samples = LLVMBuildLShr(ctx->ac.builder, samples, LLVMConstInt(ctx->ac.i32, 16, false), ""); + samples = LLVMBuildAnd(ctx->ac.builder, samples, LLVMConstInt(ctx->ac.i32, 0xf, false), ""); + samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1, samples, ""); + + if (ctx->abi->robust_buffer_access) { + LLVMValueRef dword1, is_null_descriptor; + + /* Extract the second dword of the descriptor, if it's + * all zero, then it's a null descriptor. + */ + dword1 = + LLVMBuildExtractElement(ctx->ac.builder, res, LLVMConstInt(ctx->ac.i32, 1, false), ""); + is_null_descriptor = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, dword1, + LLVMConstInt(ctx->ac.i32, 0, false), ""); + default_sample = + LLVMBuildSelect(ctx->ac.builder, is_null_descriptor, ctx->ac.i32_0, ctx->ac.i32_1, ""); + } else { + default_sample = ctx->ac.i32_1; + } + + samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples, default_sample, ""); + result = samples; + goto write_result; + } + + if (args.offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) { + LLVMValueRef offset[3], pack; + for (unsigned chan = 0; chan < 3; ++chan) + offset[chan] = ctx->ac.i32_0; + + unsigned num_components = ac_get_llvm_num_components(args.offset); + for (unsigned chan = 0; chan < num_components; chan++) { + offset[chan] = ac_llvm_extract_elem(&ctx->ac, args.offset, chan); + offset[chan] = + LLVMBuildAnd(ctx->ac.builder, offset[chan], LLVMConstInt(ctx->ac.i32, 0x3f, false), ""); + if (chan) + offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan], + LLVMConstInt(ctx->ac.i32, chan * 8, false), ""); + } + pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], ""); + pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], ""); + args.offset = pack; + } + + /* Section 8.23.1 (Depth Texture Comparison Mode) of the + * OpenGL 4.5 spec says: + * + * "If the texture’s internal format indicates a fixed-point + * depth texture, then D_t and D_ref are clamped to the + * range [0, 1]; otherwise no clamping is performed." + * + * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT, + * so the depth comparison value isn't clamped for Z16 and + * Z24 anymore. Do it manually here for GFX8-9; GFX10 has + * an explicitly clamped 32-bit float format. 
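+ *
+ * A sketch of the check emitted below (a driver-side convention,
+ * not something the hardware defines): the "promoted to Z32_FLOAT"
+ * flag is assumed to be stashed in bit 29 of sampler dword 3, so
+ * the fixup is roughly
+ *
+ *    upgraded = (sampler_word3 >> 29) & 1;
+ *    D_ref    = upgraded ? CLAMP(D_ref, 0.0f, 1.0f) : D_ref;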
+ */ + if (args.compare && ctx->ac.chip_class >= GFX8 && ctx->ac.chip_class <= GFX9 && + ctx->abi->clamp_shadow_reference) { + LLVMValueRef upgraded, clamped; + + upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler, + LLVMConstInt(ctx->ac.i32, 3, false), ""); + upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded, LLVMConstInt(ctx->ac.i32, 29, false), ""); + upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->ac.i1, ""); + clamped = ac_build_clamp(&ctx->ac, args.compare); + args.compare = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, args.compare, ""); + } + + /* pack derivatives */ + if (ddx || ddy) { + int num_src_deriv_channels, num_dest_deriv_channels; + switch (instr->sampler_dim) { + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + num_src_deriv_channels = 3; + num_dest_deriv_channels = 3; + break; + case GLSL_SAMPLER_DIM_2D: + default: + num_src_deriv_channels = 2; + num_dest_deriv_channels = 2; + break; + case GLSL_SAMPLER_DIM_1D: + num_src_deriv_channels = 1; + if (ctx->ac.chip_class == GFX9) { + num_dest_deriv_channels = 2; + } else { + num_dest_deriv_channels = 1; + } + break; + } + + for (unsigned i = 0; i < num_src_deriv_channels; i++) { + args.derivs[i] = ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddx, i)); + args.derivs[num_dest_deriv_channels + i] = + ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddy, i)); + } + for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) { + args.derivs[i] = ctx->ac.f32_0; + args.derivs[num_dest_deriv_channels + i] = ctx->ac.f32_0; + } + } + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && args.coords[0]) { + for (unsigned chan = 0; chan < instr->coord_components; chan++) + args.coords[chan] = ac_to_float(&ctx->ac, args.coords[chan]); + if (instr->coord_components == 3) + args.coords[3] = LLVMGetUndef(ctx->ac.f32); + ac_prepare_cube_coords(&ctx->ac, instr->op == nir_texop_txd, instr->is_array, + instr->op == nir_texop_lod, args.coords, args.derivs); + } + + /* Texture coordinates fixups */ + if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->is_array && instr->op != nir_texop_txf) { + args.coords[1] = apply_round_slice(&ctx->ac, args.coords[1]); + } + + if (instr->coord_components > 2 && + (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && + instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms && + instr->op != nir_texop_fragment_fetch && instr->op != nir_texop_fragment_mask_fetch) { + args.coords[2] = apply_round_slice(&ctx->ac, args.coords[2]); + } + + if (ctx->ac.chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->op != nir_texop_lod) { + LLVMValueRef filler; + if (instr->op == nir_texop_txf) + filler = ctx->ac.i32_0; + else + filler = LLVMConstReal(ctx->ac.f32, 0.5); + + if (instr->is_array) + args.coords[2] = args.coords[1]; + args.coords[1] = filler; + } + + /* Pack sample index */ + if (sample_index && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_fragment_fetch)) + args.coords[instr->coord_components] = sample_index; + + if (instr->op == nir_texop_samples_identical) { + struct ac_image_args txf_args = {0}; + memcpy(txf_args.coords, args.coords, sizeof(txf_args.coords)); + + txf_args.dmask = 0xf; + txf_args.resource = fmask_ptr; + txf_args.dim = instr->is_array ? 
ac_image_2darray : ac_image_2d; + result = build_tex_intrinsic(ctx, instr, &txf_args); + + result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + result = emit_int_cmp(&ctx->ac, LLVMIntEQ, result, ctx->ac.i32_0); + goto write_result; + } + + if ((instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_MS) && + instr->op != nir_texop_txs && instr->op != nir_texop_fragment_fetch && + instr->op != nir_texop_fragment_mask_fetch) { + unsigned sample_chan = instr->is_array ? 3 : 2; + args.coords[sample_chan] = adjust_sample_index_using_fmask( + &ctx->ac, args.coords[0], args.coords[1], instr->is_array ? args.coords[2] : NULL, + args.coords[sample_chan], fmask_ptr); + } + + if (args.offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) { + int num_offsets = instr->src[offset_src].src.ssa->num_components; + num_offsets = MIN2(num_offsets, instr->coord_components); + for (unsigned i = 0; i < num_offsets; ++i) { + args.coords[i] = LLVMBuildAdd( + ctx->ac.builder, args.coords[i], + LLVMConstInt(ctx->ac.i32, nir_src_comp_as_uint(instr->src[offset_src].src, i), false), + ""); + } + args.offset = NULL; + } + + /* DMASK was repurposed for GATHER4. 4 components are always + * returned and DMASK works like a swizzle - it selects + * the component to fetch. The only valid DMASK values are + * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns + * (red,red,red,red) etc.) The ISA document doesn't mention + * this. + */ + args.dmask = 0xf; + if (instr->op == nir_texop_tg4) { + if (instr->is_shadow) + args.dmask = 1; + else + args.dmask = 1 << instr->component; + } + + if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) { + args.dim = ac_get_sampler_dim(ctx->ac.chip_class, instr->sampler_dim, instr->is_array); + args.unorm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT; + } + + /* Adjust the number of coordinates because we only need (x,y) for 2D + * multisampled images and (x,y,layer) for 2D multisampled layered + * images or for multisampled input attachments. 
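+ *
+ * Concretely, nir_texop_fragment_mask_fetch never needs the sample
+ * index as part of the address, so ac_image_2dmsaa is demoted to
+ * ac_image_2d and ac_image_2darraymsaa to ac_image_2darray below.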
+ */ + if (instr->op == nir_texop_fragment_mask_fetch) { + if (args.dim == ac_image_2dmsaa) { + args.dim = ac_image_2d; + } else { + assert(args.dim == ac_image_2darraymsaa); + args.dim = ac_image_2darray; + } + } + + assert(instr->dest.is_ssa); + args.d16 = instr->dest.ssa.bit_size == 16; + + result = build_tex_intrinsic(ctx, instr, &args); + + if (instr->op == nir_texop_query_levels) + result = + LLVMBuildExtractElement(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 3, false), ""); + else if (instr->is_shadow && instr->is_new_style_shadow && instr->op != nir_texop_txs && + instr->op != nir_texop_lod && instr->op != nir_texop_tg4) + result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); + else if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + instr->is_array) { + LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false); + LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, result, two, ""); + z = LLVMBuildSDiv(ctx->ac.builder, z, six, ""); + result = LLVMBuildInsertElement(ctx->ac.builder, result, z, two, ""); + } else if (ctx->ac.chip_class == GFX9 && instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) { + LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, result, two, ""); + result = LLVMBuildInsertElement(ctx->ac.builder, result, layers, ctx->ac.i32_1, ""); + } else if (instr->dest.ssa.num_components != 4) + result = ac_trim_vector(&ctx->ac, result, instr->dest.ssa.num_components); write_result: - if (result) { - assert(instr->dest.is_ssa); - result = ac_to_integer(&ctx->ac, result); + if (result) { + assert(instr->dest.is_ssa); + result = ac_to_integer(&ctx->ac, result); - for (int i = ARRAY_SIZE(wctx); --i >= 0;) { - result = exit_waterfall(ctx, wctx + i, result); - } + for (int i = ARRAY_SIZE(wctx); --i >= 0;) { + result = exit_waterfall(ctx, wctx + i, result); + } - ctx->ssa_defs[instr->dest.ssa.index] = result; - } + ctx->ssa_defs[instr->dest.ssa.index] = result; + } } static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr) { - LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa); - LLVMValueRef result = LLVMBuildPhi(ctx->ac.builder, type, ""); + LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa); + LLVMValueRef result = LLVMBuildPhi(ctx->ac.builder, type, ""); - ctx->ssa_defs[instr->dest.ssa.index] = result; - _mesa_hash_table_insert(ctx->phis, instr, result); + ctx->ssa_defs[instr->dest.ssa.index] = result; + _mesa_hash_table_insert(ctx->phis, instr, result); } -static void visit_post_phi(struct ac_nir_context *ctx, - nir_phi_instr *instr, - LLVMValueRef llvm_phi) +static void visit_post_phi(struct ac_nir_context *ctx, nir_phi_instr *instr, LLVMValueRef llvm_phi) { - nir_foreach_phi_src(src, instr) { - LLVMBasicBlockRef block = get_block(ctx, src->pred); - LLVMValueRef llvm_src = get_src(ctx, src->src); + nir_foreach_phi_src (src, instr) { + LLVMBasicBlockRef block = get_block(ctx, src->pred); + LLVMValueRef llvm_src = get_src(ctx, src->src); - LLVMAddIncoming(llvm_phi, &llvm_src, &block, 1); - } + LLVMAddIncoming(llvm_phi, &llvm_src, &block, 1); + } } static void phi_post_pass(struct ac_nir_context *ctx) { - hash_table_foreach(ctx->phis, entry) { - visit_post_phi(ctx, (nir_phi_instr*)entry->key, - (LLVMValueRef)entry->data); - } + hash_table_foreach(ctx->phis, entry) + { + visit_post_phi(ctx, (nir_phi_instr *)entry->key, 
(LLVMValueRef)entry->data); + } } - -static bool is_def_used_in_an_export(const nir_ssa_def* def) { - nir_foreach_use(use_src, def) { - if (use_src->parent_instr->type == nir_instr_type_intrinsic) { - nir_intrinsic_instr *instr = nir_instr_as_intrinsic(use_src->parent_instr); - if (instr->intrinsic == nir_intrinsic_store_deref) - return true; - } else if (use_src->parent_instr->type == nir_instr_type_alu) { - nir_alu_instr *instr = nir_instr_as_alu(use_src->parent_instr); - if (instr->op == nir_op_vec4 && - is_def_used_in_an_export(&instr->dest.dest.ssa)) { - return true; - } - } - } - return false; +static bool is_def_used_in_an_export(const nir_ssa_def *def) +{ + nir_foreach_use (use_src, def) { + if (use_src->parent_instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *instr = nir_instr_as_intrinsic(use_src->parent_instr); + if (instr->intrinsic == nir_intrinsic_store_deref) + return true; + } else if (use_src->parent_instr->type == nir_instr_type_alu) { + nir_alu_instr *instr = nir_instr_as_alu(use_src->parent_instr); + if (instr->op == nir_op_vec4 && is_def_used_in_an_export(&instr->dest.dest.ssa)) { + return true; + } + } + } + return false; } -static void visit_ssa_undef(struct ac_nir_context *ctx, - const nir_ssa_undef_instr *instr) +static void visit_ssa_undef(struct ac_nir_context *ctx, const nir_ssa_undef_instr *instr) { - unsigned num_components = instr->def.num_components; - LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size); - - if (!ctx->abi->convert_undef_to_zero || is_def_used_in_an_export(&instr->def)) { - LLVMValueRef undef; - - if (num_components == 1) - undef = LLVMGetUndef(type); - else { - undef = LLVMGetUndef(LLVMVectorType(type, num_components)); - } - ctx->ssa_defs[instr->def.index] = undef; - } else { - LLVMValueRef zero = LLVMConstInt(type, 0, false); - if (num_components > 1) { - zero = ac_build_gather_values_extended( - &ctx->ac, &zero, 4, 0, false, false); - } - ctx->ssa_defs[instr->def.index] = zero; - } + unsigned num_components = instr->def.num_components; + LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size); + + if (!ctx->abi->convert_undef_to_zero || is_def_used_in_an_export(&instr->def)) { + LLVMValueRef undef; + + if (num_components == 1) + undef = LLVMGetUndef(type); + else { + undef = LLVMGetUndef(LLVMVectorType(type, num_components)); + } + ctx->ssa_defs[instr->def.index] = undef; + } else { + LLVMValueRef zero = LLVMConstInt(type, 0, false); + if (num_components > 1) { + zero = ac_build_gather_values_extended(&ctx->ac, &zero, 4, 0, false, false); + } + ctx->ssa_defs[instr->def.index] = zero; + } } -static void visit_jump(struct ac_llvm_context *ctx, - const nir_jump_instr *instr) +static void visit_jump(struct ac_llvm_context *ctx, const nir_jump_instr *instr) { - switch (instr->type) { - case nir_jump_break: - ac_build_break(ctx); - break; - case nir_jump_continue: - ac_build_continue(ctx); - break; - default: - fprintf(stderr, "Unknown NIR jump instr: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - abort(); - } + switch (instr->type) { + case nir_jump_break: + ac_build_break(ctx); + break; + case nir_jump_continue: + ac_build_continue(ctx); + break; + default: + fprintf(stderr, "Unknown NIR jump instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } } -static LLVMTypeRef -glsl_base_to_llvm_type(struct ac_llvm_context *ac, - enum glsl_base_type type) +static LLVMTypeRef glsl_base_to_llvm_type(struct ac_llvm_context *ac, 
enum glsl_base_type type) { - switch (type) { - case GLSL_TYPE_INT: - case GLSL_TYPE_UINT: - case GLSL_TYPE_BOOL: - case GLSL_TYPE_SUBROUTINE: - return ac->i32; - case GLSL_TYPE_INT8: - case GLSL_TYPE_UINT8: - return ac->i8; - case GLSL_TYPE_INT16: - case GLSL_TYPE_UINT16: - return ac->i16; - case GLSL_TYPE_FLOAT: - return ac->f32; - case GLSL_TYPE_FLOAT16: - return ac->f16; - case GLSL_TYPE_INT64: - case GLSL_TYPE_UINT64: - return ac->i64; - case GLSL_TYPE_DOUBLE: - return ac->f64; - default: - unreachable("unknown GLSL type"); - } + switch (type) { + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_BOOL: + case GLSL_TYPE_SUBROUTINE: + return ac->i32; + case GLSL_TYPE_INT8: + case GLSL_TYPE_UINT8: + return ac->i8; + case GLSL_TYPE_INT16: + case GLSL_TYPE_UINT16: + return ac->i16; + case GLSL_TYPE_FLOAT: + return ac->f32; + case GLSL_TYPE_FLOAT16: + return ac->f16; + case GLSL_TYPE_INT64: + case GLSL_TYPE_UINT64: + return ac->i64; + case GLSL_TYPE_DOUBLE: + return ac->f64; + default: + unreachable("unknown GLSL type"); + } } -static LLVMTypeRef -glsl_to_llvm_type(struct ac_llvm_context *ac, - const struct glsl_type *type) +static LLVMTypeRef glsl_to_llvm_type(struct ac_llvm_context *ac, const struct glsl_type *type) { - if (glsl_type_is_scalar(type)) { - return glsl_base_to_llvm_type(ac, glsl_get_base_type(type)); - } - - if (glsl_type_is_vector(type)) { - return LLVMVectorType( - glsl_base_to_llvm_type(ac, glsl_get_base_type(type)), - glsl_get_vector_elements(type)); - } - - if (glsl_type_is_matrix(type)) { - return LLVMArrayType( - glsl_to_llvm_type(ac, glsl_get_column_type(type)), - glsl_get_matrix_columns(type)); - } - - if (glsl_type_is_array(type)) { - return LLVMArrayType( - glsl_to_llvm_type(ac, glsl_get_array_element(type)), - glsl_get_length(type)); - } - - assert(glsl_type_is_struct_or_ifc(type)); - - LLVMTypeRef member_types[glsl_get_length(type)]; - - for (unsigned i = 0; i < glsl_get_length(type); i++) { - member_types[i] = - glsl_to_llvm_type(ac, - glsl_get_struct_field(type, i)); - } - - return LLVMStructTypeInContext(ac->context, member_types, - glsl_get_length(type), false); + if (glsl_type_is_scalar(type)) { + return glsl_base_to_llvm_type(ac, glsl_get_base_type(type)); + } + + if (glsl_type_is_vector(type)) { + return LLVMVectorType(glsl_base_to_llvm_type(ac, glsl_get_base_type(type)), + glsl_get_vector_elements(type)); + } + + if (glsl_type_is_matrix(type)) { + return LLVMArrayType(glsl_to_llvm_type(ac, glsl_get_column_type(type)), + glsl_get_matrix_columns(type)); + } + + if (glsl_type_is_array(type)) { + return LLVMArrayType(glsl_to_llvm_type(ac, glsl_get_array_element(type)), + glsl_get_length(type)); + } + + assert(glsl_type_is_struct_or_ifc(type)); + + LLVMTypeRef member_types[glsl_get_length(type)]; + + for (unsigned i = 0; i < glsl_get_length(type); i++) { + member_types[i] = glsl_to_llvm_type(ac, glsl_get_struct_field(type, i)); + } + + return LLVMStructTypeInContext(ac->context, member_types, glsl_get_length(type), false); } -static void visit_deref(struct ac_nir_context *ctx, - nir_deref_instr *instr) +static void visit_deref(struct ac_nir_context *ctx, nir_deref_instr *instr) { - if (instr->mode != nir_var_mem_shared && - instr->mode != nir_var_mem_global) - return; - - LLVMValueRef result = NULL; - switch(instr->deref_type) { - case nir_deref_type_var: { - struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, instr->var); - result = entry->data; - break; - } - case nir_deref_type_struct: - if (instr->mode == nir_var_mem_global) { - 
nir_deref_instr *parent = nir_deref_instr_parent(instr); - uint64_t offset = glsl_get_struct_field_offset(parent->type, - instr->strct.index); - result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), - LLVMConstInt(ctx->ac.i32, offset, 0)); - } else { - result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), - LLVMConstInt(ctx->ac.i32, instr->strct.index, 0)); - } - break; - case nir_deref_type_array: - if (instr->mode == nir_var_mem_global) { - nir_deref_instr *parent = nir_deref_instr_parent(instr); - unsigned stride = glsl_get_explicit_stride(parent->type); - - if ((glsl_type_is_matrix(parent->type) && - glsl_matrix_type_is_row_major(parent->type)) || - (glsl_type_is_vector(parent->type) && stride == 0)) - stride = type_scalar_size_bytes(parent->type); - - assert(stride > 0); - LLVMValueRef index = get_src(ctx, instr->arr.index); - if (LLVMTypeOf(index) != ctx->ac.i64) - index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, ""); - - LLVMValueRef offset = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), ""); - - result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset); - } else { - result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), - get_src(ctx, instr->arr.index)); - } - break; - case nir_deref_type_ptr_as_array: - if (instr->mode == nir_var_mem_global) { - unsigned stride = nir_deref_instr_array_stride(instr); - - LLVMValueRef index = get_src(ctx, instr->arr.index); - if (LLVMTypeOf(index) != ctx->ac.i64) - index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, ""); - - LLVMValueRef offset = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), ""); - - result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset); - } else { - result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), - get_src(ctx, instr->arr.index)); - } - break; - case nir_deref_type_cast: { - result = get_src(ctx, instr->parent); - - /* We can't use the structs from LLVM because the shader - * specifies its own offsets. 
*/ - LLVMTypeRef pointee_type = ctx->ac.i8; - if (instr->mode == nir_var_mem_shared) - pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type); - - unsigned address_space; - - switch(instr->mode) { - case nir_var_mem_shared: - address_space = AC_ADDR_SPACE_LDS; - break; - case nir_var_mem_global: - address_space = AC_ADDR_SPACE_GLOBAL; - break; - default: - unreachable("Unhandled address space"); - } - - LLVMTypeRef type = LLVMPointerType(pointee_type, address_space); - - if (LLVMTypeOf(result) != type) { - if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) { - result = LLVMBuildBitCast(ctx->ac.builder, result, - type, ""); - } else { - result = LLVMBuildIntToPtr(ctx->ac.builder, result, - type, ""); - } - } - break; - } - default: - unreachable("Unhandled deref_instr deref type"); - } - - ctx->ssa_defs[instr->dest.ssa.index] = result; + if (instr->mode != nir_var_mem_shared && instr->mode != nir_var_mem_global) + return; + + LLVMValueRef result = NULL; + switch (instr->deref_type) { + case nir_deref_type_var: { + struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, instr->var); + result = entry->data; + break; + } + case nir_deref_type_struct: + if (instr->mode == nir_var_mem_global) { + nir_deref_instr *parent = nir_deref_instr_parent(instr); + uint64_t offset = glsl_get_struct_field_offset(parent->type, instr->strct.index); + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), + LLVMConstInt(ctx->ac.i32, offset, 0)); + } else { + result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), + LLVMConstInt(ctx->ac.i32, instr->strct.index, 0)); + } + break; + case nir_deref_type_array: + if (instr->mode == nir_var_mem_global) { + nir_deref_instr *parent = nir_deref_instr_parent(instr); + unsigned stride = glsl_get_explicit_stride(parent->type); + + if ((glsl_type_is_matrix(parent->type) && glsl_matrix_type_is_row_major(parent->type)) || + (glsl_type_is_vector(parent->type) && stride == 0)) + stride = type_scalar_size_bytes(parent->type); + + assert(stride > 0); + LLVMValueRef index = get_src(ctx, instr->arr.index); + if (LLVMTypeOf(index) != ctx->ac.i64) + index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, ""); + + LLVMValueRef offset = + LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), ""); + + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset); + } else { + result = + ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), get_src(ctx, instr->arr.index)); + } + break; + case nir_deref_type_ptr_as_array: + if (instr->mode == nir_var_mem_global) { + unsigned stride = nir_deref_instr_array_stride(instr); + + LLVMValueRef index = get_src(ctx, instr->arr.index); + if (LLVMTypeOf(index) != ctx->ac.i64) + index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, ""); + + LLVMValueRef offset = + LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), ""); + + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset); + } else { + result = + ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), get_src(ctx, instr->arr.index)); + } + break; + case nir_deref_type_cast: { + result = get_src(ctx, instr->parent); + + /* We can't use the structs from LLVM because the shader + * specifies its own offsets. 
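+ *
+ * With an i8 pointee for global memory, a GEP advances by raw
+ * bytes, so the byte offsets NIR derived from the shader's own
+ * layout apply directly; as a sketch:
+ *
+ *    uint8_t *base = ...;         // nir_var_mem_global pointer
+ *    field = base + byte_offset;  // offset chosen by the shader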
*/ + LLVMTypeRef pointee_type = ctx->ac.i8; + if (instr->mode == nir_var_mem_shared) + pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type); + + unsigned address_space; + + switch (instr->mode) { + case nir_var_mem_shared: + address_space = AC_ADDR_SPACE_LDS; + break; + case nir_var_mem_global: + address_space = AC_ADDR_SPACE_GLOBAL; + break; + default: + unreachable("Unhandled address space"); + } + + LLVMTypeRef type = LLVMPointerType(pointee_type, address_space); + + if (LLVMTypeOf(result) != type) { + if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) { + result = LLVMBuildBitCast(ctx->ac.builder, result, type, ""); + } else { + result = LLVMBuildIntToPtr(ctx->ac.builder, result, type, ""); + } + } + break; + } + default: + unreachable("Unhandled deref_instr deref type"); + } + + ctx->ssa_defs[instr->dest.ssa.index] = result; } -static void visit_cf_list(struct ac_nir_context *ctx, - struct exec_list *list); +static void visit_cf_list(struct ac_nir_context *ctx, struct exec_list *list); static void visit_block(struct ac_nir_context *ctx, nir_block *block) { - nir_foreach_instr(instr, block) - { - switch (instr->type) { - case nir_instr_type_alu: - visit_alu(ctx, nir_instr_as_alu(instr)); - break; - case nir_instr_type_load_const: - visit_load_const(ctx, nir_instr_as_load_const(instr)); - break; - case nir_instr_type_intrinsic: - visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); - break; - case nir_instr_type_tex: - visit_tex(ctx, nir_instr_as_tex(instr)); - break; - case nir_instr_type_phi: - visit_phi(ctx, nir_instr_as_phi(instr)); - break; - case nir_instr_type_ssa_undef: - visit_ssa_undef(ctx, nir_instr_as_ssa_undef(instr)); - break; - case nir_instr_type_jump: - visit_jump(&ctx->ac, nir_instr_as_jump(instr)); - break; - case nir_instr_type_deref: - visit_deref(ctx, nir_instr_as_deref(instr)); - break; - default: - fprintf(stderr, "Unknown NIR instr type: "); - nir_print_instr(instr, stderr); - fprintf(stderr, "\n"); - abort(); - } - } - - _mesa_hash_table_insert(ctx->defs, block, - LLVMGetInsertBlock(ctx->ac.builder)); + nir_foreach_instr (instr, block) { + switch (instr->type) { + case nir_instr_type_alu: + visit_alu(ctx, nir_instr_as_alu(instr)); + break; + case nir_instr_type_load_const: + visit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_intrinsic: + visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_tex: + visit_tex(ctx, nir_instr_as_tex(instr)); + break; + case nir_instr_type_phi: + visit_phi(ctx, nir_instr_as_phi(instr)); + break; + case nir_instr_type_ssa_undef: + visit_ssa_undef(ctx, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_jump: + visit_jump(&ctx->ac, nir_instr_as_jump(instr)); + break; + case nir_instr_type_deref: + visit_deref(ctx, nir_instr_as_deref(instr)); + break; + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + } + + _mesa_hash_table_insert(ctx->defs, block, LLVMGetInsertBlock(ctx->ac.builder)); } static void visit_if(struct ac_nir_context *ctx, nir_if *if_stmt) { - LLVMValueRef value = get_src(ctx, if_stmt->condition); + LLVMValueRef value = get_src(ctx, if_stmt->condition); - nir_block *then_block = - (nir_block *) exec_list_get_head(&if_stmt->then_list); + nir_block *then_block = (nir_block *)exec_list_get_head(&if_stmt->then_list); - ac_build_uif(&ctx->ac, value, then_block->index); + ac_build_uif(&ctx->ac, value, then_block->index); - visit_cf_list(ctx, 
&if_stmt->then_list); + visit_cf_list(ctx, &if_stmt->then_list); - if (!exec_list_is_empty(&if_stmt->else_list)) { - nir_block *else_block = - (nir_block *) exec_list_get_head(&if_stmt->else_list); + if (!exec_list_is_empty(&if_stmt->else_list)) { + nir_block *else_block = (nir_block *)exec_list_get_head(&if_stmt->else_list); - ac_build_else(&ctx->ac, else_block->index); - visit_cf_list(ctx, &if_stmt->else_list); - } + ac_build_else(&ctx->ac, else_block->index); + visit_cf_list(ctx, &if_stmt->else_list); + } - ac_build_endif(&ctx->ac, then_block->index); + ac_build_endif(&ctx->ac, then_block->index); } static void visit_loop(struct ac_nir_context *ctx, nir_loop *loop) { - nir_block *first_loop_block = - (nir_block *) exec_list_get_head(&loop->body); + nir_block *first_loop_block = (nir_block *)exec_list_get_head(&loop->body); - ac_build_bgnloop(&ctx->ac, first_loop_block->index); + ac_build_bgnloop(&ctx->ac, first_loop_block->index); - visit_cf_list(ctx, &loop->body); + visit_cf_list(ctx, &loop->body); - ac_build_endloop(&ctx->ac, first_loop_block->index); + ac_build_endloop(&ctx->ac, first_loop_block->index); } -static void visit_cf_list(struct ac_nir_context *ctx, - struct exec_list *list) +static void visit_cf_list(struct ac_nir_context *ctx, struct exec_list *list) { - foreach_list_typed(nir_cf_node, node, node, list) - { - switch (node->type) { - case nir_cf_node_block: - visit_block(ctx, nir_cf_node_as_block(node)); - break; - - case nir_cf_node_if: - visit_if(ctx, nir_cf_node_as_if(node)); - break; - - case nir_cf_node_loop: - visit_loop(ctx, nir_cf_node_as_loop(node)); - break; - - default: - assert(0); - } - } + foreach_list_typed(nir_cf_node, node, node, list) + { + switch (node->type) { + case nir_cf_node_block: + visit_block(ctx, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + visit_if(ctx, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + visit_loop(ctx, nir_cf_node_as_loop(node)); + break; + + default: + assert(0); + } + } } -void -ac_handle_shader_output_decl(struct ac_llvm_context *ctx, - struct ac_shader_abi *abi, - struct nir_shader *nir, - struct nir_variable *variable, - gl_shader_stage stage) +void ac_handle_shader_output_decl(struct ac_llvm_context *ctx, struct ac_shader_abi *abi, + struct nir_shader *nir, struct nir_variable *variable, + gl_shader_stage stage) { - unsigned output_loc = variable->data.driver_location / 4; - unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); - - /* tess ctrl has it's own load/store paths for outputs */ - if (stage == MESA_SHADER_TESS_CTRL) - return; - - if (stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_EVAL || - stage == MESA_SHADER_GEOMETRY) { - int idx = variable->data.location + variable->data.index; - if (idx == VARYING_SLOT_CLIP_DIST0) { - int length = nir->info.clip_distance_array_size + - nir->info.cull_distance_array_size; - - if (length > 4) - attrib_count = 2; - else - attrib_count = 1; - } - } - - bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type)); - LLVMTypeRef type = is_16bit ? 
ctx->f16 : ctx->f32;
- for (unsigned i = 0; i < attrib_count; ++i) {
- for (unsigned chan = 0; chan < 4; chan++) {
- abi->outputs[ac_llvm_reg_index_soa(output_loc + i, chan)] =
- ac_build_alloca_undef(ctx, type, "");
- }
- }
+ unsigned output_loc = variable->data.driver_location / 4;
+ unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
+
+ /* tess ctrl has its own load/store paths for outputs */
+ if (stage == MESA_SHADER_TESS_CTRL)
+ return;
+
+ if (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL ||
+ stage == MESA_SHADER_GEOMETRY) {
+ int idx = variable->data.location + variable->data.index;
+ if (idx == VARYING_SLOT_CLIP_DIST0) {
+ int length = nir->info.clip_distance_array_size + nir->info.cull_distance_array_size;
+
+ if (length > 4)
+ attrib_count = 2;
+ else
+ attrib_count = 1;
+ }
+ }
+
+ bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type));
+ LLVMTypeRef type = is_16bit ? ctx->f16 : ctx->f32;
+ for (unsigned i = 0; i < attrib_count; ++i) {
+ for (unsigned chan = 0; chan < 4; chan++) {
+ abi->outputs[ac_llvm_reg_index_soa(output_loc + i, chan)] =
+ ac_build_alloca_undef(ctx, type, "");
+ }
+ }
}

-static void
-setup_locals(struct ac_nir_context *ctx,
- struct nir_function *func)
+static void setup_locals(struct ac_nir_context *ctx, struct nir_function *func)
{
- int i, j;
- ctx->num_locals = 0;
- nir_foreach_function_temp_variable(variable, func->impl) {
- unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
- variable->data.driver_location = ctx->num_locals * 4;
- variable->data.location_frac = 0;
- ctx->num_locals += attrib_count;
- }
- ctx->locals = malloc(4 * ctx->num_locals * sizeof(LLVMValueRef));
- if (!ctx->locals)
- return;
-
- for (i = 0; i < ctx->num_locals; i++) {
- for (j = 0; j < 4; j++) {
- ctx->locals[i * 4 + j] =
- ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "temp");
- }
- }
+ int i, j;
+ ctx->num_locals = 0;
+ nir_foreach_function_temp_variable(variable, func->impl)
+ {
+ unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
+ variable->data.driver_location = ctx->num_locals * 4;
+ variable->data.location_frac = 0;
+ ctx->num_locals += attrib_count;
+ }
+ ctx->locals = malloc(4 * ctx->num_locals * sizeof(LLVMValueRef));
+ if (!ctx->locals)
+ return;
+
+ for (i = 0; i < ctx->num_locals; i++) {
+ for (j = 0; j < 4; j++) {
+ ctx->locals[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "temp");
+ }
+ }
}

-static void
-setup_scratch(struct ac_nir_context *ctx,
- struct nir_shader *shader)
+static void setup_scratch(struct ac_nir_context *ctx, struct nir_shader *shader)
{
- if (shader->scratch_size == 0)
- return;
+ if (shader->scratch_size == 0)
+ return;

- ctx->scratch = ac_build_alloca_undef(&ctx->ac,
- LLVMArrayType(ctx->ac.i8, shader->scratch_size),
- "scratch");
+ ctx->scratch =
+ ac_build_alloca_undef(&ctx->ac, LLVMArrayType(ctx->ac.i8, shader->scratch_size), "scratch");
}

-static void
-setup_constant_data(struct ac_nir_context *ctx,
- struct nir_shader *shader)
+static void setup_constant_data(struct ac_nir_context *ctx, struct nir_shader *shader)
{
- if (!shader->constant_data)
- return;
-
- LLVMValueRef data =
- LLVMConstStringInContext(ctx->ac.context,
- shader->constant_data,
- shader->constant_data_size,
- true);
- LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, shader->constant_data_size);
-
- /* We want to put the constant data in the CONST address space so that
- * we can use scalar loads. 
However, LLVM versions before 10 put these - * variables in the same section as the code, which is unacceptable - * for RadeonSI as it needs to relocate all the data sections after - * the code sections. See https://reviews.llvm.org/D65813. - */ - unsigned address_space = - LLVM_VERSION_MAJOR < 10 ? AC_ADDR_SPACE_GLOBAL : AC_ADDR_SPACE_CONST; - - LLVMValueRef global = - LLVMAddGlobalInAddressSpace(ctx->ac.module, type, - "const_data", - address_space); - - LLVMSetInitializer(global, data); - LLVMSetGlobalConstant(global, true); - LLVMSetVisibility(global, LLVMHiddenVisibility); - ctx->constant_data = global; + if (!shader->constant_data) + return; + + LLVMValueRef data = LLVMConstStringInContext(ctx->ac.context, shader->constant_data, + shader->constant_data_size, true); + LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, shader->constant_data_size); + + /* We want to put the constant data in the CONST address space so that + * we can use scalar loads. However, LLVM versions before 10 put these + * variables in the same section as the code, which is unacceptable + * for RadeonSI as it needs to relocate all the data sections after + * the code sections. See https://reviews.llvm.org/D65813. + */ + unsigned address_space = LLVM_VERSION_MAJOR < 10 ? AC_ADDR_SPACE_GLOBAL : AC_ADDR_SPACE_CONST; + + LLVMValueRef global = + LLVMAddGlobalInAddressSpace(ctx->ac.module, type, "const_data", address_space); + + LLVMSetInitializer(global, data); + LLVMSetGlobalConstant(global, true); + LLVMSetVisibility(global, LLVMHiddenVisibility); + ctx->constant_data = global; } -static void -setup_shared(struct ac_nir_context *ctx, - struct nir_shader *nir) +static void setup_shared(struct ac_nir_context *ctx, struct nir_shader *nir) { - if (ctx->ac.lds) - return; + if (ctx->ac.lds) + return; - LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, - nir->info.cs.shared_size); + LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, nir->info.cs.shared_size); - LLVMValueRef lds = - LLVMAddGlobalInAddressSpace(ctx->ac.module, type, - "compute_lds", - AC_ADDR_SPACE_LDS); - LLVMSetAlignment(lds, 64 * 1024); + LLVMValueRef lds = + LLVMAddGlobalInAddressSpace(ctx->ac.module, type, "compute_lds", AC_ADDR_SPACE_LDS); + LLVMSetAlignment(lds, 64 * 1024); - ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, lds, - LLVMPointerType(ctx->ac.i8, - AC_ADDR_SPACE_LDS), ""); + ctx->ac.lds = + LLVMBuildBitCast(ctx->ac.builder, lds, LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS), ""); } void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, - const struct ac_shader_args *args, struct nir_shader *nir) + const struct ac_shader_args *args, struct nir_shader *nir) { - struct ac_nir_context ctx = {}; - struct nir_function *func; - - ctx.ac = *ac; - ctx.abi = abi; - ctx.args = args; - - ctx.stage = nir->info.stage; - ctx.info = &nir->info; - - ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); - - /* TODO: remove this after RADV switches to lowered IO */ - if (!nir->info.io_lowered) { - nir_foreach_shader_out_variable(variable, nir) { - ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable, - ctx.stage); - } - } - - ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - - if (ctx.abi->kill_ps_if_inf_interp) - ctx.verified_interp = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - 
_mesa_key_pointer_equal); - - func = (struct nir_function *)exec_list_get_head(&nir->functions); - - nir_index_ssa_defs(func->impl); - ctx.ssa_defs = calloc(func->impl->ssa_alloc, sizeof(LLVMValueRef)); - - setup_locals(&ctx, func); - setup_scratch(&ctx, nir); - setup_constant_data(&ctx, nir); - - if (gl_shader_stage_is_compute(nir->info.stage)) - setup_shared(&ctx, nir); - - if (nir->info.stage == MESA_SHADER_FRAGMENT && nir->info.fs.uses_demote) { - ctx.ac.postponed_kill = ac_build_alloca_undef(&ctx.ac, ac->i1, ""); - /* true = don't kill. */ - LLVMBuildStore(ctx.ac.builder, ctx.ac.i1true, ctx.ac.postponed_kill); - } - - visit_cf_list(&ctx, &func->impl->body); - phi_post_pass(&ctx); - - if (ctx.ac.postponed_kill) - ac_build_kill_if_false(&ctx.ac, LLVMBuildLoad(ctx.ac.builder, - ctx.ac.postponed_kill, "")); - - if (!gl_shader_stage_is_compute(nir->info.stage)) - ctx.abi->emit_outputs(ctx.abi, AC_LLVM_MAX_OUTPUTS, - ctx.abi->outputs); - - free(ctx.locals); - free(ctx.ssa_defs); - ralloc_free(ctx.defs); - ralloc_free(ctx.phis); - ralloc_free(ctx.vars); - if (ctx.abi->kill_ps_if_inf_interp) - ralloc_free(ctx.verified_interp); + struct ac_nir_context ctx = {}; + struct nir_function *func; + + ctx.ac = *ac; + ctx.abi = abi; + ctx.args = args; + + ctx.stage = nir->info.stage; + ctx.info = &nir->info; + + ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); + + /* TODO: remove this after RADV switches to lowered IO */ + if (!nir->info.io_lowered) { + nir_foreach_shader_out_variable(variable, nir) + { + ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable, ctx.stage); + } + } + + ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); + ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); + ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); + + if (ctx.abi->kill_ps_if_inf_interp) + ctx.verified_interp = + _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); + + func = (struct nir_function *)exec_list_get_head(&nir->functions); + + nir_index_ssa_defs(func->impl); + ctx.ssa_defs = calloc(func->impl->ssa_alloc, sizeof(LLVMValueRef)); + + setup_locals(&ctx, func); + setup_scratch(&ctx, nir); + setup_constant_data(&ctx, nir); + + if (gl_shader_stage_is_compute(nir->info.stage)) + setup_shared(&ctx, nir); + + if (nir->info.stage == MESA_SHADER_FRAGMENT && nir->info.fs.uses_demote) { + ctx.ac.postponed_kill = ac_build_alloca_undef(&ctx.ac, ac->i1, ""); + /* true = don't kill. */ + LLVMBuildStore(ctx.ac.builder, ctx.ac.i1true, ctx.ac.postponed_kill); + } + + visit_cf_list(&ctx, &func->impl->body); + phi_post_pass(&ctx); + + if (ctx.ac.postponed_kill) + ac_build_kill_if_false(&ctx.ac, LLVMBuildLoad(ctx.ac.builder, ctx.ac.postponed_kill, "")); + + if (!gl_shader_stage_is_compute(nir->info.stage)) + ctx.abi->emit_outputs(ctx.abi, AC_LLVM_MAX_OUTPUTS, ctx.abi->outputs); + + free(ctx.locals); + free(ctx.ssa_defs); + ralloc_free(ctx.defs); + ralloc_free(ctx.phis); + ralloc_free(ctx.vars); + if (ctx.abi->kill_ps_if_inf_interp) + ralloc_free(ctx.verified_interp); } -bool -ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class chip_class) +bool ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class chip_class) { - bool progress = false; - - /* Lower large variables to scratch first so that we won't bloat the - * shader by generating large if ladders for them. 
We later lower - * scratch to alloca's, assuming LLVM won't generate VGPR indexing. - */ - NIR_PASS(progress, nir, nir_lower_vars_to_scratch, - nir_var_function_temp, - 256, - glsl_get_natural_size_align_bytes); - - /* While it would be nice not to have this flag, we are constrained - * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9. - */ - bool llvm_has_working_vgpr_indexing = chip_class != GFX9; - - /* TODO: Indirect indexing of GS inputs is unimplemented. - * - * TCS and TES load inputs directly from LDS or offchip memory, so - * indirect indexing is trivial. - */ - nir_variable_mode indirect_mask = 0; - if (nir->info.stage == MESA_SHADER_GEOMETRY || - (nir->info.stage != MESA_SHADER_TESS_CTRL && - nir->info.stage != MESA_SHADER_TESS_EVAL && - !llvm_has_working_vgpr_indexing)) { - indirect_mask |= nir_var_shader_in; - } - if (!llvm_has_working_vgpr_indexing && - nir->info.stage != MESA_SHADER_TESS_CTRL) - indirect_mask |= nir_var_shader_out; - - /* TODO: We shouldn't need to do this, however LLVM isn't currently - * smart enough to handle indirects without causing excess spilling - * causing the gpu to hang. - * - * See the following thread for more details of the problem: - * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html - */ - indirect_mask |= nir_var_function_temp; - - progress |= nir_lower_indirect_derefs(nir, indirect_mask, UINT32_MAX); - return progress; + bool progress = false; + + /* Lower large variables to scratch first so that we won't bloat the + * shader by generating large if ladders for them. We later lower + * scratch to alloca's, assuming LLVM won't generate VGPR indexing. + */ + NIR_PASS(progress, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256, + glsl_get_natural_size_align_bytes); + + /* While it would be nice not to have this flag, we are constrained + * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9. + */ + bool llvm_has_working_vgpr_indexing = chip_class != GFX9; + + /* TODO: Indirect indexing of GS inputs is unimplemented. + * + * TCS and TES load inputs directly from LDS or offchip memory, so + * indirect indexing is trivial. + */ + nir_variable_mode indirect_mask = 0; + if (nir->info.stage == MESA_SHADER_GEOMETRY || + (nir->info.stage != MESA_SHADER_TESS_CTRL && nir->info.stage != MESA_SHADER_TESS_EVAL && + !llvm_has_working_vgpr_indexing)) { + indirect_mask |= nir_var_shader_in; + } + if (!llvm_has_working_vgpr_indexing && nir->info.stage != MESA_SHADER_TESS_CTRL) + indirect_mask |= nir_var_shader_out; + + /* TODO: We shouldn't need to do this, however LLVM isn't currently + * smart enough to handle indirects without causing excess spilling + * causing the gpu to hang. 
+ * + * See the following thread for more details of the problem: + * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html + */ + indirect_mask |= nir_var_function_temp; + + progress |= nir_lower_indirect_derefs(nir, indirect_mask, UINT32_MAX); + return progress; } -static unsigned -get_inst_tessfactor_writemask(nir_intrinsic_instr *intrin) +static unsigned get_inst_tessfactor_writemask(nir_intrinsic_instr *intrin) { - if (intrin->intrinsic != nir_intrinsic_store_output) - return 0; + if (intrin->intrinsic != nir_intrinsic_store_output) + return 0; - unsigned writemask = nir_intrinsic_write_mask(intrin) << - nir_intrinsic_component(intrin); - unsigned location = nir_intrinsic_io_semantics(intrin).location; + unsigned writemask = nir_intrinsic_write_mask(intrin) << nir_intrinsic_component(intrin); + unsigned location = nir_intrinsic_io_semantics(intrin).location; - if (location == VARYING_SLOT_TESS_LEVEL_OUTER) - return writemask << 4; - else if (location == VARYING_SLOT_TESS_LEVEL_INNER) - return writemask; + if (location == VARYING_SLOT_TESS_LEVEL_OUTER) + return writemask << 4; + else if (location == VARYING_SLOT_TESS_LEVEL_INNER) + return writemask; - return 0; + return 0; } -static void -scan_tess_ctrl(nir_cf_node *cf_node, unsigned *upper_block_tf_writemask, - unsigned *cond_block_tf_writemask, - bool *tessfactors_are_def_in_all_invocs, bool is_nested_cf) +static void scan_tess_ctrl(nir_cf_node *cf_node, unsigned *upper_block_tf_writemask, + unsigned *cond_block_tf_writemask, + bool *tessfactors_are_def_in_all_invocs, bool is_nested_cf) { - switch (cf_node->type) { - case nir_cf_node_block: { - nir_block *block = nir_cf_node_as_block(cf_node); - nir_foreach_instr(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - if (intrin->intrinsic == nir_intrinsic_control_barrier) { - - /* If we find a barrier in nested control flow put this in the - * too hard basket. In GLSL this is not possible but it is in - * SPIR-V. - */ - if (is_nested_cf) { - *tessfactors_are_def_in_all_invocs = false; - return; - } - - /* The following case must be prevented: - * gl_TessLevelInner = ...; - * barrier(); - * if (gl_InvocationID == 1) - * gl_TessLevelInner = ...; - * - * If you consider disjoint code segments separated by barriers, each - * such segment that writes tess factor channels should write the same - * channels in all codepaths within that segment. - */ - if (upper_block_tf_writemask || cond_block_tf_writemask) { - /* Accumulate the result: */ - *tessfactors_are_def_in_all_invocs &= - !(*cond_block_tf_writemask & ~(*upper_block_tf_writemask)); - - /* Analyze the next code segment from scratch. 
*/ - *upper_block_tf_writemask = 0; - *cond_block_tf_writemask = 0; - } - } else - *upper_block_tf_writemask |= get_inst_tessfactor_writemask(intrin); - } - - break; - } - case nir_cf_node_if: { - unsigned then_tessfactor_writemask = 0; - unsigned else_tessfactor_writemask = 0; - - nir_if *if_stmt = nir_cf_node_as_if(cf_node); - foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->then_list) { - scan_tess_ctrl(nested_node, &then_tessfactor_writemask, - cond_block_tf_writemask, - tessfactors_are_def_in_all_invocs, true); - } - - foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->else_list) { - scan_tess_ctrl(nested_node, &else_tessfactor_writemask, - cond_block_tf_writemask, - tessfactors_are_def_in_all_invocs, true); - } - - if (then_tessfactor_writemask || else_tessfactor_writemask) { - /* If both statements write the same tess factor channels, - * we can say that the upper block writes them too. - */ - *upper_block_tf_writemask |= then_tessfactor_writemask & - else_tessfactor_writemask; - *cond_block_tf_writemask |= then_tessfactor_writemask | - else_tessfactor_writemask; - } - - break; - } - case nir_cf_node_loop: { - nir_loop *loop = nir_cf_node_as_loop(cf_node); - foreach_list_typed(nir_cf_node, nested_node, node, &loop->body) { - scan_tess_ctrl(nested_node, cond_block_tf_writemask, - cond_block_tf_writemask, - tessfactors_are_def_in_all_invocs, true); - } - - break; - } - default: - unreachable("unknown cf node type"); - } + switch (cf_node->type) { + case nir_cf_node_block: { + nir_block *block = nir_cf_node_as_block(cf_node); + nir_foreach_instr (instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic == nir_intrinsic_control_barrier) { + + /* If we find a barrier in nested control flow put this in the + * too hard basket. In GLSL this is not possible but it is in + * SPIR-V. + */ + if (is_nested_cf) { + *tessfactors_are_def_in_all_invocs = false; + return; + } + + /* The following case must be prevented: + * gl_TessLevelInner = ...; + * barrier(); + * if (gl_InvocationID == 1) + * gl_TessLevelInner = ...; + * + * If you consider disjoint code segments separated by barriers, each + * such segment that writes tess factor channels should write the same + * channels in all codepaths within that segment. + */ + if (upper_block_tf_writemask || cond_block_tf_writemask) { + /* Accumulate the result: */ + *tessfactors_are_def_in_all_invocs &= + !(*cond_block_tf_writemask & ~(*upper_block_tf_writemask)); + + /* Analyze the next code segment from scratch. */ + *upper_block_tf_writemask = 0; + *cond_block_tf_writemask = 0; + } + } else + *upper_block_tf_writemask |= get_inst_tessfactor_writemask(intrin); + } + + break; + } + case nir_cf_node_if: { + unsigned then_tessfactor_writemask = 0; + unsigned else_tessfactor_writemask = 0; + + nir_if *if_stmt = nir_cf_node_as_if(cf_node); + foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->then_list) + { + scan_tess_ctrl(nested_node, &then_tessfactor_writemask, cond_block_tf_writemask, + tessfactors_are_def_in_all_invocs, true); + } + + foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->else_list) + { + scan_tess_ctrl(nested_node, &else_tessfactor_writemask, cond_block_tf_writemask, + tessfactors_are_def_in_all_invocs, true); + } + + if (then_tessfactor_writemask || else_tessfactor_writemask) { + /* If both statements write the same tess factor channels, + * we can say that the upper block writes them too. 
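+ *
+ * E.g. if the then-branch writes OUTER.xy but the else-branch only
+ * OUTER.x, OUTER.x becomes unconditional (then & else below) while
+ * OUTER.y remains in the conditional mask (then | else).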
+ */ + *upper_block_tf_writemask |= then_tessfactor_writemask & else_tessfactor_writemask; + *cond_block_tf_writemask |= then_tessfactor_writemask | else_tessfactor_writemask; + } + + break; + } + case nir_cf_node_loop: { + nir_loop *loop = nir_cf_node_as_loop(cf_node); + foreach_list_typed(nir_cf_node, nested_node, node, &loop->body) + { + scan_tess_ctrl(nested_node, cond_block_tf_writemask, cond_block_tf_writemask, + tessfactors_are_def_in_all_invocs, true); + } + + break; + } + default: + unreachable("unknown cf node type"); + } } -bool -ac_are_tessfactors_def_in_all_invocs(const struct nir_shader *nir) +bool ac_are_tessfactors_def_in_all_invocs(const struct nir_shader *nir) { - assert(nir->info.stage == MESA_SHADER_TESS_CTRL); - - /* The pass works as follows: - * If all codepaths write tess factors, we can say that all - * invocations define tess factors. - * - * Each tess factor channel is tracked separately. - */ - unsigned main_block_tf_writemask = 0; /* if main block writes tess factors */ - unsigned cond_block_tf_writemask = 0; /* if cond block writes tess factors */ - - /* Initial value = true. Here the pass will accumulate results from - * multiple segments surrounded by barriers. If tess factors aren't - * written at all, it's a shader bug and we don't care if this will be - * true. - */ - bool tessfactors_are_def_in_all_invocs = true; - - nir_foreach_function(function, nir) { - if (function->impl) { - foreach_list_typed(nir_cf_node, node, node, &function->impl->body) { - scan_tess_ctrl(node, &main_block_tf_writemask, - &cond_block_tf_writemask, - &tessfactors_are_def_in_all_invocs, - false); - } - } - } - - /* Accumulate the result for the last code segment separated by a - * barrier. - */ - if (main_block_tf_writemask || cond_block_tf_writemask) { - tessfactors_are_def_in_all_invocs &= - !(cond_block_tf_writemask & ~main_block_tf_writemask); - } - - return tessfactors_are_def_in_all_invocs; + assert(nir->info.stage == MESA_SHADER_TESS_CTRL); + + /* The pass works as follows: + * If all codepaths write tess factors, we can say that all + * invocations define tess factors. + * + * Each tess factor channel is tracked separately. + */ + unsigned main_block_tf_writemask = 0; /* if main block writes tess factors */ + unsigned cond_block_tf_writemask = 0; /* if cond block writes tess factors */ + + /* Initial value = true. Here the pass will accumulate results from + * multiple segments surrounded by barriers. If tess factors aren't + * written at all, it's a shader bug and we don't care if this will be + * true. + */ + bool tessfactors_are_def_in_all_invocs = true; + + nir_foreach_function (function, nir) { + if (function->impl) { + foreach_list_typed(nir_cf_node, node, node, &function->impl->body) + { + scan_tess_ctrl(node, &main_block_tf_writemask, &cond_block_tf_writemask, + &tessfactors_are_def_in_all_invocs, false); + } + } + } + + /* Accumulate the result for the last code segment separated by a + * barrier. 
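+ *
+ * Same mask arithmetic as in scan_tess_ctrl; e.g. main = 0b0011 and
+ * cond = 0b0111 leaves 0b0100: a channel written only in some
+ * invocations, so the flag drops to false.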
+ */ + if (main_block_tf_writemask || cond_block_tf_writemask) { + tessfactors_are_def_in_all_invocs &= !(cond_block_tf_writemask & ~main_block_tf_writemask); + } + + return tessfactors_are_def_in_all_invocs; } diff --git a/src/amd/llvm/ac_nir_to_llvm.h b/src/amd/llvm/ac_nir_to_llvm.h index b4ad68abb9e..eab16252174 100644 --- a/src/amd/llvm/ac_nir_to_llvm.h +++ b/src/amd/llvm/ac_nir_to_llvm.h @@ -24,11 +24,12 @@ #ifndef AC_NIR_TO_LLVM_H #define AC_NIR_TO_LLVM_H -#include -#include "llvm-c/Core.h" -#include "llvm-c/TargetMachine.h" #include "amd_family.h" #include "compiler/shader_enums.h" +#include "llvm-c/Core.h" +#include "llvm-c/TargetMachine.h" + +#include struct nir_shader; struct nir_variable; @@ -37,13 +38,13 @@ struct ac_shader_abi; struct ac_shader_args; /* Interpolation locations */ -#define INTERP_CENTER 0 +#define INTERP_CENTER 0 #define INTERP_CENTROID 1 -#define INTERP_SAMPLE 2 +#define INTERP_SAMPLE 2 static inline unsigned ac_llvm_reg_index_soa(unsigned index, unsigned chan) { - return (index * 4) + chan; + return (index * 4) + chan; } bool ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class); @@ -51,14 +52,11 @@ bool ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class); bool ac_are_tessfactors_def_in_all_invocs(const struct nir_shader *nir); void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, - const struct ac_shader_args *args, struct nir_shader *nir); + const struct ac_shader_args *args, struct nir_shader *nir); -void -ac_handle_shader_output_decl(struct ac_llvm_context *ctx, - struct ac_shader_abi *abi, - struct nir_shader *nir, - struct nir_variable *variable, - gl_shader_stage stage); +void ac_handle_shader_output_decl(struct ac_llvm_context *ctx, struct ac_shader_abi *abi, + struct nir_shader *nir, struct nir_variable *variable, + gl_shader_stage stage); void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage); diff --git a/src/amd/llvm/ac_shader_abi.h b/src/amd/llvm/ac_shader_abi.h index 359e9484fc2..346c172e2d6 100644 --- a/src/amd/llvm/ac_shader_abi.h +++ b/src/amd/llvm/ac_shader_abi.h @@ -24,11 +24,11 @@ #ifndef AC_SHADER_ABI_H #define AC_SHADER_ABI_H -#include -#include #include "ac_shader_args.h" - #include "compiler/shader_enums.h" +#include + +#include struct nir_variable; @@ -36,167 +36,136 @@ struct nir_variable; #define AC_MAX_INLINE_PUSH_CONSTS 8 -enum ac_descriptor_type { - AC_DESC_IMAGE, - AC_DESC_FMASK, - AC_DESC_SAMPLER, - AC_DESC_BUFFER, - AC_DESC_PLANE_0, - AC_DESC_PLANE_1, - AC_DESC_PLANE_2, +enum ac_descriptor_type +{ + AC_DESC_IMAGE, + AC_DESC_FMASK, + AC_DESC_SAMPLER, + AC_DESC_BUFFER, + AC_DESC_PLANE_0, + AC_DESC_PLANE_1, + AC_DESC_PLANE_2, }; /* Document the shader ABI during compilation. This is what allows radeonsi and * radv to share a compiler backend. */ struct ac_shader_abi { - LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4]; - - /* These input registers sometimes need to be fixed up. */ - LLVMValueRef vertex_id; - LLVMValueRef instance_id; - LLVMValueRef persp_centroid, linear_centroid; - LLVMValueRef color0, color1; - LLVMValueRef user_data; - - /* For VS and PS: pre-loaded shader inputs. - * - * Currently only used for NIR shaders; indexed by variables' - * driver_location. - */ - LLVMValueRef *inputs; - - /* Varying -> attribute number mapping. 
Also NIR-only */ - unsigned fs_input_attr_indices[MAX_VARYING]; - - void (*emit_outputs)(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs); - - void (*emit_vertex)(struct ac_shader_abi *abi, - unsigned stream, - LLVMValueRef *addrs); - - void (*emit_primitive)(struct ac_shader_abi *abi, - unsigned stream); - - void (*emit_vertex_with_counter)(struct ac_shader_abi *abi, - unsigned stream, - LLVMValueRef vertexidx, - LLVMValueRef *addrs); - - LLVMValueRef (*load_inputs)(struct ac_shader_abi *abi, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - unsigned vertex_index, - unsigned const_index, - LLVMTypeRef type); - - LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi, - LLVMTypeRef type, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - bool is_patch, - bool is_compact, - bool load_inputs); - - void (*store_tcs_outputs)(struct ac_shader_abi *abi, - const struct nir_variable *var, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - LLVMValueRef src, - unsigned writemask, - unsigned component, - unsigned driver_location); - - LLVMValueRef (*load_tess_coord)(struct ac_shader_abi *abi); - - LLVMValueRef (*load_patch_vertices_in)(struct ac_shader_abi *abi); - - LLVMValueRef (*load_tess_level)(struct ac_shader_abi *abi, - unsigned varying_id, - bool load_default_state); - - - LLVMValueRef (*load_ubo)(struct ac_shader_abi *abi, LLVMValueRef index); - - /** - * Load the descriptor for the given buffer. - * - * \param buffer the buffer as presented in NIR: this is the descriptor - * in Vulkan, and the buffer index in OpenGL/Gallium - * \param write whether buffer contents will be written - */ - LLVMValueRef (*load_ssbo)(struct ac_shader_abi *abi, - LLVMValueRef buffer, bool write); - - /** - * Load a descriptor associated to a sampler. - * - * \param descriptor_set the descriptor set index (only for Vulkan) - * \param base_index the base index of the sampler variable - * \param constant_index constant part of an array index (or 0, if the - * sampler variable is not an array) - * \param index non-constant part of an array index (may be NULL) - * \param desc_type the type of descriptor to load - * \param image whether the descriptor is loaded for an image operation - */ - LLVMValueRef (*load_sampler_desc)(struct ac_shader_abi *abi, - unsigned descriptor_set, - unsigned base_index, - unsigned constant_index, - LLVMValueRef index, - enum ac_descriptor_type desc_type, - bool image, bool write, - bool bindless); - - /** - * Load a Vulkan-specific resource. - * - * \param index resource index - * \param desc_set descriptor set - * \param binding descriptor set binding - */ - LLVMValueRef (*load_resource)(struct ac_shader_abi *abi, - LLVMValueRef index, - unsigned desc_set, - unsigned binding); - - LLVMValueRef (*load_sample_position)(struct ac_shader_abi *abi, - LLVMValueRef sample_id); - - LLVMValueRef (*load_local_group_size)(struct ac_shader_abi *abi); - - LLVMValueRef (*load_sample_mask_in)(struct ac_shader_abi *abi); - - LLVMValueRef (*load_base_vertex)(struct ac_shader_abi *abi); - - LLVMValueRef (*emit_fbfetch)(struct ac_shader_abi *abi); - - /* Whether to clamp the shadow reference value to [0,1]on GFX8. Radeonsi currently - * uses it due to promoting D16 to D32, but radv needs it off. 
- */
- bool clamp_shadow_reference;
- bool interp_at_sample_force_center;
-
- /* Whether bounds checks are required */
- bool robust_buffer_access;
-
- /* Check for Inf interpolation coeff */
- bool kill_ps_if_inf_interp;
-
- /* Whether undef values must be converted to zero */
- bool convert_undef_to_zero;
-
- /* Clamp div by 0 (so it won't produce NaN) */
- bool clamp_div_by_zero;
+   LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4];
+
+   /* These input registers sometimes need to be fixed up. */
+   LLVMValueRef vertex_id;
+   LLVMValueRef instance_id;
+   LLVMValueRef persp_centroid, linear_centroid;
+   LLVMValueRef color0, color1;
+   LLVMValueRef user_data;
+
+   /* For VS and PS: pre-loaded shader inputs.
+    *
+    * Currently only used for NIR shaders; indexed by variables'
+    * driver_location.
+    */
+   LLVMValueRef *inputs;
+
+   /* Varying -> attribute number mapping. Also NIR-only */
+   unsigned fs_input_attr_indices[MAX_VARYING];
+
+   void (*emit_outputs)(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
+
+   void (*emit_vertex)(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs);
+
+   void (*emit_primitive)(struct ac_shader_abi *abi, unsigned stream);
+
+   void (*emit_vertex_with_counter)(struct ac_shader_abi *abi, unsigned stream,
+                                    LLVMValueRef vertexidx, LLVMValueRef *addrs);
+
+   LLVMValueRef (*load_inputs)(struct ac_shader_abi *abi, unsigned location,
+                               unsigned driver_location, unsigned component,
+                               unsigned num_components, unsigned vertex_index, unsigned const_index,
+                               LLVMTypeRef type);
+
+   LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi, LLVMTypeRef type,
+                                      LLVMValueRef vertex_index, LLVMValueRef param_index,
+                                      unsigned const_index, unsigned location,
+                                      unsigned driver_location, unsigned component,
+                                      unsigned num_components, bool is_patch, bool is_compact,
+                                      bool load_inputs);
+
+   void (*store_tcs_outputs)(struct ac_shader_abi *abi, const struct nir_variable *var,
+                             LLVMValueRef vertex_index, LLVMValueRef param_index,
+                             unsigned const_index, LLVMValueRef src, unsigned writemask,
+                             unsigned component, unsigned driver_location);
+
+   LLVMValueRef (*load_tess_coord)(struct ac_shader_abi *abi);
+
+   LLVMValueRef (*load_patch_vertices_in)(struct ac_shader_abi *abi);
+
+   LLVMValueRef (*load_tess_level)(struct ac_shader_abi *abi, unsigned varying_id,
+                                   bool load_default_state);
+
+   LLVMValueRef (*load_ubo)(struct ac_shader_abi *abi, LLVMValueRef index);
+
+   /**
+    * Load the descriptor for the given buffer.
+    *
+    * \param buffer the buffer as presented in NIR: this is the descriptor
+    * in Vulkan, and the buffer index in OpenGL/Gallium
+    * \param write whether buffer contents will be written
+    */
+   LLVMValueRef (*load_ssbo)(struct ac_shader_abi *abi, LLVMValueRef buffer, bool write);
+
+   /**
+    * Load a descriptor associated with a sampler.
+    *
+    * \param descriptor_set the descriptor set index (only for Vulkan)
+    * \param base_index the base index of the sampler variable
+    * \param constant_index constant part of an array index (or 0, if the
+    * sampler variable is not an array)
+    * \param index non-constant part of an array index (may be NULL)
+    * \param desc_type the type of descriptor to load
+    * \param image whether the descriptor is loaded for an image operation
+    */
+   LLVMValueRef (*load_sampler_desc)(struct ac_shader_abi *abi, unsigned descriptor_set,
+                                     unsigned base_index, unsigned constant_index,
+                                     LLVMValueRef index, enum ac_descriptor_type desc_type,
+                                     bool image, bool write, bool bindless);
+
+   /**
+    * Load a Vulkan-specific resource.
+    *
+    * \param index resource index
+    * \param desc_set descriptor set
+    * \param binding descriptor set binding
+    */
+   LLVMValueRef (*load_resource)(struct ac_shader_abi *abi, LLVMValueRef index, unsigned desc_set,
+                                 unsigned binding);
+
+   LLVMValueRef (*load_sample_position)(struct ac_shader_abi *abi, LLVMValueRef sample_id);
+
+   LLVMValueRef (*load_local_group_size)(struct ac_shader_abi *abi);
+
+   LLVMValueRef (*load_sample_mask_in)(struct ac_shader_abi *abi);
+
+   LLVMValueRef (*load_base_vertex)(struct ac_shader_abi *abi);
+
+   LLVMValueRef (*emit_fbfetch)(struct ac_shader_abi *abi);
+
+   /* Whether to clamp the shadow reference value to [0,1] on GFX8. Radeonsi currently
+    * uses it due to promoting D16 to D32, but radv needs it off. */
+   bool clamp_shadow_reference;
+   bool interp_at_sample_force_center;
+
+   /* Whether bounds checks are required */
+   bool robust_buffer_access;
+
+   /* Check for Inf interpolation coeff */
+   bool kill_ps_if_inf_interp;
+
+   /* Whether undef values must be converted to zero */
+   bool convert_undef_to_zero;
+
+   /* Clamp div by 0 (so it won't produce NaN) */
+   bool clamp_div_by_zero;
 };

 #endif /* AC_SHADER_ABI_H */
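
Note (editorial sketch, not part of the commit): the writemask check near the top of this page is easier to follow with concrete values. The snippet below restates the bit test that ac_are_tessfactors_def_in_all_invocs ends with: a tess-factor channel written only inside a conditional block counts as defined in all invocations only if the main block also writes that channel unconditionally. The helper name is invented for illustration.

   /* Editorial sketch of the bit test used above: a channel set in
    * cond_mask but not in main_mask is written by only some invocations,
    * so the tess factors are not defined in all invocations. */
   #include <assert.h>
   #include <stdbool.h>

   static bool defined_in_all_invocs(unsigned main_mask, unsigned cond_mask)
   {
      return !(cond_mask & ~main_mask);
   }

   int main(void)
   {
      /* Conditional writes touch only channels the main block already
       * writes: still defined in all invocations. */
      assert(defined_in_all_invocs(0x3, 0x1));
      /* Conditional block writes channel 2, which the main block never
       * does: not defined in all invocations. */
      assert(!defined_in_all_invocs(0x3, 0x4));
      return 0;
   }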
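Note (editorial sketch, not part of the commit): ac_llvm_reg_index_soa() in ac_nir_to_llvm.h encodes the structure-of-arrays layout behind declarations like abi->outputs[AC_LLVM_MAX_OUTPUTS * 4]: each vec4 output occupies four consecutive slots, one per channel. The function body below is copied verbatim from the header; the test harness around it is made up.

   /* Editorial sketch: flattening a (register, channel) pair into a
    * flat SoA array index. */
   #include <assert.h>

   static inline unsigned ac_llvm_reg_index_soa(unsigned index, unsigned chan)
   {
      return (index * 4) + chan;
   }

   int main(void)
   {
      assert(ac_llvm_reg_index_soa(0, 0) == 0); /* output 0, channel x */
      assert(ac_llvm_reg_index_soa(0, 3) == 3); /* output 0, channel w */
      assert(ac_llvm_reg_index_soa(2, 1) == 9); /* output 2, channel y: 2*4 + 1 */
      return 0;
   }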
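Note (editorial sketch, not part of the commit): struct ac_shader_abi is a table of driver-provided hooks; the shared NIR-to-LLVM backend only ever calls through the ABI, which is what lets radeonsi and radv reuse one compiler backend. Below is a toy version of that pattern; toy_abi and every other name are invented, and the real callbacks deal in LLVMValueRef rather than int.

   /* Editorial sketch of the callback-table pattern used by
    * struct ac_shader_abi. */
   #include <stdio.h>

   struct toy_abi {
      /* One "driver" hook, analogous to load_resource() above. */
      int (*load_resource)(struct toy_abi *abi, int index);
   };

   /* A driver supplies its own implementation of the hook. */
   static int toy_driver_load_resource(struct toy_abi *abi, int index)
   {
      (void)abi;
      return index + 100; /* stand-in for a real descriptor load */
   }

   /* The shared "compiler" sees only the ABI, never the driver. */
   static int compile_shader(struct toy_abi *abi)
   {
      return abi->load_resource(abi, 7);
   }

   int main(void)
   {
      struct toy_abi abi = {.load_resource = toy_driver_load_resource};
      printf("%d\n", compile_shader(&abi)); /* prints 107 */
      return 0;
   }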