X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_shader.c;h=de427789aae8aa5dc3df534d112e5cd75040714a;hb=1fabb297177069e95ec1bb7053acb32f8ec3e092;hp=dcbcfbca0591c388641135ffb84e2af30d0e777f;hpb=96d60dd9ee6a394671debe60b96db703d9ecd674;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index dcbcfbca059..de427789aae 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -40,6 +40,7 @@ #include "tgsi/tgsi_util.h" #include "tgsi/tgsi_dump.h" +#include "ac_llvm_util.h" #include "si_shader_internal.h" #include "si_pipe.h" #include "sid.h" @@ -54,8 +55,9 @@ static const char *scratch_rsrc_dword1_symbol = struct si_shader_output_values { LLVMValueRef values[4]; - unsigned name; - unsigned sid; + unsigned semantic_name; + unsigned semantic_index; + ubyte vertex_stream[4]; }; static void si_init_shader_ctx(struct si_shader_context *ctx, @@ -67,9 +69,20 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data); -static void si_dump_shader_key(unsigned shader, union si_shader_key *key, +static void si_dump_shader_key(unsigned shader, struct si_shader_key *key, FILE *f); +static void si_build_vs_prolog_function(struct si_shader_context *ctx, + union si_shader_part_key *key); +static void si_build_vs_epilog_function(struct si_shader_context *ctx, + union si_shader_part_key *key); +static void si_build_tcs_epilog_function(struct si_shader_context *ctx, + union si_shader_part_key *key); +static void si_build_ps_prolog_function(struct si_shader_context *ctx, + union si_shader_part_key *key); +static void si_build_ps_epilog_function(struct si_shader_context *ctx, + union si_shader_part_key *key); + /* Ideally pass the sample mask input to the PS epilog as v13, which * is its usual location, so that the shader doesn't have to add v_mov. */ @@ -85,14 +98,6 @@ enum { LOCAL_ADDR_SPACE = 3, }; -#define SENDMSG_GS 2 -#define SENDMSG_GS_DONE 3 - -#define SENDMSG_GS_OP_NOP (0 << 4) -#define SENDMSG_GS_OP_CUT (1 << 4) -#define SENDMSG_GS_OP_EMIT (2 << 4) -#define SENDMSG_GS_OP_EMIT_CUT (3 << 4) - /** * Returns a unique index for a semantic name and index. The index must be * less than 64, so that a 64-bit bitmask of used inputs or outputs can be @@ -111,11 +116,9 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) case TGSI_SEMANTIC_GENERIC: if (index <= 63-4) return 4 + index; - else - /* same explanation as in the default statement, - * the only user hitting this is st/nine. - */ - return 0; + + assert(!"invalid generic index"); + return 0; /* patch indices are completely separate and thus start from 0 */ case TGSI_SEMANTIC_TESSOUTER: @@ -126,11 +129,29 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) return 2 + index; default: - /* Don't fail here. The result of this function is only used - * for LS, TCS, TES, and GS, where legacy GL semantics can't - * occur, but this function is called for all vertex shaders - * before it's known whether LS will be compiled or not. - */ + assert(!"invalid semantic name"); + return 0; + } +} + +unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index) +{ + switch (name) { + case TGSI_SEMANTIC_FOG: + return 0; + case TGSI_SEMANTIC_LAYER: + return 1; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + return 2; + case TGSI_SEMANTIC_PRIMID: + return 3; + case TGSI_SEMANTIC_COLOR: /* these alias */ + case TGSI_SEMANTIC_BCOLOR: + return 4 + index; + case TGSI_SEMANTIC_TEXCOORD: + return 6 + index; + default: + assert(!"invalid semantic name"); return 0; } } @@ -147,7 +168,7 @@ static LLVMValueRef unpack_param(struct si_shader_context *ctx, param); if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) - value = bitcast(&ctx->soa.bld_base, + value = bitcast(&ctx->bld_base, TGSI_TYPE_UNSIGNED, value); if (rshift) @@ -222,7 +243,7 @@ get_tcs_out_patch_stride(struct si_shader_context *ctx) static LLVMValueRef get_tcs_out_patch0_offset(struct si_shader_context *ctx) { - return lp_build_mul_imm(&ctx->soa.bld_base.uint_bld, + return lp_build_mul_imm(&ctx->bld_base.uint_bld, unpack_param(ctx, SI_PARAM_TCS_OUT_OFFSETS, 0, 16), @@ -232,7 +253,7 @@ get_tcs_out_patch0_offset(struct si_shader_context *ctx) static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) { - return lp_build_mul_imm(&ctx->soa.bld_base.uint_bld, + return lp_build_mul_imm(&ctx->bld_base.uint_bld, unpack_param(ctx, SI_PARAM_TCS_OUT_OFFSETS, 16, 16), @@ -278,71 +299,13 @@ get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) ""); } -static LLVMValueRef build_gep0(struct si_shader_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index) -{ - LLVMValueRef indices[2] = { - LLVMConstInt(ctx->i32, 0, 0), - index, - }; - return LLVMBuildGEP(ctx->gallivm.builder, base_ptr, - indices, 2, ""); -} - -static void build_indexed_store(struct si_shader_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index, - LLVMValueRef value) -{ - struct lp_build_tgsi_context *bld_base = &ctx->soa.bld_base; - struct gallivm_state *gallivm = bld_base->base.gallivm; - - LLVMBuildStore(gallivm->builder, value, - build_gep0(ctx, base_ptr, index)); -} - -/** - * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad. - * It's equivalent to doing a load from &base_ptr[index]. - * - * \param base_ptr Where the array starts. - * \param index The element index into the array. - * \param uniform Whether the base_ptr and index can be assumed to be - * dynamically uniform - */ -static LLVMValueRef build_indexed_load(struct si_shader_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index, - bool uniform) -{ - struct lp_build_tgsi_context *bld_base = &ctx->soa.bld_base; - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMValueRef pointer; - - pointer = build_gep0(ctx, base_ptr, index); - if (uniform) - LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md); - return LLVMBuildLoad(gallivm->builder, pointer, ""); -} - -/** - * Do a load from &base_ptr[index], but also add a flag that it's loading - * a constant from a dynamically uniform index. - */ -static LLVMValueRef build_indexed_load_const( - struct si_shader_context *ctx, - LLVMValueRef base_ptr, LLVMValueRef index) -{ - LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true); - LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md); - return result; -} - static LLVMValueRef get_instance_index_for_fetch( struct si_shader_context *radeon_bld, unsigned param_start_instance, unsigned divisor) { struct si_shader_context *ctx = - si_shader_context(&radeon_bld->soa.bld_base); - struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm; + si_shader_context(&radeon_bld->bld_base); + struct gallivm_state *gallivm = radeon_bld->bld_base.base.gallivm; LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn, ctx->param_instance_id); @@ -356,70 +319,229 @@ static LLVMValueRef get_instance_index_for_fetch( LLVMGetParam(radeon_bld->main_fn, param_start_instance), ""); } +/* Bitcast <4 x float> to <2 x double>, extract the component, and convert + * to float. */ +static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx, + LLVMValueRef vec4, + unsigned double_index) +{ + LLVMBuilderRef builder = ctx->gallivm.builder; + LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context); + LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4, + LLVMVectorType(f64, 2), ""); + LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0); + LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, ""); + return LLVMBuildFPTrunc(builder, value, ctx->f32, ""); +} + static void declare_input_vs( - struct si_shader_context *radeon_bld, + struct si_shader_context *ctx, unsigned input_index, const struct tgsi_full_declaration *decl, LLVMValueRef out[4]) { - struct lp_build_context *base = &radeon_bld->soa.bld_base.base; + struct lp_build_context *base = &ctx->bld_base.base; struct gallivm_state *gallivm = base->gallivm; - struct si_shader_context *ctx = - si_shader_context(&radeon_bld->soa.bld_base); - unsigned divisor = - ctx->shader->key.vs.prolog.instance_divisors[input_index]; unsigned chan; + unsigned fix_fetch; + unsigned num_fetches; + unsigned fetch_stride; LLVMValueRef t_list_ptr; LLVMValueRef t_offset; LLVMValueRef t_list; - LLVMValueRef attribute_offset; - LLVMValueRef buffer_index; + LLVMValueRef vertex_index; LLVMValueRef args[3]; - LLVMValueRef input; + LLVMValueRef input[3]; /* Load the T list */ t_list_ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_VERTEX_BUFFERS); t_offset = lp_build_const_int32(gallivm, input_index); - t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset); - - /* Build the attribute offset */ - attribute_offset = lp_build_const_int32(gallivm, 0); - - if (!ctx->no_prolog) { - buffer_index = LLVMGetParam(radeon_bld->main_fn, - ctx->param_vertex_index0 + - input_index); - } else if (divisor) { - /* Build index from instance ID, start instance and divisor */ - ctx->shader->info.uses_instanceid = true; - buffer_index = get_instance_index_for_fetch(ctx, - SI_PARAM_START_INSTANCE, - divisor); - } else { - /* Load the buffer index for vertices. */ - LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn, - ctx->param_vertex_id); - LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn, - SI_PARAM_BASE_VERTEX); - buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, ""); + t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset); + + vertex_index = LLVMGetParam(ctx->main_fn, + ctx->param_vertex_index0 + + input_index); + + fix_fetch = ctx->shader->key.mono.vs.fix_fetch[input_index]; + + /* Do multiple loads for special formats. */ + switch (fix_fetch) { + case SI_FIX_FETCH_RGB_64_FLOAT: + num_fetches = 3; /* 3 2-dword loads */ + fetch_stride = 8; + break; + case SI_FIX_FETCH_RGBA_64_FLOAT: + num_fetches = 2; /* 2 4-dword loads */ + fetch_stride = 16; + break; + case SI_FIX_FETCH_RGB_8: + case SI_FIX_FETCH_RGB_8_INT: + num_fetches = 3; + fetch_stride = 1; + break; + case SI_FIX_FETCH_RGB_16: + case SI_FIX_FETCH_RGB_16_INT: + num_fetches = 3; + fetch_stride = 2; + break; + default: + num_fetches = 1; + fetch_stride = 0; } args[0] = t_list; - args[1] = attribute_offset; - args[2] = buffer_index; - input = lp_build_intrinsic(gallivm->builder, - "llvm.SI.vs.load.input", ctx->v4f32, args, 3, - LLVMReadNoneAttribute); + args[2] = vertex_index; + + for (unsigned i = 0; i < num_fetches; i++) { + args[1] = LLVMConstInt(ctx->i32, fetch_stride * i, 0); + + input[i] = lp_build_intrinsic(gallivm->builder, + "llvm.SI.vs.load.input", ctx->v4f32, args, 3, + LP_FUNC_ATTR_READNONE); + } /* Break up the vec4 into individual components */ for (chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); out[chan] = LLVMBuildExtractElement(gallivm->builder, - input, llvm_chan, ""); + input[0], llvm_chan, ""); + } + + switch (fix_fetch) { + case SI_FIX_FETCH_A2_SNORM: + case SI_FIX_FETCH_A2_SSCALED: + case SI_FIX_FETCH_A2_SINT: { + /* The hardware returns an unsigned value; convert it to a + * signed one. + */ + LLVMValueRef tmp = out[3]; + LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0); + + /* First, recover the sign-extended signed integer value. */ + if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) + tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, ""); + else + tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, ""); + + /* For the integer-like cases, do a natural sign extension. + * + * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 + * and happen to contain 0, 1, 2, 3 as the two LSBs of the + * exponent. + */ + tmp = LLVMBuildShl(gallivm->builder, tmp, + fix_fetch == SI_FIX_FETCH_A2_SNORM ? + LLVMConstInt(ctx->i32, 7, 0) : c30, ""); + tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, ""); + + /* Convert back to the right type. */ + if (fix_fetch == SI_FIX_FETCH_A2_SNORM) { + LLVMValueRef clamp; + LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); + tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, ""); + clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, ""); + tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, ""); + } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) { + tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, ""); + } + + out[3] = tmp; + break; + } + case SI_FIX_FETCH_RGBA_32_UNORM: + case SI_FIX_FETCH_RGBX_32_UNORM: + for (chan = 0; chan < 4; chan++) { + out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], + ctx->i32, ""); + out[chan] = LLVMBuildUIToFP(gallivm->builder, + out[chan], ctx->f32, ""); + out[chan] = LLVMBuildFMul(gallivm->builder, out[chan], + LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), ""); + } + /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */ + if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM) + out[3] = LLVMConstReal(ctx->f32, 1); + break; + case SI_FIX_FETCH_RGBA_32_SNORM: + case SI_FIX_FETCH_RGBX_32_SNORM: + case SI_FIX_FETCH_RGBA_32_FIXED: + case SI_FIX_FETCH_RGBX_32_FIXED: { + double scale; + if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED) + scale = 1.0 / 0x10000; + else + scale = 1.0 / INT_MAX; + + for (chan = 0; chan < 4; chan++) { + out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], + ctx->i32, ""); + out[chan] = LLVMBuildSIToFP(gallivm->builder, + out[chan], ctx->f32, ""); + out[chan] = LLVMBuildFMul(gallivm->builder, out[chan], + LLVMConstReal(ctx->f32, scale), ""); + } + /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */ + if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM || + fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED) + out[3] = LLVMConstReal(ctx->f32, 1); + break; + } + case SI_FIX_FETCH_RGBA_32_USCALED: + for (chan = 0; chan < 4; chan++) { + out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], + ctx->i32, ""); + out[chan] = LLVMBuildUIToFP(gallivm->builder, + out[chan], ctx->f32, ""); + } + break; + case SI_FIX_FETCH_RGBA_32_SSCALED: + for (chan = 0; chan < 4; chan++) { + out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], + ctx->i32, ""); + out[chan] = LLVMBuildSIToFP(gallivm->builder, + out[chan], ctx->f32, ""); + } + break; + case SI_FIX_FETCH_RG_64_FLOAT: + for (chan = 0; chan < 2; chan++) + out[chan] = extract_double_to_float(ctx, input[0], chan); + + out[2] = LLVMConstReal(ctx->f32, 0); + out[3] = LLVMConstReal(ctx->f32, 1); + break; + case SI_FIX_FETCH_RGB_64_FLOAT: + for (chan = 0; chan < 3; chan++) + out[chan] = extract_double_to_float(ctx, input[chan], 0); + + out[3] = LLVMConstReal(ctx->f32, 1); + break; + case SI_FIX_FETCH_RGBA_64_FLOAT: + for (chan = 0; chan < 4; chan++) { + out[chan] = extract_double_to_float(ctx, input[chan / 2], + chan % 2); + } + break; + case SI_FIX_FETCH_RGB_8: + case SI_FIX_FETCH_RGB_8_INT: + case SI_FIX_FETCH_RGB_16: + case SI_FIX_FETCH_RGB_16_INT: + for (chan = 0; chan < 3; chan++) { + out[chan] = LLVMBuildExtractElement(gallivm->builder, + input[chan], + ctx->i32_0, ""); + } + if (fix_fetch == SI_FIX_FETCH_RGB_8 || + fix_fetch == SI_FIX_FETCH_RGB_16) { + out[3] = LLVMConstReal(ctx->f32, 1); + } else { + out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1, + ctx->f32, ""); + } + break; } } @@ -458,10 +580,10 @@ static LLVMValueRef get_indirect_index(struct si_shader_context *ctx, const struct tgsi_ind_register *ind, int rel_index) { - struct gallivm_state *gallivm = ctx->soa.bld_base.base.gallivm; + struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; LLVMValueRef result; - result = ctx->soa.addr[ind->Index][ind->Swizzle]; + result = ctx->addrs[ind->Index][ind->Swizzle]; result = LLVMBuildLoad(gallivm->builder, result, ""); result = LLVMBuildAdd(gallivm->builder, result, lp_build_const_int32(gallivm, rel_index), ""); @@ -498,7 +620,7 @@ static LLVMValueRef get_dw_address(struct si_shader_context *ctx, LLVMValueRef vertex_dw_stride, LLVMValueRef base_addr) { - struct gallivm_state *gallivm = ctx->soa.bld_base.base.gallivm; + struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; struct tgsi_shader_info *info = &ctx->shader->selector->info; ubyte *name, *index, *array_first; int first, param; @@ -596,7 +718,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, LLVMValueRef vertex_index, LLVMValueRef param_index) { - struct gallivm_state *gallivm = ctx->soa.bld_base.base.gallivm; + struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; LLVMValueRef param_stride, constant16; @@ -640,7 +762,7 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg( const struct tgsi_full_dst_register *dst, const struct tgsi_full_src_register *src) { - struct gallivm_state *gallivm = ctx->soa.bld_base.base.gallivm; + struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; struct tgsi_shader_info *info = &ctx->shader->selector->info; ubyte *name, *index, *array_first; struct tgsi_full_src_register reg; @@ -698,151 +820,6 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg( return get_tcs_tes_buffer_address(ctx, vertex_index, param_index); } -/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4. - * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2), - * or v4i32 (num_channels=3,4). */ -static void build_tbuffer_store(struct si_shader_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - unsigned num_channels, - LLVMValueRef vaddr, - LLVMValueRef soffset, - unsigned inst_offset, - unsigned dfmt, - unsigned nfmt, - unsigned offen, - unsigned idxen, - unsigned glc, - unsigned slc, - unsigned tfe) -{ - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMValueRef args[] = { - rsrc, - vdata, - LLVMConstInt(ctx->i32, num_channels, 0), - vaddr, - soffset, - LLVMConstInt(ctx->i32, inst_offset, 0), - LLVMConstInt(ctx->i32, dfmt, 0), - LLVMConstInt(ctx->i32, nfmt, 0), - LLVMConstInt(ctx->i32, offen, 0), - LLVMConstInt(ctx->i32, idxen, 0), - LLVMConstInt(ctx->i32, glc, 0), - LLVMConstInt(ctx->i32, slc, 0), - LLVMConstInt(ctx->i32, tfe, 0) - }; - - /* The instruction offset field has 12 bits */ - assert(offen || inst_offset < (1 << 12)); - - /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */ - unsigned func = CLAMP(num_channels, 1, 3) - 1; - const char *types[] = {"i32", "v2i32", "v4i32"}; - char name[256]; - snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]); - - lp_build_intrinsic(gallivm->builder, name, ctx->voidt, - args, ARRAY_SIZE(args), 0); -} - -static void build_tbuffer_store_dwords(struct si_shader_context *ctx, - LLVMValueRef rsrc, - LLVMValueRef vdata, - unsigned num_channels, - LLVMValueRef vaddr, - LLVMValueRef soffset, - unsigned inst_offset) -{ - static unsigned dfmt[] = { - V_008F0C_BUF_DATA_FORMAT_32, - V_008F0C_BUF_DATA_FORMAT_32_32, - V_008F0C_BUF_DATA_FORMAT_32_32_32, - V_008F0C_BUF_DATA_FORMAT_32_32_32_32 - }; - assert(num_channels >= 1 && num_channels <= 4); - - build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset, - inst_offset, dfmt[num_channels-1], - V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0); -} - -static LLVMValueRef build_buffer_load(struct si_shader_context *ctx, - LLVMValueRef rsrc, - int num_channels, - LLVMValueRef vindex, - LLVMValueRef voffset, - LLVMValueRef soffset, - unsigned inst_offset, - unsigned glc, - unsigned slc) -{ - struct gallivm_state *gallivm = &ctx->gallivm; - unsigned func = CLAMP(num_channels, 1, 3) - 1; - - if (HAVE_LLVM >= 0x309) { - LLVMValueRef args[] = { - LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""), - vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0), - LLVMConstInt(ctx->i32, inst_offset, 0), - LLVMConstInt(ctx->i1, glc, 0), - LLVMConstInt(ctx->i1, slc, 0) - }; - - LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2), - ctx->v4f32}; - const char *type_names[] = {"f32", "v2f32", "v4f32"}; - char name[256]; - - if (voffset) { - args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset, - ""); - } - - if (soffset) { - args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset, - ""); - } - - snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s", - type_names[func]); - - return lp_build_intrinsic(gallivm->builder, name, types[func], args, - ARRAY_SIZE(args), LLVMReadOnlyAttribute); - } else { - LLVMValueRef args[] = { - LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""), - voffset ? voffset : vindex, - soffset, - LLVMConstInt(ctx->i32, inst_offset, 0), - LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen - LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen - LLVMConstInt(ctx->i32, glc, 0), - LLVMConstInt(ctx->i32, slc, 0), - LLVMConstInt(ctx->i32, 0, 0), // TFE - }; - - LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2), - ctx->v4i32}; - const char *type_names[] = {"i32", "v2i32", "v4i32"}; - const char *arg_type = "i32"; - char name[256]; - - if (voffset && vindex) { - LLVMValueRef vaddr[] = {vindex, voffset}; - - arg_type = "v2i32"; - args[1] = lp_build_gather_values(gallivm, vaddr, 2); - } - - snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s", - type_names[func], arg_type); - - return lp_build_intrinsic(gallivm->builder, name, types[func], args, - ARRAY_SIZE(args), LLVMReadOnlyAttribute); - } -} - static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base, enum tgsi_opcode_type type, unsigned swizzle, LLVMValueRef buffer, LLVMValueRef offset, @@ -855,25 +832,25 @@ static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base, LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4); if (swizzle == ~0) { - value = build_buffer_load(ctx, buffer, 4, NULL, base, offset, - 0, 1, 0); + value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, + 0, 1, 0); return LLVMBuildBitCast(gallivm->builder, value, vec_type, ""); } if (!tgsi_type_is_64bit(type)) { - value = build_buffer_load(ctx, buffer, 4, NULL, base, offset, - 0, 1, 0); + value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, + 0, 1, 0); value = LLVMBuildBitCast(gallivm->builder, value, vec_type, ""); return LLVMBuildExtractElement(gallivm->builder, value, lp_build_const_int32(gallivm, swizzle), ""); } - value = build_buffer_load(ctx, buffer, 1, NULL, base, offset, + value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4, 1, 0); - value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset, + value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4 + 4, 1, 0); return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); @@ -907,12 +884,12 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, lp_build_const_int32(gallivm, swizzle)); - value = build_indexed_load(ctx, ctx->lds, dw_addr, false); + value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false); if (tgsi_type_is_64bit(type)) { LLVMValueRef value2; dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, lp_build_const_int32(gallivm, 1)); - value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false); + value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false); return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); } @@ -938,8 +915,8 @@ static void lds_store(struct lp_build_tgsi_context *bld_base, lp_build_const_int32(gallivm, swizzle)); value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, ""); - build_indexed_store(ctx, ctx->lds, - dw_addr, value); + ac_build_indexed_store(&ctx->ac, ctx->lds, + dw_addr, value); } static LLVMValueRef fetch_input_tcs( @@ -988,7 +965,7 @@ static LLVMValueRef fetch_input_tes( rw_buffers = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); - buffer = build_indexed_load_const(ctx, rw_buffers, + buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP)); base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); @@ -1030,7 +1007,7 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base, rw_buffers = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); - buffer = build_indexed_load_const(ctx, rw_buffers, + buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP)); base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); @@ -1049,17 +1026,17 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base, values[chan_index] = value; if (inst->Dst[0].Register.WriteMask != 0xF) { - build_tbuffer_store_dwords(ctx, buffer, value, 1, - buf_addr, base, - 4 * chan_index); + ac_build_tbuffer_store_dwords(&ctx->ac, buffer, value, 1, + buf_addr, base, + 4 * chan_index); } } if (inst->Dst[0].Register.WriteMask == 0xF) { LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm, values, 4); - build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr, - base, 0); + ac_build_tbuffer_store_dwords(&ctx->ac, buffer, value, 4, buf_addr, + base, 0); } } @@ -1072,7 +1049,7 @@ static LLVMValueRef fetch_input_gs( struct lp_build_context *base = &bld_base->base; struct si_shader_context *ctx = si_shader_context(bld_base); struct si_shader *shader = ctx->shader; - struct lp_build_context *uint = &ctx->soa.bld_base.uint_bld; + struct lp_build_context *uint = &ctx->bld_base.uint_bld; struct gallivm_state *gallivm = base->gallivm; LLVMValueRef vtx_offset; LLVMValueRef args[9]; @@ -1126,14 +1103,14 @@ static LLVMValueRef fetch_input_gs( value = lp_build_intrinsic(gallivm->builder, "llvm.SI.buffer.load.dword.i32.i32", ctx->i32, args, 9, - LLVMReadOnlyAttribute); + LP_FUNC_ATTR_READONLY); if (tgsi_type_is_64bit(type)) { LLVMValueRef value2; args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256); value2 = lp_build_intrinsic(gallivm->builder, "llvm.SI.buffer.load.dword.i32.i32", ctx->i32, args, 9, - LLVMReadOnlyAttribute); + LP_FUNC_ATTR_READONLY); return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); } @@ -1171,45 +1148,6 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location) } } -/* This shouldn't be used by explicit INTERP opcodes. */ -static unsigned select_interp_param(struct si_shader_context *ctx, - unsigned param) -{ - if (!ctx->no_prolog) - return param; - - if (ctx->shader->key.ps.prolog.force_persp_sample_interp) { - switch (param) { - case SI_PARAM_PERSP_CENTROID: - case SI_PARAM_PERSP_CENTER: - return SI_PARAM_PERSP_SAMPLE; - } - } - if (ctx->shader->key.ps.prolog.force_linear_sample_interp) { - switch (param) { - case SI_PARAM_LINEAR_CENTROID: - case SI_PARAM_LINEAR_CENTER: - return SI_PARAM_LINEAR_SAMPLE; - } - } - if (ctx->shader->key.ps.prolog.force_persp_center_interp) { - switch (param) { - case SI_PARAM_PERSP_CENTROID: - case SI_PARAM_PERSP_SAMPLE: - return SI_PARAM_PERSP_CENTER; - } - } - if (ctx->shader->key.ps.prolog.force_linear_center_interp) { - switch (param) { - case SI_PARAM_LINEAR_CENTROID: - case SI_PARAM_LINEAR_SAMPLE: - return SI_PARAM_LINEAR_CENTER; - } - } - - return param; -} - /** * Interpolate a fragment shader input. * @@ -1235,16 +1173,15 @@ static void interp_fs_input(struct si_shader_context *ctx, LLVMValueRef face, LLVMValueRef result[4]) { - struct lp_build_context *base = &ctx->soa.bld_base.base; - struct lp_build_context *uint = &ctx->soa.bld_base.uint_bld; + struct lp_build_tgsi_context *bld_base = &ctx->bld_base; + struct lp_build_context *base = &bld_base->base; + struct lp_build_context *uint = &bld_base->uint_bld; struct gallivm_state *gallivm = base->gallivm; - const char *intr_name; LLVMValueRef attr_number; + LLVMValueRef i, j; unsigned chan; - attr_number = lp_build_const_int32(gallivm, input_index); - /* fs.constant returns the param from the middle vertex, so it's not * really useful for flat shading. It's meant to be used for custom * interpolation (but the intrinsic can't fetch from the other two @@ -1254,12 +1191,26 @@ static void interp_fs_input(struct si_shader_context *ctx, * to do the right thing. The only reason we use fs.constant is that * fs.interp cannot be used on integers, because they can be equal * to NaN. + * + * When interp is false we will use fs.constant or for newer llvm, + * amdgcn.interp.mov. */ - intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant"; + bool interp = interp_param != NULL; + + attr_number = lp_build_const_int32(gallivm, input_index); + + if (interp) { + interp_param = LLVMBuildBitCast(gallivm->builder, interp_param, + LLVMVectorType(ctx->f32, 2), ""); + + i = LLVMBuildExtractElement(gallivm->builder, interp_param, + uint->zero, ""); + j = LLVMBuildExtractElement(gallivm->builder, interp_param, + uint->one, ""); + } if (semantic_name == TGSI_SEMANTIC_COLOR && - ctx->shader->key.ps.prolog.color_two_side) { - LLVMValueRef args[4]; + ctx->shader->key.part.ps.prolog.color_two_side) { LLVMValueRef is_face_positive; LLVMValueRef back_attr_number; @@ -1275,22 +1226,25 @@ static void interp_fs_input(struct si_shader_context *ctx, is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE, face, uint->zero, ""); - args[2] = prim_mask; - args[3] = interp_param; for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); LLVMValueRef front, back; - args[0] = llvm_chan; - args[1] = attr_number; - front = lp_build_intrinsic(gallivm->builder, intr_name, - ctx->f32, args, args[3] ? 4 : 3, - LLVMReadNoneAttribute); - - args[1] = back_attr_number; - back = lp_build_intrinsic(gallivm->builder, intr_name, - ctx->f32, args, args[3] ? 4 : 3, - LLVMReadNoneAttribute); + if (interp) { + front = ac_build_fs_interp(&ctx->ac, llvm_chan, + attr_number, prim_mask, + i, j); + back = ac_build_fs_interp(&ctx->ac, llvm_chan, + back_attr_number, prim_mask, + i, j); + } else { + front = ac_build_fs_interp_mov(&ctx->ac, + lp_build_const_int32(gallivm, 2), /* P0 */ + llvm_chan, attr_number, prim_mask); + back = ac_build_fs_interp_mov(&ctx->ac, + lp_build_const_int32(gallivm, 2), /* P0 */ + llvm_chan, back_attr_number, prim_mask); + } result[chan] = LLVMBuildSelect(gallivm->builder, is_face_positive, @@ -1299,82 +1253,31 @@ static void interp_fs_input(struct si_shader_context *ctx, ""); } } else if (semantic_name == TGSI_SEMANTIC_FOG) { - LLVMValueRef args[4]; - - args[0] = uint->zero; - args[1] = attr_number; - args[2] = prim_mask; - args[3] = interp_param; - result[0] = lp_build_intrinsic(gallivm->builder, intr_name, - ctx->f32, args, args[3] ? 4 : 3, - LLVMReadNoneAttribute); + if (interp) { + result[0] = ac_build_fs_interp(&ctx->ac, uint->zero, + attr_number, prim_mask, i, j); + } else { + result[0] = ac_build_fs_interp_mov(&ctx->ac, uint->zero, + lp_build_const_int32(gallivm, 2), /* P0 */ + attr_number, prim_mask); + } result[1] = result[2] = lp_build_const_float(gallivm, 0.0f); result[3] = lp_build_const_float(gallivm, 1.0f); } else { for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - LLVMValueRef args[4]; LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); - args[0] = llvm_chan; - args[1] = attr_number; - args[2] = prim_mask; - args[3] = interp_param; - result[chan] = lp_build_intrinsic(gallivm->builder, intr_name, - ctx->f32, args, args[3] ? 4 : 3, - LLVMReadNoneAttribute); - } - } -} - -/* LLVMGetParam with bc_optimize resolved. */ -static LLVMValueRef get_interp_param(struct si_shader_context *ctx, - int interp_param_idx) -{ - LLVMBuilderRef builder = ctx->gallivm.builder; - LLVMValueRef main_fn = ctx->main_fn; - LLVMValueRef param = NULL; - - /* Handle PRIM_MASK[31] (bc_optimize). */ - if (ctx->no_prolog && - ((ctx->shader->key.ps.prolog.bc_optimize_for_persp && - interp_param_idx == SI_PARAM_PERSP_CENTROID) || - (ctx->shader->key.ps.prolog.bc_optimize_for_linear && - interp_param_idx == SI_PARAM_LINEAR_CENTROID))) { - /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; - * The hw doesn't compute CENTROID if the whole wave only - * contains fully-covered quads. - */ - LLVMValueRef bc_optimize = - LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK); - bc_optimize = LLVMBuildLShr(builder, - bc_optimize, - LLVMConstInt(ctx->i32, 31, 0), ""); - bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, ""); - - if (ctx->shader->key.ps.prolog.bc_optimize_for_persp && - interp_param_idx == SI_PARAM_PERSP_CENTROID) { - param = LLVMBuildSelect(builder, bc_optimize, - LLVMGetParam(main_fn, - SI_PARAM_PERSP_CENTER), - LLVMGetParam(main_fn, - SI_PARAM_PERSP_CENTROID), - ""); - } - if (ctx->shader->key.ps.prolog.bc_optimize_for_linear && - interp_param_idx == SI_PARAM_LINEAR_CENTROID) { - param = LLVMBuildSelect(builder, bc_optimize, - LLVMGetParam(main_fn, - SI_PARAM_LINEAR_CENTER), - LLVMGetParam(main_fn, - SI_PARAM_LINEAR_CENTROID), - ""); + if (interp) { + result[chan] = ac_build_fs_interp(&ctx->ac, + llvm_chan, attr_number, prim_mask, i, j); + } else { + result[chan] = ac_build_fs_interp_mov(&ctx->ac, + lp_build_const_int32(gallivm, 2), /* P0 */ + llvm_chan, attr_number, prim_mask); + } } } - - if (!param) - param = LLVMGetParam(main_fn, interp_param_idx); - return param; } static void declare_input_fs( @@ -1383,17 +1286,16 @@ static void declare_input_fs( const struct tgsi_full_declaration *decl, LLVMValueRef out[4]) { - struct lp_build_context *base = &radeon_bld->soa.bld_base.base; + struct lp_build_context *base = &radeon_bld->bld_base.base; struct si_shader_context *ctx = - si_shader_context(&radeon_bld->soa.bld_base); + si_shader_context(&radeon_bld->bld_base); struct si_shader *shader = ctx->shader; LLVMValueRef main_fn = radeon_bld->main_fn; LLVMValueRef interp_param = NULL; int interp_param_idx; /* Get colors from input VGPRs (set by the prolog). */ - if (!ctx->no_prolog && - decl->Semantic.Name == TGSI_SEMANTIC_COLOR) { + if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) { unsigned i = decl->Semantic.Index; unsigned colors_read = shader->selector->info.colors_read; unsigned mask = colors_read >> (i * 4); @@ -1412,14 +1314,12 @@ static void declare_input_fs( if (interp_param_idx == -1) return; else if (interp_param_idx) { - interp_param_idx = select_interp_param(ctx, - interp_param_idx); - interp_param = get_interp_param(ctx, interp_param_idx); + interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); } if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR && decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR && - ctx->shader->key.ps.prolog.flatshade_colors) + ctx->shader->key.part.ps.prolog.flatshade_colors) interp_param = NULL; /* load the constant color */ interp_fs_input(ctx, input_index, decl->Semantic.Name, @@ -1432,53 +1332,10 @@ static void declare_input_fs( static LLVMValueRef get_sample_id(struct si_shader_context *radeon_bld) { - return unpack_param(si_shader_context(&radeon_bld->soa.bld_base), + return unpack_param(si_shader_context(&radeon_bld->bld_base), SI_PARAM_ANCILLARY, 8, 4); } -/** - * Set range metadata on an instruction. This can only be used on load and - * call instructions. If you know an instruction can only produce the values - * 0, 1, 2, you would do set_range_metadata(value, 0, 3); - * \p lo is the minimum value inclusive. - * \p hi is the maximum value exclusive. - */ -static void set_range_metadata(struct si_shader_context *ctx, - LLVMValueRef value, unsigned lo, unsigned hi) -{ - LLVMValueRef range_md, md_args[2]; - LLVMTypeRef type = LLVMTypeOf(value); - LLVMContextRef context = LLVMGetTypeContext(type); - - md_args[0] = LLVMConstInt(type, lo, false); - md_args[1] = LLVMConstInt(type, hi, false); - range_md = LLVMMDNodeInContext(context, md_args, 2); - LLVMSetMetadata(value, ctx->range_md_kind, range_md); -} - -static LLVMValueRef get_thread_id(struct si_shader_context *ctx) -{ - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMValueRef tid; - - if (HAVE_LLVM < 0x0308) { - tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", - ctx->i32, NULL, 0, LLVMReadNoneAttribute); - } else { - LLVMValueRef tid_args[2]; - tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff); - tid_args[1] = lp_build_const_int32(gallivm, 0); - tid_args[1] = lp_build_intrinsic(gallivm->builder, - "llvm.amdgcn.mbcnt.lo", ctx->i32, - tid_args, 2, LLVMReadNoneAttribute); - - tid = lp_build_intrinsic(gallivm->builder, - "llvm.amdgcn.mbcnt.hi", ctx->i32, - tid_args, 2, LLVMReadNoneAttribute); - } - set_range_metadata(ctx, tid, 0, 64); - return tid; -} /** * Load a dword from a constant buffer. @@ -1491,19 +1348,19 @@ static LLVMValueRef buffer_load_const(struct si_shader_context *ctx, LLVMValueRef args[2] = {resource, offset}; return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2, - LLVMReadNoneAttribute); + LP_FUNC_ATTR_READNONE); } static LLVMValueRef load_sample_position(struct si_shader_context *radeon_bld, LLVMValueRef sample_id) { struct si_shader_context *ctx = - si_shader_context(&radeon_bld->soa.bld_base); - struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld; + si_shader_context(&radeon_bld->bld_base); + struct lp_build_context *uint_bld = &radeon_bld->bld_base.uint_bld; struct gallivm_state *gallivm = &radeon_bld->gallivm; LLVMBuilderRef builder = gallivm->builder; LLVMValueRef desc = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS); - LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index); + LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index); /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8); @@ -1525,8 +1382,8 @@ static void declare_system_value( const struct tgsi_full_declaration *decl) { struct si_shader_context *ctx = - si_shader_context(&radeon_bld->soa.bld_base); - struct lp_build_context *bld = &radeon_bld->soa.bld_base.base; + si_shader_context(&radeon_bld->bld_base); + struct lp_build_context *bld = &radeon_bld->bld_base.base; struct gallivm_state *gallivm = &radeon_bld->gallivm; LLVMValueRef value = 0; @@ -1580,7 +1437,7 @@ static void declare_system_value( LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT), LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT), LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT), - lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP, + lp_build_emit_llvm_unary(&radeon_bld->bld_base, TGSI_OPCODE_RCP, LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_W_FLOAT)), }; @@ -1603,9 +1460,9 @@ static void declare_system_value( lp_build_const_float(gallivm, 0), lp_build_const_float(gallivm, 0) }; - pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, + pos[0] = lp_build_emit_llvm_unary(&radeon_bld->bld_base, TGSI_OPCODE_FRC, pos[0]); - pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, + pos[1] = lp_build_emit_llvm_unary(&radeon_bld->bld_base, TGSI_OPCODE_FRC, pos[1]); value = lp_build_gather_values(gallivm, pos, 4); break; @@ -1654,14 +1511,14 @@ static void declare_system_value( rw_buffers = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); - buffer = build_indexed_load_const(ctx, rw_buffers, + buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP)); base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); addr = get_tcs_tes_buffer_address(ctx, NULL, lp_build_const_int32(gallivm, param)); - value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT, + value = buffer_load(&radeon_bld->bld_base, TGSI_TYPE_FLOAT, ~0, buffer, base, addr); break; @@ -1675,7 +1532,7 @@ static void declare_system_value( slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS); buf = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); - buf = build_indexed_load_const(ctx, buf, slot); + buf = ac_build_indexed_load_const(&ctx->ac, buf, slot); offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0; for (i = 0; i < 4; i++) @@ -1686,7 +1543,7 @@ static void declare_system_value( } case TGSI_SEMANTIC_PRIMID: - value = get_primitive_id(&radeon_bld->soa.bld_base, 0); + value = get_primitive_id(&radeon_bld->bld_base, 0); break; case TGSI_SEMANTIC_GRID_SIZE: @@ -1724,16 +1581,19 @@ static void declare_system_value( value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID); break; -#if HAVE_LLVM >= 0x0309 case TGSI_SEMANTIC_HELPER_INVOCATION: - value = lp_build_intrinsic(gallivm->builder, - "llvm.amdgcn.ps.live", - ctx->i1, NULL, 0, - LLVMReadNoneAttribute); - value = LLVMBuildNot(gallivm->builder, value, ""); - value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, ""); + if (HAVE_LLVM >= 0x0309) { + value = lp_build_intrinsic(gallivm->builder, + "llvm.amdgcn.ps.live", + ctx->i1, NULL, 0, + LP_FUNC_ATTR_READNONE); + value = LLVMBuildNot(gallivm->builder, value, ""); + value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, ""); + } else { + assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported"); + return; + } break; -#endif default: assert(!"unknown system value"); @@ -1747,7 +1607,7 @@ static void declare_compute_memory(struct si_shader_context *radeon_bld, const struct tgsi_full_declaration *decl) { struct si_shader_context *ctx = - si_shader_context(&radeon_bld->soa.bld_base); + si_shader_context(&radeon_bld->bld_base); struct si_shader_selector *sel = ctx->shader->selector; struct gallivm_state *gallivm = &radeon_bld->gallivm; @@ -1772,7 +1632,7 @@ static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i) LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_CONST_BUFFERS); - return build_indexed_load_const(ctx, list_ptr, + return ac_build_indexed_load_const(&ctx->ac, list_ptr, LLVMConstInt(ctx->i32, i, 0)); } @@ -1808,12 +1668,12 @@ static LLVMValueRef fetch_constant( index = get_bounded_indirect_index(ctx, ®->DimIndirect, reg->Dimension.Index, SI_NUM_CONST_BUFFERS); - bufp = build_indexed_load_const(ctx, ptr, index); + bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index); } else bufp = load_const_buffer_desc(ctx, buf); if (reg->Register.Indirect) { - addr = ctx->soa.addr[ireg->Index][ireg->Swizzle]; + addr = ctx->addrs[ireg->Index][ireg->Swizzle]; addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg"); addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16); addr = lp_build_add(&bld_base->uint_bld, addr, @@ -1868,8 +1728,7 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, LLVMValueRef *args) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct lp_build_context *uint = - &ctx->soa.bld_base.uint_bld; + struct lp_build_context *uint = &ctx->bld_base.uint_bld; struct lp_build_context *base = &bld_base->base; struct gallivm_state *gallivm = base->gallivm; LLVMBuilderRef builder = base->gallivm->builder; @@ -1891,13 +1750,13 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, args[3] = lp_build_const_int32(base->gallivm, target); if (ctx->type == PIPE_SHADER_FRAGMENT) { - const union si_shader_key *key = &ctx->shader->key; - unsigned col_formats = key->ps.epilog.spi_shader_col_format; + const struct si_shader_key *key = &ctx->shader->key; + unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; int cbuf = target - V_008DFC_SQ_EXP_MRT; assert(cbuf >= 0 && cbuf < 8); spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; - is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1; + is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; } args[4] = uint->zero; /* COMPR flag */ @@ -1942,7 +1801,7 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, packed = lp_build_intrinsic(base->gallivm->builder, "llvm.SI.packf16", ctx->i32, pack_args, 2, - LLVMReadNoneAttribute); + LP_FUNC_ATTR_READNONE); args[chan + 5] = LLVMBuildBitCast(base->gallivm->builder, packed, ctx->f32, ""); @@ -2050,13 +1909,13 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { + if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF); LLVMValueRef alpha_pass = lp_build_cmp(&bld_base->base, - ctx->shader->key.ps.epilog.alpha_func, + ctx->shader->key.part.ps.epilog.alpha_func, alpha, alpha_ref); LLVMValueRef arg = lp_build_select(&bld_base->base, @@ -2087,7 +1946,7 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context * coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32", ctx->i32, - &coverage, 1, LLVMReadNoneAttribute); + &coverage, 1, LP_FUNC_ATTR_READNONE); coverage = LLVMBuildUIToFP(gallivm->builder, coverage, ctx->f32, ""); @@ -2104,7 +1963,7 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base, { struct si_shader_context *ctx = si_shader_context(bld_base); struct lp_build_context *base = &bld_base->base; - struct lp_build_context *uint = &ctx->soa.bld_base.uint_bld; + struct lp_build_context *uint = &ctx->bld_base.uint_bld; unsigned reg_index; unsigned chan; unsigned const_chan; @@ -2112,7 +1971,7 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base, LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, SI_VS_CONST_CLIP_PLANES); - LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index); + LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index); for (reg_index = 0; reg_index < 2; reg_index ++) { LLVMValueRef *args = pos[2 + reg_index]; @@ -2167,44 +2026,82 @@ static void si_dump_streamout(struct pipe_stream_output_info *so) } } -/* On SI, the vertex shader is responsible for writing streamout data - * to buffers. */ -static void si_llvm_emit_streamout(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) +static void emit_streamout_output(struct si_shader_context *ctx, + LLVMValueRef const *so_buffers, + LLVMValueRef const *so_write_offsets, + struct pipe_stream_output *stream_out, + struct si_shader_output_values *shader_out) { - struct pipe_stream_output_info *so = &ctx->shader->selector->so; struct gallivm_state *gallivm = &ctx->gallivm; LLVMBuilderRef builder = gallivm->builder; - int i, j; - struct lp_build_if_state if_ctx; - LLVMValueRef so_buffers[4]; - LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, - SI_PARAM_RW_BUFFERS); + unsigned buf_idx = stream_out->output_buffer; + unsigned start = stream_out->start_component; + unsigned num_comps = stream_out->num_components; + LLVMValueRef out[4]; - /* Load the descriptors. */ - for (i = 0; i < 4; ++i) { - if (ctx->shader->selector->so.stride[i]) { - LLVMValueRef offset = lp_build_const_int32(gallivm, - SI_VS_STREAMOUT_BUF0 + i); + assert(num_comps && num_comps <= 4); + if (!num_comps || num_comps > 4) + return; + + /* Load the output as int. */ + for (int j = 0; j < num_comps; j++) { + assert(stream_out->stream == shader_out->vertex_stream[start + j]); + + out[j] = LLVMBuildBitCast(builder, + shader_out->values[start + j], + ctx->i32, ""); + } + + /* Pack the output. */ + LLVMValueRef vdata = NULL; - so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset); + switch (num_comps) { + case 1: /* as i32 */ + vdata = out[0]; + break; + case 2: /* as v2i32 */ + case 3: /* as v4i32 (aligned to 4) */ + case 4: /* as v4i32 */ + vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps))); + for (int j = 0; j < num_comps; j++) { + vdata = LLVMBuildInsertElement(builder, vdata, out[j], + LLVMConstInt(ctx->i32, j, 0), ""); } + break; } + ac_build_tbuffer_store_dwords(&ctx->ac, so_buffers[buf_idx], + vdata, num_comps, + so_write_offsets[buf_idx], + LLVMConstInt(ctx->i32, 0, 0), + stream_out->dst_offset * 4); +} + +/** + * Write streamout data to buffers for vertex stream @p stream (different + * vertex streams can occur for GS copy shaders). + */ +static void si_llvm_emit_streamout(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput, unsigned stream) +{ + struct si_shader_selector *sel = ctx->shader->selector; + struct pipe_stream_output_info *so = &sel->so; + struct gallivm_state *gallivm = &ctx->gallivm; + LLVMBuilderRef builder = gallivm->builder; + int i; + struct lp_build_if_state if_ctx; + /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ LLVMValueRef so_vtx_count = unpack_param(ctx, ctx->param_streamout_config, 16, 7); - LLVMValueRef tid = get_thread_id(ctx); + LLVMValueRef tid = ac_get_thread_id(&ctx->ac); /* can_emit = tid < so_vtx_count; */ LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); - LLVMValueRef stream_id = - unpack_param(ctx, ctx->param_streamout_config, 24, 2); - /* Emit the streamout code conditionally. This actually avoids * out-of-bounds buffer access. The hw tells us via the SGPR * (so_vtx_count) which threads are allowed to emit streamout data. */ @@ -2223,12 +2120,22 @@ static void si_llvm_emit_streamout(struct si_shader_context *ctx, /* Compute (streamout_write_index + thread_id). */ so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); - /* Compute the write offset for each enabled buffer. */ + /* Load the descriptor and compute the write offset for each + * enabled buffer. */ LLVMValueRef so_write_offset[4] = {}; + LLVMValueRef so_buffers[4]; + LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, + SI_PARAM_RW_BUFFERS); + for (i = 0; i < 4; i++) { if (!so->stride[i]) continue; + LLVMValueRef offset = lp_build_const_int32(gallivm, + SI_VS_STREAMOUT_BUF0 + i); + + so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset); + LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn, ctx->param_streamout_offset[i]); so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), ""); @@ -2240,58 +2147,16 @@ static void si_llvm_emit_streamout(struct si_shader_context *ctx, /* Write streamout data. */ for (i = 0; i < so->num_outputs; i++) { - unsigned buf_idx = so->output[i].output_buffer; unsigned reg = so->output[i].register_index; - unsigned start = so->output[i].start_component; - unsigned num_comps = so->output[i].num_components; - unsigned stream = so->output[i].stream; - LLVMValueRef out[4]; - struct lp_build_if_state if_ctx_stream; - - assert(num_comps && num_comps <= 4); - if (!num_comps || num_comps > 4) - continue; if (reg >= noutput) continue; - /* Load the output as int. */ - for (j = 0; j < num_comps; j++) { - out[j] = LLVMBuildBitCast(builder, - outputs[reg].values[start+j], - ctx->i32, ""); - } - - /* Pack the output. */ - LLVMValueRef vdata = NULL; - - switch (num_comps) { - case 1: /* as i32 */ - vdata = out[0]; - break; - case 2: /* as v2i32 */ - case 3: /* as v4i32 (aligned to 4) */ - case 4: /* as v4i32 */ - vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps))); - for (j = 0; j < num_comps; j++) { - vdata = LLVMBuildInsertElement(builder, vdata, out[j], - LLVMConstInt(ctx->i32, j, 0), ""); - } - break; - } + if (stream != so->output[i].stream) + continue; - LLVMValueRef can_emit_stream = - LLVMBuildICmp(builder, LLVMIntEQ, - stream_id, - lp_build_const_int32(gallivm, stream), ""); - - lp_build_if(&if_ctx_stream, gallivm, can_emit_stream); - build_tbuffer_store_dwords(ctx, so_buffers[buf_idx], - vdata, num_comps, - so_write_offset[buf_idx], - LLVMConstInt(ctx->i32, 0, 0), - so->output[i].dst_offset*4); - lp_build_endif(&if_ctx_stream); + emit_streamout_output(ctx, so_buffers, so_write_offset, + &so->output[i], &outputs[reg]); } } lp_build_endif(&if_ctx); @@ -2306,8 +2171,7 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base, struct si_shader_context *ctx = si_shader_context(bld_base); struct si_shader *shader = ctx->shader; struct lp_build_context *base = &bld_base->base; - struct lp_build_context *uint = - &ctx->soa.bld_base.uint_bld; + struct lp_build_context *uint = &ctx->bld_base.uint_bld; LLVMValueRef args[9]; LLVMValueRef pos_args[4][9] = { { 0 } }; LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; @@ -2317,13 +2181,35 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base, unsigned pos_idx; int i; - if (outputs && ctx->shader->selector->so.num_outputs) { - si_llvm_emit_streamout(ctx, outputs, noutput); - } - for (i = 0; i < noutput; i++) { - semantic_name = outputs[i].name; - semantic_index = outputs[i].sid; + semantic_name = outputs[i].semantic_name; + semantic_index = outputs[i].semantic_index; + bool export_param = true; + + switch (semantic_name) { + case TGSI_SEMANTIC_POSITION: /* ignore these */ + case TGSI_SEMANTIC_PSIZE: + case TGSI_SEMANTIC_CLIPVERTEX: + case TGSI_SEMANTIC_EDGEFLAG: + break; + case TGSI_SEMANTIC_GENERIC: + case TGSI_SEMANTIC_CLIPDIST: + if (shader->key.opt.hw_vs.kill_outputs & + (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index))) + export_param = false; + break; + default: + if (shader->key.opt.hw_vs.kill_outputs2 & + (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index))) + export_param = false; + break; + } + + if (outputs[i].vertex_stream[0] != 0 && + outputs[i].vertex_stream[1] != 0 && + outputs[i].vertex_stream[2] != 0 && + outputs[i].vertex_stream[3] != 0) + export_param = false; handle_semantic: /* Select the correct target */ @@ -2345,23 +2231,26 @@ handle_semantic: case TGSI_SEMANTIC_POSITION: target = V_008DFC_SQ_EXP_POS; break; - case TGSI_SEMANTIC_COLOR: - case TGSI_SEMANTIC_BCOLOR: - target = V_008DFC_SQ_EXP_PARAM + param_count; - assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); - shader->info.vs_output_param_offset[i] = param_count; - param_count++; - break; case TGSI_SEMANTIC_CLIPDIST: + if (shader->key.opt.hw_vs.clip_disable) { + semantic_name = TGSI_SEMANTIC_GENERIC; + goto handle_semantic; + } target = V_008DFC_SQ_EXP_POS + 2 + semantic_index; break; case TGSI_SEMANTIC_CLIPVERTEX: + if (shader->key.opt.hw_vs.clip_disable) + continue; si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values); continue; + case TGSI_SEMANTIC_COLOR: + case TGSI_SEMANTIC_BCOLOR: case TGSI_SEMANTIC_PRIMID: case TGSI_SEMANTIC_FOG: case TGSI_SEMANTIC_TEXCOORD: case TGSI_SEMANTIC_GENERIC: + if (!export_param) + continue; target = V_008DFC_SQ_EXP_PARAM + param_count; assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); shader->info.vs_output_param_offset[i] = param_count; @@ -2473,6 +2362,10 @@ handle_semantic: } } +/** + * Forward all outputs from the vertex shader to the TES. This is only used + * for the fixed function TCS. + */ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); @@ -2484,7 +2377,7 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); rw_buffers = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); - buffer = build_indexed_load_const(ctx, rw_buffers, + buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP)); buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); @@ -2495,7 +2388,7 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) lds_base = get_tcs_in_current_patch_offset(ctx); lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, ""); - inputs = ctx->shader->key.tcs.epilog.inputs_to_copy; + inputs = ctx->shader->key.mono.tcs.inputs_to_copy; while (inputs) { unsigned i = u_bit_scan64(&inputs); @@ -2510,8 +2403,8 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0, lds_ptr); - build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr, - buffer_offset, 0); + ac_build_tbuffer_store_dwords(&ctx->ac, buffer, value, 4, buffer_addr, + buffer_offset, 0); } } @@ -2542,7 +2435,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, invocation_id, bld_base->uint_bld.zero, "")); /* Determine the layout of one tess factor element in the buffer. */ - switch (shader->key.tcs.epilog.prim_mode) { + switch (shader->key.part.tcs.epilog.prim_mode) { case PIPE_PRIM_LINES: stride = 2; /* 2 dwords, 1 vec2 store */ outer_comps = 2; @@ -2577,10 +2470,18 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, lp_build_const_int32(gallivm, tess_outer_index * 4), ""); - for (i = 0; i < outer_comps; i++) - out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer); - for (i = 0; i < inner_comps; i++) - out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner); + if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { + /* For isolines, the hardware expects tess factors in the + * reverse order from what GLSL / TGSI specify. + */ + out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer); + out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer); + } else { + for (i = 0; i < outer_comps; i++) + out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer); + for (i = 0; i < inner_comps; i++) + out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner); + } /* Convert the outputs to vectors for stores. */ vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4)); @@ -2592,7 +2493,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, /* Get the buffer. */ rw_buffers = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); - buffer = build_indexed_load_const(ctx, rw_buffers, + buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR)); /* Get the offset. */ @@ -2606,18 +2507,18 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, rel_patch_id, bld_base->uint_bld.zero, "")); /* Store the dynamic HS control word. */ - build_tbuffer_store_dwords(ctx, buffer, - lp_build_const_int32(gallivm, 0x80000000), - 1, lp_build_const_int32(gallivm, 0), tf_base, 0); + ac_build_tbuffer_store_dwords(&ctx->ac, buffer, + lp_build_const_int32(gallivm, 0x80000000), + 1, lp_build_const_int32(gallivm, 0), tf_base, 0); lp_build_endif(&inner_if_ctx); /* Store the tessellation factors. */ - build_tbuffer_store_dwords(ctx, buffer, vec0, - MIN2(stride, 4), byteoffset, tf_base, 4); + ac_build_tbuffer_store_dwords(&ctx->ac, buffer, vec0, + MIN2(stride, 4), byteoffset, tf_base, 4); if (vec1) - build_tbuffer_store_dwords(ctx, buffer, vec1, - stride - 4, byteoffset, tf_base, 20); + ac_build_tbuffer_store_dwords(&ctx->ac, buffer, vec1, + stride - 4, byteoffset, tf_base, 20); lp_build_endif(&if_ctx); } @@ -2627,50 +2528,46 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; + si_copy_tcs_inputs(bld_base); + rel_patch_id = get_rel_patch_id(ctx); invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); - if (!ctx->no_epilog) { - /* Return epilog parameters from this function. */ - LLVMBuilderRef builder = bld_base->base.gallivm->builder; - LLVMValueRef ret = ctx->return_value; - LLVMValueRef rw_buffers, rw0, rw1, tf_soffset; - unsigned vgpr; - - /* RW_BUFFERS pointer */ - rw_buffers = LLVMGetParam(ctx->main_fn, - SI_PARAM_RW_BUFFERS); - rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, ""); - rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, ""); - rw0 = LLVMBuildExtractElement(builder, rw_buffers, - bld_base->uint_bld.zero, ""); - rw1 = LLVMBuildExtractElement(builder, rw_buffers, - bld_base->uint_bld.one, ""); - ret = LLVMBuildInsertValue(builder, ret, rw0, 0, ""); - ret = LLVMBuildInsertValue(builder, ret, rw1, 1, ""); - - /* Tess factor buffer soffset is after user SGPRs. */ - tf_soffset = LLVMGetParam(ctx->main_fn, - SI_PARAM_TESS_FACTOR_OFFSET); - ret = LLVMBuildInsertValue(builder, ret, tf_soffset, - SI_TCS_NUM_USER_SGPR + 1, ""); - - /* VGPRs */ - rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id); - invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id); - tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset); - - vgpr = SI_TCS_NUM_USER_SGPR + 2; - ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); - ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); - ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); - ctx->return_value = ret; - return; - } + /* Return epilog parameters from this function. */ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef ret = ctx->return_value; + LLVMValueRef rw_buffers, rw0, rw1, tf_soffset; + unsigned vgpr; - si_copy_tcs_inputs(bld_base); - si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset); + /* RW_BUFFERS pointer */ + rw_buffers = LLVMGetParam(ctx->main_fn, + SI_PARAM_RW_BUFFERS); + rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, ""); + rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, ""); + rw0 = LLVMBuildExtractElement(builder, rw_buffers, + bld_base->uint_bld.zero, ""); + rw1 = LLVMBuildExtractElement(builder, rw_buffers, + bld_base->uint_bld.one, ""); + ret = LLVMBuildInsertValue(builder, ret, rw0, 0, ""); + ret = LLVMBuildInsertValue(builder, ret, rw1, 1, ""); + + /* Tess factor buffer soffset is after user SGPRs. */ + tf_soffset = LLVMGetParam(ctx->main_fn, + SI_PARAM_TESS_FACTOR_OFFSET); + ret = LLVMBuildInsertValue(builder, ret, tf_soffset, + SI_TCS_NUM_USER_SGPR + 1, ""); + + /* VGPRs */ + rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id); + invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id); + tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset); + + vgpr = SI_TCS_NUM_USER_SGPR + 2; + ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + ctx->return_value = ret; } static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base) @@ -2690,7 +2587,7 @@ static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base) /* Write outputs to LDS. The next shader (TCS aka HS) will read * its inputs from it. */ for (i = 0; i < info->num_outputs; i++) { - LLVMValueRef *out_ptr = ctx->soa.outputs[i]; + LLVMValueRef *out_ptr = ctx->outputs[i]; unsigned name = info->output_semantic_name[i]; unsigned index = info->output_semantic_index[i]; int param = si_shader_io_get_unique_index(name, index); @@ -2716,8 +2613,7 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base) int i; for (i = 0; i < info->num_outputs; i++) { - LLVMValueRef *out_ptr = - ctx->soa.outputs[i]; + LLVMValueRef *out_ptr = ctx->outputs[i]; int param_index; if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || @@ -2731,14 +2627,14 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base) LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""); out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, ""); - build_tbuffer_store(ctx, - ctx->esgs_ring, - out_val, 1, - LLVMGetUndef(ctx->i32), soffset, - (4 * param_index + chan) * 4, - V_008F0C_BUF_DATA_FORMAT_32, - V_008F0C_BUF_NUM_FORMAT_UINT, - 0, 0, 1, 1, 0); + ac_build_tbuffer_store(&ctx->ac, + ctx->esgs_ring, + out_val, 1, + LLVMGetUndef(ctx->i32), soffset, + (4 * param_index + chan) * 4, + V_008F0C_BUF_DATA_FORMAT_32, + V_008F0C_BUF_NUM_FORMAT_UINT, + 0, 0, 1, 1, 0); } } } @@ -2746,13 +2642,9 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base) static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMValueRef args[2]; - args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE); - args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID); - lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", - ctx->voidt, args, 2, 0); + ac_emit_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, + LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID)); } static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) @@ -2763,7 +2655,7 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) struct si_shader_output_values *outputs = NULL; int i,j; - assert(!ctx->is_gs_copy_shader); + assert(!ctx->shader->is_gs_copy_shader); outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); @@ -2794,7 +2686,7 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) } for (j = 0; j < 4; j++) { - addr = ctx->soa.outputs[i][j]; + addr = ctx->outputs[i][j]; val = LLVMBuildLoad(gallivm->builder, addr, ""); val = si_llvm_saturate(bld_base, val); LLVMBuildStore(gallivm->builder, val, addr); @@ -2806,38 +2698,30 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) } for (i = 0; i < info->num_outputs; i++) { - outputs[i].name = info->output_semantic_name[i]; - outputs[i].sid = info->output_semantic_index[i]; + outputs[i].semantic_name = info->output_semantic_name[i]; + outputs[i].semantic_index = info->output_semantic_index[i]; - for (j = 0; j < 4; j++) + for (j = 0; j < 4; j++) { outputs[i].values[j] = LLVMBuildLoad(gallivm->builder, - ctx->soa.outputs[i][j], + ctx->outputs[i][j], ""); - } - - if (ctx->no_epilog) { - /* Export PrimitiveID when PS needs it. */ - if (si_vs_exports_prim_id(ctx->shader)) { - outputs[i].name = TGSI_SEMANTIC_PRIMID; - outputs[i].sid = 0; - outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, - get_primitive_id(bld_base, 0)); - outputs[i].values[1] = bld_base->base.undef; - outputs[i].values[2] = bld_base->base.undef; - outputs[i].values[3] = bld_base->base.undef; - i++; + outputs[i].vertex_stream[j] = + (info->output_streams[i] >> (2 * j)) & 3; } - } else { - /* Return the primitive ID from the LLVM function. */ - ctx->return_value = - LLVMBuildInsertValue(gallivm->builder, - ctx->return_value, - bitcast(bld_base, TGSI_TYPE_FLOAT, - get_primitive_id(bld_base, 0)), - VS_EPILOG_PRIMID_LOC, ""); + } + /* Return the primitive ID from the LLVM function. */ + ctx->return_value = + LLVMBuildInsertValue(gallivm->builder, + ctx->return_value, + bitcast(bld_base, TGSI_TYPE_FLOAT, + get_primitive_id(bld_base, 0)), + VS_EPILOG_PRIMID_LOC, ""); + + if (ctx->shader->selector->so.num_outputs) + si_llvm_emit_streamout(ctx, outputs, i, 0); si_llvm_export_vs(bld_base, outputs, i); FREE(outputs); } @@ -2925,10 +2809,11 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, } } - /* SI (except OLAND) has a bug that it only looks + /* SI (except OLAND and HAINAN) has a bug that it only looks * at the X writemask component. */ if (ctx->screen->b.chip_class == SI && - ctx->screen->b.family != CHIP_OLAND) + ctx->screen->b.family != CHIP_OLAND && + ctx->screen->b.family != CHIP_HAINAN) mask |= 0x1; /* Specify which components to enable */ @@ -2947,31 +2832,31 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, int i; /* Clamp color */ - if (ctx->shader->key.ps.epilog.clamp_color) + if (ctx->shader->key.part.ps.epilog.clamp_color) for (i = 0; i < 4; i++) color[i] = si_llvm_saturate(bld_base, color[i]); /* Alpha to one */ - if (ctx->shader->key.ps.epilog.alpha_to_one) + if (ctx->shader->key.part.ps.epilog.alpha_to_one) color[3] = base->one; /* Alpha test */ if (index == 0 && - ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) + ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) si_alpha_test(bld_base, color[3]); /* Line & polygon smoothing */ - if (ctx->shader->key.ps.epilog.poly_line_smoothing) + if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3], samplemask_param); /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (ctx->shader->key.ps.epilog.last_cbuf > 0) { + if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { LLVMValueRef args[8][9]; int c, last = -1; /* Get the export arguments, also find out what the last one is. */ - for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) { + for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + c, args[c]); if (args[c][0] != bld_base->uint_bld.zero) @@ -2979,7 +2864,7 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, } /* Emit all exports. */ - for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) { + for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { if (is_last && last == c) { args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */ args[c][2] = bld_base->uint_bld.one; /* DONE bit */ @@ -3034,98 +2919,6 @@ static void si_export_null(struct lp_build_tgsi_context *bld_base) ctx->voidt, args, 9, 0); } -static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct si_shader *shader = ctx->shader; - struct lp_build_context *base = &bld_base->base; - struct tgsi_shader_info *info = &shader->selector->info; - LLVMBuilderRef builder = base->gallivm->builder; - LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; - int last_color_export = -1; - int i; - struct si_ps_exports exp = {}; - - /* Determine the last export. If MRTZ is present, it's always last. - * Otherwise, find the last color export. - */ - if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) { - unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format; - - /* Don't export NULL and return if alpha-test is enabled. */ - if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS && - shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER && - (spi_format & 0xf) == 0) - spi_format |= V_028714_SPI_SHADER_32_AR; - - for (i = 0; i < info->num_outputs; i++) { - unsigned index = info->output_semantic_index[i]; - - if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR) - continue; - - /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (shader->key.ps.epilog.last_cbuf > 0) { - /* Just set this if any of the colorbuffers are enabled. */ - if (spi_format & - ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1)) - last_color_export = i; - continue; - } - - if ((spi_format >> (index * 4)) & 0xf) - last_color_export = i; - } - - /* If there are no outputs, export NULL. */ - if (last_color_export == -1) { - si_export_null(bld_base); - return; - } - } - - for (i = 0; i < info->num_outputs; i++) { - unsigned semantic_name = info->output_semantic_name[i]; - unsigned semantic_index = info->output_semantic_index[i]; - unsigned j; - LLVMValueRef color[4] = {}; - - /* Select the correct target */ - switch (semantic_name) { - case TGSI_SEMANTIC_POSITION: - depth = LLVMBuildLoad(builder, - ctx->soa.outputs[i][2], ""); - break; - case TGSI_SEMANTIC_STENCIL: - stencil = LLVMBuildLoad(builder, - ctx->soa.outputs[i][1], ""); - break; - case TGSI_SEMANTIC_SAMPLEMASK: - samplemask = LLVMBuildLoad(builder, - ctx->soa.outputs[i][0], ""); - break; - case TGSI_SEMANTIC_COLOR: - for (j = 0; j < 4; j++) - color[j] = LLVMBuildLoad(builder, - ctx->soa.outputs[i][j], ""); - - si_export_mrt_color(bld_base, color, semantic_index, - SI_PARAM_SAMPLE_COVERAGE, - last_color_export == i, &exp); - break; - default: - fprintf(stderr, - "Warning: SI unhandled fs output type:%d\n", - semantic_name); - } - } - - if (depth || stencil || samplemask) - si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp); - - si_emit_ps_exports(ctx, &exp); -} - /** * Return PS outputs in this order: * @@ -3161,22 +2954,22 @@ static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) case TGSI_SEMANTIC_COLOR: assert(semantic_index < 8); for (j = 0; j < 4; j++) { - LLVMValueRef ptr = ctx->soa.outputs[i][j]; + LLVMValueRef ptr = ctx->outputs[i][j]; LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); color[semantic_index][j] = result; } break; case TGSI_SEMANTIC_POSITION: depth = LLVMBuildLoad(builder, - ctx->soa.outputs[i][2], ""); + ctx->outputs[i][2], ""); break; case TGSI_SEMANTIC_STENCIL: stencil = LLVMBuildLoad(builder, - ctx->soa.outputs[i][1], ""); + ctx->outputs[i][1], ""); break; case TGSI_SEMANTIC_SAMPLEMASK: samplemask = LLVMBuildLoad(builder, - ctx->soa.outputs[i][0], ""); + ctx->outputs[i][0], ""); break; default: fprintf(stderr, "Warning: SI unhandled fs output type:%d\n", @@ -3233,7 +3026,7 @@ static LLVMValueRef get_buffer_size( LLVMBuilderRef builder = gallivm->builder; LLVMValueRef size = LLVMBuildExtractElement(builder, descriptor, - lp_build_const_int32(gallivm, 6), ""); + lp_build_const_int32(gallivm, 2), ""); if (ctx->screen->b.chip_class >= VI) { /* On VI, the descriptor contains the size in bytes, @@ -3242,7 +3035,7 @@ static LLVMValueRef get_buffer_size( */ LLVMValueRef stride = LLVMBuildExtractElement(builder, descriptor, - lp_build_const_int32(gallivm, 5), ""); + lp_build_const_int32(gallivm, 1), ""); stride = LLVMBuildLShr(builder, stride, lp_build_const_int32(gallivm, 16), ""); stride = LLVMBuildAnd(builder, stride, @@ -3301,6 +3094,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, * point in the program by emitting empty inline assembly that is marked as * having side effects. */ +#if 0 /* unused currently */ static void emit_optimization_barrier(struct si_shader_context *ctx) { LLVMBuilderRef builder = ctx->gallivm.builder; @@ -3308,13 +3102,19 @@ static void emit_optimization_barrier(struct si_shader_context *ctx) LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false); LLVMBuildCall(builder, inlineasm, NULL, 0, ""); } +#endif + +/* Combine these with & instead of |. */ +#define NOOP_WAITCNT 0xf7f +#define LGKM_CNT 0x07f +#define VM_CNT 0xf70 -static void emit_waitcnt(struct si_shader_context *ctx) +static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16) { struct gallivm_state *gallivm = &ctx->gallivm; LLVMBuilderRef builder = gallivm->builder; LLVMValueRef args[1] = { - lp_build_const_int32(gallivm, 0xf70) + lp_build_const_int32(gallivm, simm16) }; lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0); @@ -3326,8 +3126,23 @@ static void membar_emit( struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); + LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0); + unsigned flags = LLVMConstIntGetZExtValue(src0); + unsigned waitcnt = NOOP_WAITCNT; - emit_waitcnt(ctx); + if (flags & TGSI_MEMBAR_THREAD_GROUP) + waitcnt &= VM_CNT & LGKM_CNT; + + if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER | + TGSI_MEMBAR_SHADER_BUFFER | + TGSI_MEMBAR_SHADER_IMAGE)) + waitcnt &= VM_CNT; + + if (flags & TGSI_MEMBAR_SHARED) + waitcnt &= LGKM_CNT; + + if (waitcnt != NOOP_WAITCNT) + emit_waitcnt(ctx, waitcnt); } static LLVMValueRef @@ -3345,7 +3160,7 @@ shader_buffer_fetch_rsrc(struct si_shader_context *ctx, reg->Register.Index, SI_NUM_SHADER_BUFFERS); - return build_indexed_load_const(ctx, rsrc_ptr, index); + return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index); } static bool tgsi_is_array_sampler(unsigned target) @@ -3397,6 +3212,12 @@ static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, } } +static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) +{ + return LLVMPointerType(LLVMArrayType(elem_type, num_elements), + CONST_ADDR_SPACE); +} + /** * Load the resource descriptor for \p image. */ @@ -3404,13 +3225,14 @@ static void image_fetch_rsrc( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_src_register *image, - bool dcc_off, + bool is_store, unsigned target, LLVMValueRef *rsrc) { struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_IMAGES); LLVMValueRef index, tmp; + bool dcc_off = target != TGSI_TEXTURE_BUFFER && is_store; assert(image->Register.File == TGSI_FILE_IMAGE); @@ -3420,7 +3242,7 @@ image_fetch_rsrc( index = LLVMConstInt(ctx->i32, image->Register.Index, 0); if (info->images_writemask & (1 << image->Register.Index) && - !(info->images_buffers & (1 << image->Register.Index))) + target != TGSI_TEXTURE_BUFFER) dcc_off = true; } else { /* From the GL_ARB_shader_image_load_store extension spec: @@ -3437,7 +3259,20 @@ image_fetch_rsrc( SI_NUM_IMAGES); } - tmp = build_indexed_load_const(ctx, rsrc_ptr, index); + if (target == TGSI_TEXTURE_BUFFER) { + LLVMBuilderRef builder = ctx->gallivm.builder; + + rsrc_ptr = LLVMBuildPointerCast(builder, rsrc_ptr, + const_array(ctx->v4i32, 0), ""); + index = LLVMBuildMul(builder, index, + LLVMConstInt(ctx->i32, 2, 0), ""); + index = LLVMBuildAdd(builder, index, + LLVMConstInt(ctx->i32, 1, 0), ""); + *rsrc = ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index); + return; + } + + tmp = ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index); if (dcc_off) tmp = force_dcc_off(ctx, tmp); *rsrc = tmp; @@ -3481,7 +3316,8 @@ static void image_append_args( struct si_shader_context *ctx, struct lp_build_emit_data * emit_data, unsigned target, - bool atomic) + bool atomic, + bool force_glc) { const struct tgsi_full_instruction *inst = emit_data->inst; LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); @@ -3489,6 +3325,7 @@ static void image_append_args( LLVMValueRef r128 = i1false; LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false; LLVMValueRef glc = + force_glc || inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ? i1true : i1false; LLVMValueRef slc = i1false; @@ -3511,25 +3348,6 @@ static void image_append_args( emit_data->args[emit_data->arg_count++] = da; } -/** - * Given a 256 bit resource, extract the top half (which stores the buffer - * resource in the case of textures and images). - */ -static LLVMValueRef extract_rsrc_top_half( - struct si_shader_context *ctx, - LLVMValueRef rsrc) -{ - struct gallivm_state *gallivm = &ctx->gallivm; - struct lp_build_tgsi_context *bld_base = &ctx->soa.bld_base; - LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2); - - rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, ""); - rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, ""); - rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""); - - return rsrc; -} - /** * Append the resource and indexing arguments for buffer intrinsics. * @@ -3543,7 +3361,8 @@ static void buffer_append_args( LLVMValueRef rsrc, LLVMValueRef index, LLVMValueRef offset, - bool atomic) + bool atomic, + bool force_glc) { const struct tgsi_full_instruction *inst = emit_data->inst; LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); @@ -3554,6 +3373,7 @@ static void buffer_append_args( emit_data->args[emit_data->arg_count++] = offset; /* voffset */ if (!atomic) { emit_data->args[emit_data->arg_count++] = + force_glc || inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ? i1true : i1false; /* glc */ } @@ -3583,24 +3403,23 @@ static void load_fetch_args( offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, - offset, false); + offset, false, false); } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) { LLVMValueRef coords; - image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc); + image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc); coords = image_fetch_coords(bld_base, inst, 1); if (target == TGSI_TEXTURE_BUFFER) { - rsrc = extract_rsrc_top_half(ctx, rsrc); buffer_append_args(ctx, emit_data, rsrc, coords, - bld_base->uint_bld.zero, false); + bld_base->uint_bld.zero, false, false); } else { emit_data->args[0] = coords; emit_data->args[1] = rsrc; emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */ emit_data->arg_count = 3; - image_append_args(ctx, emit_data, target, false); + image_append_args(ctx, emit_data, target, false, false); } } } @@ -3634,7 +3453,7 @@ static void load_emit_buffer(struct si_shader_context *ctx, emit_data->output[emit_data->chan] = lp_build_intrinsic( builder, intrinsic_name, dst_type, emit_data->args, emit_data->arg_count, - LLVMReadOnlyAttribute); + LP_FUNC_ATTR_READONLY); } static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx, @@ -3646,7 +3465,7 @@ static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx, LLVMValueRef offset, ptr; int addr_space; - offset = lp_build_emit_fetch(&ctx->soa.bld_base, inst, arg, 0); + offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0); offset = LLVMBuildBitCast(builder, offset, ctx->i32, ""); ptr = ctx->shared_memory; @@ -3662,7 +3481,7 @@ static void load_emit_memory( struct lp_build_emit_data *emit_data) { const struct tgsi_full_instruction *inst = emit_data->inst; - struct lp_build_context *base = &ctx->soa.bld_base.base; + struct lp_build_context *base = &ctx->bld_base.base; struct gallivm_state *gallivm = &ctx->gallivm; LLVMBuilderRef builder = gallivm->builder; unsigned writemask = inst->Dst[0].Register.WriteMask; @@ -3727,7 +3546,7 @@ static void load_emit( } if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) - emit_waitcnt(ctx); + emit_waitcnt(ctx, VM_CNT); if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { load_emit_buffer(ctx, emit_data); @@ -3739,7 +3558,7 @@ static void load_emit( lp_build_intrinsic( builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type, emit_data->args, emit_data->arg_count, - LLVMReadOnlyAttribute); + LP_FUNC_ATTR_READONLY); } else { get_image_intr_name("llvm.amdgcn.image.load", emit_data->dst_type, /* vdata */ @@ -3751,7 +3570,7 @@ static void load_emit( lp_build_intrinsic( builder, intrinsic_name, emit_data->dst_type, emit_data->args, emit_data->arg_count, - LLVMReadOnlyAttribute); + LP_FUNC_ATTR_READONLY); } } @@ -3790,26 +3609,33 @@ static void store_fetch_args( offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, - offset, false); + offset, false, false); } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) { unsigned target = inst->Memory.Texture; LLVMValueRef coords; + /* 8bit/16bit TC L1 write corruption bug on SI. + * All store opcodes not aligned to a dword are affected. + * + * The only way to get unaligned stores in radeonsi is through + * shader images. + */ + bool force_glc = ctx->screen->b.chip_class == SI; + coords = image_fetch_coords(bld_base, inst, 0); if (target == TGSI_TEXTURE_BUFFER) { - image_fetch_rsrc(bld_base, &memory, false, &rsrc); - - rsrc = extract_rsrc_top_half(ctx, rsrc); + image_fetch_rsrc(bld_base, &memory, true, target, &rsrc); buffer_append_args(ctx, emit_data, rsrc, coords, - bld_base->uint_bld.zero, false); + bld_base->uint_bld.zero, false, force_glc); } else { emit_data->args[1] = coords; - image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]); + image_fetch_rsrc(bld_base, &memory, true, target, + &emit_data->args[2]); emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */ emit_data->arg_count = 4; - image_append_args(ctx, emit_data, target, false); + image_append_args(ctx, emit_data, target, false, force_glc); } } } @@ -3821,7 +3647,7 @@ static void store_emit_buffer( const struct tgsi_full_instruction *inst = emit_data->inst; struct gallivm_state *gallivm = &ctx->gallivm; LLVMBuilderRef builder = gallivm->builder; - struct lp_build_context *uint_bld = &ctx->soa.bld_base.uint_bld; + struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld; LLVMValueRef base_data = emit_data->args[0]; LLVMValueRef base_offset = emit_data->args[3]; unsigned writemask = inst->Dst[0].Register.WriteMask; @@ -3892,7 +3718,7 @@ static void store_emit_memory( { const struct tgsi_full_instruction *inst = emit_data->inst; struct gallivm_state *gallivm = &ctx->gallivm; - struct lp_build_context *base = &ctx->soa.bld_base.base; + struct lp_build_context *base = &ctx->bld_base.base; LLVMBuilderRef builder = gallivm->builder; unsigned writemask = inst->Dst[0].Register.WriteMask; LLVMValueRef ptr, derived_ptr, data, index; @@ -3904,7 +3730,7 @@ static void store_emit_memory( if (!(writemask & (1 << chan))) { continue; } - data = lp_build_emit_fetch(&ctx->soa.bld_base, inst, 1, chan); + data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan); index = lp_build_const_int32(gallivm, chan); derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); LLVMBuildStore(builder, data, derived_ptr); @@ -3929,7 +3755,7 @@ static void store_emit( } if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) - emit_waitcnt(ctx); + emit_waitcnt(ctx, VM_CNT); if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) { store_emit_buffer(ctx, emit_data); @@ -3993,24 +3819,22 @@ static void atomic_fetch_args( offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, - offset, true); + offset, true, false); } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) { unsigned target = inst->Memory.Texture; LLVMValueRef coords; - image_fetch_rsrc(bld_base, &inst->Src[0], - target != TGSI_TEXTURE_BUFFER, &rsrc); + image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc); coords = image_fetch_coords(bld_base, inst, 1); if (target == TGSI_TEXTURE_BUFFER) { - rsrc = extract_rsrc_top_half(ctx, rsrc); buffer_append_args(ctx, emit_data, rsrc, coords, - bld_base->uint_bld.zero, true); + bld_base->uint_bld.zero, true, false); } else { emit_data->args[emit_data->arg_count++] = coords; emit_data->args[emit_data->arg_count++] = rsrc; - image_append_args(ctx, emit_data, target, true); + image_append_args(ctx, emit_data, target, true, false); } } } @@ -4024,12 +3848,12 @@ static void atomic_emit_memory(struct si_shader_context *ctx, ptr = get_memory_ptr(ctx, inst, ctx->i32, 1); - arg = lp_build_emit_fetch(&ctx->soa.bld_base, inst, 2, 0); + arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0); arg = LLVMBuildBitCast(builder, arg, ctx->i32, ""); if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { LLVMValueRef new_data; - new_data = lp_build_emit_fetch(&ctx->soa.bld_base, + new_data = lp_build_emit_fetch(&ctx->bld_base, inst, 3, 0); new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, ""); @@ -4142,11 +3966,13 @@ static void resq_fetch_args( emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg); emit_data->arg_count = 1; } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { - image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]); + image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, + &emit_data->args[0]); emit_data->arg_count = 1; } else { emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */ - image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]); + image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, + &emit_data->args[1]); emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */ emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */ emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */ @@ -4179,7 +4005,7 @@ static void resq_emit( out = lp_build_intrinsic( builder, "llvm.SI.getresinfo.i32", emit_data->dst_type, emit_data->args, emit_data->arg_count, - LLVMReadNoneAttribute); + LP_FUNC_ATTR_READNONE); /* Divide the number of layers by 6 to get the number of cubes. */ if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) { @@ -4245,16 +4071,11 @@ static const struct lp_build_tgsi_action tex_action; enum desc_type { DESC_IMAGE, + DESC_BUFFER, DESC_FMASK, - DESC_SAMPLER + DESC_SAMPLER, }; -static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) -{ - return LLVMPointerType(LLVMArrayType(elem_type, num_elements), - CONST_ADDR_SPACE); -} - /** * Load an image view, fmask view. or sampler state descriptor. */ @@ -4270,6 +4091,13 @@ static LLVMValueRef load_sampler_desc_custom(struct si_shader_context *ctx, /* The image is at [0:7]. */ index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); break; + case DESC_BUFFER: + /* The buffer is in [4:7]. */ + index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); + index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), ""); + list = LLVMBuildPointerCast(builder, list, + const_array(ctx->v4i32, 0), ""); + break; case DESC_FMASK: /* The FMASK is at [8:15]. */ index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); @@ -4284,7 +4112,7 @@ static LLVMValueRef load_sampler_desc_custom(struct si_shader_context *ctx, break; } - return build_indexed_load_const(ctx, list, index); + return ac_build_indexed_load_const(&ctx->ac, list, index); } static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx, @@ -4351,21 +4179,25 @@ static void tex_fetch_ptrs( index = LLVMConstInt(ctx->i32, sampler_index, 0); } - *res_ptr = load_sampler_desc(ctx, index, DESC_IMAGE); + if (target == TGSI_TEXTURE_BUFFER) + *res_ptr = load_sampler_desc(ctx, index, DESC_BUFFER); + else + *res_ptr = load_sampler_desc(ctx, index, DESC_IMAGE); + + if (samp_ptr) + *samp_ptr = NULL; + if (fmask_ptr) + *fmask_ptr = NULL; if (target == TGSI_TEXTURE_2D_MSAA || target == TGSI_TEXTURE_2D_ARRAY_MSAA) { - if (samp_ptr) - *samp_ptr = NULL; if (fmask_ptr) *fmask_ptr = load_sampler_desc(ctx, index, DESC_FMASK); - } else { + } else if (target != TGSI_TEXTURE_BUFFER) { if (samp_ptr) { *samp_ptr = load_sampler_desc(ctx, index, DESC_SAMPLER); *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr); } - if (fmask_ptr) - *fmask_ptr = NULL; } } @@ -4374,8 +4206,6 @@ static void txq_fetch_args( struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMBuilderRef builder = gallivm->builder; const struct tgsi_full_instruction *inst = emit_data->inst; unsigned target = inst->Texture.Texture; LLVMValueRef res_ptr; @@ -4385,8 +4215,7 @@ static void txq_fetch_args( if (target == TGSI_TEXTURE_BUFFER) { /* Read the size from the buffer descriptor directly. */ - LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, ""); - emit_data->args[0] = get_buffer_size(bld_base, res); + emit_data->args[0] = get_buffer_size(bld_base, res_ptr); return; } @@ -4413,7 +4242,7 @@ static void txq_emit(const struct lp_build_tgsi_action *action, emit_data->output[emit_data->chan] = lp_build_intrinsic( base->gallivm->builder, "llvm.SI.getresinfo.i32", emit_data->dst_type, emit_data->args, emit_data->arg_count, - LLVMReadNoneAttribute); + LP_FUNC_ATTR_READNONE); /* Divide the number of layers by 6 to get the number of cubes. */ if (target == TGSI_TEXTURE_CUBE_ARRAY || @@ -4454,16 +4283,9 @@ static void tex_fetch_args( tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr); if (target == TGSI_TEXTURE_BUFFER) { - LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2); - - /* Bitcast and truncate v8i32 to v16i8. */ - LLVMValueRef res = res_ptr; - res = LLVMBuildBitCast(gallivm->builder, res, v2i128, ""); - res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, ""); - res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, ""); - emit_data->dst_type = ctx->v4f32; - emit_data->args[0] = res; + emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr, + ctx->v16i8, ""); emit_data->args[1] = bld_base->uint_bld.zero; emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); emit_data->arg_count = 3; @@ -4587,7 +4409,11 @@ static void tex_fetch_args( target == TGSI_TEXTURE_CUBE_ARRAY || target == TGSI_TEXTURE_SHADOWCUBE || target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) - si_prepare_cube_coords(bld_base, emit_data, coords, derivs); + ac_prepare_cube_coords(&ctx->ac, + opcode == TGSI_OPCODE_TXD, + target == TGSI_TEXTURE_CUBE_ARRAY || + target == TGSI_TEXTURE_SHADOWCUBE_ARRAY, + coords, derivs); if (opcode == TGSI_OPCODE_TXD) for (int i = 0; i < num_deriv_channels * 2; i++) @@ -4635,16 +4461,12 @@ static void tex_fetch_args( struct lp_build_context *uint_bld = &bld_base->uint_bld; struct lp_build_emit_data txf_emit_data = *emit_data; LLVMValueRef txf_address[4]; - unsigned txf_count = count; + /* We only need .xy for non-arrays, and .xyz for arrays. */ + unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3; struct tgsi_full_instruction inst = {}; memcpy(txf_address, address, sizeof(txf_address)); - if (target == TGSI_TEXTURE_2D_MSAA) { - txf_address[2] = bld_base->uint_bld.zero; - } - txf_address[3] = bld_base->uint_bld.zero; - /* Read FMASK using TXF. */ inst.Instruction.Opcode = TGSI_OPCODE_TXF; inst.Texture.Texture = target; @@ -4665,7 +4487,7 @@ static void tex_fetch_args( txf_emit_data.output[0], uint_bld->zero, ""); - unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3; + unsigned sample_chan = txf_count; /* the sample index is last */ LLVMValueRef sample_index4 = LLVMBuildMul(gallivm->builder, address[sample_chan], four, ""); @@ -4701,7 +4523,6 @@ static void tex_fetch_args( /* add tex offsets */ if (inst->Texture.NumOffsets) { struct lp_build_context *uint_bld = &bld_base->uint_bld; - struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); const struct tgsi_texture_offset *off = inst->TexOffsets; assert(inst->Texture.NumOffsets == 1); @@ -4709,7 +4530,7 @@ static void tex_fetch_args( switch (target) { case TGSI_TEXTURE_3D: address[2] = lp_build_add(uint_bld, address[2], - bld->immediates[off->Index][off->SwizzleZ]); + ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]); /* fall through */ case TGSI_TEXTURE_2D: case TGSI_TEXTURE_SHADOW2D: @@ -4719,7 +4540,7 @@ static void tex_fetch_args( case TGSI_TEXTURE_SHADOW2D_ARRAY: address[1] = lp_build_add(uint_bld, address[1], - bld->immediates[off->Index][off->SwizzleY]); + ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]); /* fall through */ case TGSI_TEXTURE_1D: case TGSI_TEXTURE_SHADOW1D: @@ -4727,7 +4548,7 @@ static void tex_fetch_args( case TGSI_TEXTURE_SHADOW1D_ARRAY: address[0] = lp_build_add(uint_bld, address[0], - bld->immediates[off->Index][off->SwizzleX]); + ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]); break; /* texture offsets do not apply to other texture targets */ } @@ -4747,13 +4568,12 @@ static void tex_fetch_args( /* Get the component index from src1.x for Gather4. */ if (!tgsi_is_shadow_target(target)) { - LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates; LLVMValueRef comp_imm; struct tgsi_src_register src1 = inst->Src[1].Register; assert(src1.File == TGSI_FILE_IMMEDIATE); - comp_imm = imms[src1.Index][src1.SwizzleX]; + comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX]; gather_comp = LLVMConstIntGetZExtValue(comp_imm); gather_comp = CLAMP(gather_comp, 0, 3); } @@ -4798,9 +4618,9 @@ static void si_lower_gather4_integer(struct si_shader_context *ctx, set_tex_fetch_args(ctx, &txq_emit_data, TGSI_OPCODE_TXQ, txq_inst.Texture.Texture, emit_data->args[1], NULL, - &ctx->soa.bld_base.uint_bld.zero, + &ctx->bld_base.uint_bld.zero, 1, 0xf); - txq_emit(NULL, &ctx->soa.bld_base, &txq_emit_data); + txq_emit(NULL, &ctx->bld_base, &txq_emit_data); /* Compute -0.5 / size. */ for (c = 0; c < 2; c++) { @@ -4809,7 +4629,7 @@ static void si_lower_gather4_integer(struct si_shader_context *ctx, LLVMConstInt(ctx->i32, c, 0), ""); half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, ""); half_texel[c] = - lp_build_emit_llvm_unary(&ctx->soa.bld_base, + lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP, half_texel[c]); half_texel[c] = LLVMBuildFMul(builder, half_texel[c], LLVMConstReal(ctx->f32, -0.5), ""); @@ -4831,7 +4651,7 @@ static void si_lower_gather4_integer(struct si_shader_context *ctx, emit_data->output[emit_data->chan] = lp_build_intrinsic(builder, intr_name, emit_data->dst_type, emit_data->args, emit_data->arg_count, - LLVMReadNoneAttribute); + LP_FUNC_ATTR_READNONE); } static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, @@ -4855,7 +4675,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, base->gallivm->builder, "llvm.SI.vs.load.input", emit_data->dst_type, emit_data->args, emit_data->arg_count, - LLVMReadNoneAttribute); + LP_FUNC_ATTR_READNONE); return; } @@ -4932,7 +4752,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, emit_data->output[emit_data->chan] = lp_build_intrinsic( base->gallivm->builder, intr_name, emit_data->dst_type, emit_data->args, emit_data->arg_count, - LLVMReadNoneAttribute); + LP_FUNC_ATTR_READNONE); } static void si_llvm_emit_txqs( @@ -4964,35 +4784,6 @@ static void si_llvm_emit_txqs( emit_data->output[emit_data->chan] = samples; } -/* - * SI implements derivatives using the local data store (LDS) - * All writes to the LDS happen in all executing threads at - * the same time. TID is the Thread ID for the current - * thread and is a value between 0 and 63, representing - * the thread's position in the wavefront. - * - * For the pixel shader threads are grouped into quads of four pixels. - * The TIDs of the pixels of a quad are: - * - * +------+------+ - * |4n + 0|4n + 1| - * +------+------+ - * |4n + 2|4n + 3| - * +------+------+ - * - * So, masking the TID with 0xfffffffc yields the TID of the top left pixel - * of the quad, masking with 0xfffffffd yields the TID of the top pixel of - * the current pixel's column, and masking with 0xfffffffe yields the TID - * of the left pixel of the current pixel's row. - * - * Adding 1 yields the TID of the pixel to the right of the left pixel, and - * adding 2 yields the TID of the pixel below the top pixel. - */ -/* masks for thread ID. */ -#define TID_MASK_TOP_LEFT 0xfffffffc -#define TID_MASK_TOP 0xfffffffd -#define TID_MASK_LEFT 0xfffffffe - static void si_llvm_emit_ddxy( const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, @@ -5001,59 +4792,24 @@ static void si_llvm_emit_ddxy( struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; unsigned opcode = emit_data->info->opcode; - LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, val, args[2]; + LLVMValueRef val; int idx; unsigned mask; - thread_id = get_thread_id(ctx); - if (opcode == TGSI_OPCODE_DDX_FINE) - mask = TID_MASK_LEFT; + mask = AC_TID_MASK_LEFT; else if (opcode == TGSI_OPCODE_DDY_FINE) - mask = TID_MASK_TOP; + mask = AC_TID_MASK_TOP; else - mask = TID_MASK_TOP_LEFT; - - tl_tid = LLVMBuildAnd(gallivm->builder, thread_id, - lp_build_const_int32(gallivm, mask), ""); + mask = AC_TID_MASK_TOP_LEFT; /* for DDX we want to next X pixel, DDY next Y pixel. */ idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2; - trbl_tid = LLVMBuildAdd(gallivm->builder, tl_tid, - lp_build_const_int32(gallivm, idx), ""); val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, ""); - - if (ctx->screen->has_ds_bpermute) { - args[0] = LLVMBuildMul(gallivm->builder, tl_tid, - lp_build_const_int32(gallivm, 4), ""); - args[1] = val; - tl = lp_build_intrinsic(gallivm->builder, - "llvm.amdgcn.ds.bpermute", ctx->i32, - args, 2, LLVMReadNoneAttribute); - - args[0] = LLVMBuildMul(gallivm->builder, trbl_tid, - lp_build_const_int32(gallivm, 4), ""); - trbl = lp_build_intrinsic(gallivm->builder, - "llvm.amdgcn.ds.bpermute", ctx->i32, - args, 2, LLVMReadNoneAttribute); - } else { - LLVMValueRef store_ptr, load_ptr0, load_ptr1; - - store_ptr = build_gep0(ctx, ctx->lds, thread_id); - load_ptr0 = build_gep0(ctx, ctx->lds, tl_tid); - load_ptr1 = build_gep0(ctx, ctx->lds, trbl_tid); - - LLVMBuildStore(gallivm->builder, val, store_ptr); - tl = LLVMBuildLoad(gallivm->builder, load_ptr0, ""); - trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, ""); - } - - tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, ""); - trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, ""); - - emit_data->output[emit_data->chan] = - LLVMBuildFSub(gallivm->builder, trbl, tl, ""); + val = ac_emit_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute, + mask, idx, ctx->lds, val); + emit_data->output[emit_data->chan] = val; } /* @@ -5131,9 +4887,9 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, struct si_shader_context *ctx = si_shader_context(bld_base); struct si_shader *shader = ctx->shader; struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_context *uint = &bld_base->uint_bld; LLVMValueRef interp_param; const struct tgsi_full_instruction *inst = emit_data->inst; - const char *intr_name; int input_index = inst->Src[0].Register.Index; int chan; int i; @@ -5155,7 +4911,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, if (interp_param_idx == -1) return; else if (interp_param_idx) - interp_param = get_interp_param(ctx, interp_param_idx); + interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); else interp_param = NULL; @@ -5194,45 +4950,48 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], ""); - temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, ""); - - ij_out[i] = LLVMBuildBitCast(gallivm->builder, - temp2, ctx->i32, ""); + ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, ""); } interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2); } - intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant"; for (chan = 0; chan < 4; chan++) { - LLVMValueRef args[4]; LLVMValueRef llvm_chan; unsigned schan; schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan); llvm_chan = lp_build_const_int32(gallivm, schan); - args[0] = llvm_chan; - args[1] = attr_number; - args[2] = params; - args[3] = interp_param; - - emit_data->output[chan] = - lp_build_intrinsic(gallivm->builder, intr_name, - ctx->f32, args, args[3] ? 4 : 3, - LLVMReadNoneAttribute); + if (interp_param) { + interp_param = LLVMBuildBitCast(gallivm->builder, + interp_param, LLVMVectorType(ctx->f32, 2), ""); + LLVMValueRef i = LLVMBuildExtractElement( + gallivm->builder, interp_param, uint->zero, ""); + LLVMValueRef j = LLVMBuildExtractElement( + gallivm->builder, interp_param, uint->one, ""); + emit_data->output[chan] = ac_build_fs_interp(&ctx->ac, + llvm_chan, attr_number, params, + i, j); + } else { + emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac, + lp_build_const_int32(gallivm, 2), /* P0 */ + llvm_chan, attr_number, params); + } } } static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data) { - LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates; + struct si_shader_context *ctx = si_shader_context(bld_base); struct tgsi_src_register src0 = emit_data->inst->Src[0].Register; + LLVMValueRef imm; unsigned stream; assert(src0.File == TGSI_FILE_IMMEDIATE); - stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3; + imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX]; + stream = LLVMConstIntGetZExtValue(imm) & 0x3; return stream; } @@ -5247,12 +5006,12 @@ static void si_llvm_emit_vertex( struct si_shader *shader = ctx->shader; struct tgsi_shader_info *info = &shader->selector->info; struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_if_state if_state; LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, SI_PARAM_GS2VS_OFFSET); LLVMValueRef gs_next_vertex; LLVMValueRef can_emit, kill; - LLVMValueRef args[2]; - unsigned chan; + unsigned chan, offset; int i; unsigned stream; @@ -5264,54 +5023,69 @@ static void si_llvm_emit_vertex( ""); /* If this thread has already emitted the declared maximum number of - * vertices, kill it: excessive vertex emissions are not supposed to - * have any effect, and GS threads have no externally observable - * effects other than emitting vertices. + * vertices, skip the write: excessive vertex emissions are not + * supposed to have any effect. + * + * If the shader has no writes to memory, kill it instead. This skips + * further memory loads and may allow LLVM to skip to the end + * altogether. */ - can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex, + can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex, lp_build_const_int32(gallivm, shader->selector->gs_max_out_vertices), ""); - kill = lp_build_select(&bld_base->base, can_emit, - lp_build_const_float(gallivm, 1.0f), - lp_build_const_float(gallivm, -1.0f)); - lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", - ctx->voidt, &kill, 1, 0); + bool use_kill = !info->writes_memory; + if (use_kill) { + kill = lp_build_select(&bld_base->base, can_emit, + lp_build_const_float(gallivm, 1.0f), + lp_build_const_float(gallivm, -1.0f)); + + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", + ctx->voidt, &kill, 1, 0); + } else { + lp_build_if(&if_state, gallivm, can_emit); + } + offset = 0; for (i = 0; i < info->num_outputs; i++) { - LLVMValueRef *out_ptr = - ctx->soa.outputs[i]; + LLVMValueRef *out_ptr = ctx->outputs[i]; for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan)) || + ((info->output_streams[i] >> (2 * chan)) & 3) != stream) + continue; + LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""); LLVMValueRef voffset = - lp_build_const_int32(gallivm, (i * 4 + chan) * + lp_build_const_int32(gallivm, offset * shader->selector->gs_max_out_vertices); + offset++; voffset = lp_build_add(uint, voffset, gs_next_vertex); voffset = lp_build_mul_imm(uint, voffset, 4); out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, ""); - build_tbuffer_store(ctx, - ctx->gsvs_ring[stream], - out_val, 1, - voffset, soffset, 0, - V_008F0C_BUF_DATA_FORMAT_32, - V_008F0C_BUF_NUM_FORMAT_UINT, - 1, 0, 1, 1, 0); + ac_build_tbuffer_store(&ctx->ac, + ctx->gsvs_ring[stream], + out_val, 1, + voffset, soffset, 0, + V_008F0C_BUF_DATA_FORMAT_32, + V_008F0C_BUF_NUM_FORMAT_UINT, + 1, 0, 1, 1, 0); } } + gs_next_vertex = lp_build_add(uint, gs_next_vertex, lp_build_const_int32(gallivm, 1)); LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]); /* Signal vertex emission */ - args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8)); - args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID); - lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", - ctx->voidt, args, 2, 0); + ac_emit_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), + LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID)); + if (!use_kill) + lp_build_endif(&if_state); } /* Cut one primitive from the geometry shader */ @@ -5321,16 +5095,12 @@ static void si_llvm_emit_primitive( struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMValueRef args[2]; unsigned stream; /* Signal primitive cut */ stream = si_llvm_get_stream(bld_base, emit_data); - args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8)); - args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID); - lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", - ctx->voidt, args, 2, 0); + ac_emit_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), + LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID)); } static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, @@ -5340,11 +5110,14 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - /* The real barrier instruction isn’t needed, because an entire patch + /* SI only (thanks to a hw bug workaround): + * The real barrier instruction isn’t needed, because an entire patch * always fits into a single wave. */ - if (ctx->type == PIPE_SHADER_TESS_CTRL) { - emit_optimization_barrier(ctx); + if (HAVE_LLVM >= 0x0309 && + ctx->screen->b.chip_class == SI && + ctx->type == PIPE_SHADER_TESS_CTRL) { + emit_waitcnt(ctx, LGKM_CNT & VM_CNT); return; } @@ -5365,13 +5138,14 @@ static const struct lp_build_tgsi_action interp_action = { }; static void si_create_function(struct si_shader_context *ctx, + const char *name, LLVMTypeRef *returns, unsigned num_returns, LLVMTypeRef *params, unsigned num_params, int last_sgpr) { int i; - si_llvm_create_func(ctx, returns, num_returns, + si_llvm_create_func(ctx, name, returns, num_returns, params, num_params); si_llvm_shader_type(ctx->main_fn, ctx->type); ctx->return_value = LLVMGetUndef(ctx->return_type); @@ -5387,10 +5161,10 @@ static void si_create_function(struct si_shader_context *ctx, * SGPR spilling significantly. */ if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { - LLVMAddAttribute(P, LLVMByValAttribute); + lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL); lp_add_attr_dereferenceable(P, UINT64_MAX); } else - LLVMAddAttribute(P, LLVMInRegAttribute); + lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG); } if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) { @@ -5410,20 +5184,6 @@ static void si_create_function(struct si_shader_context *ctx, } } -static void create_meta_data(struct si_shader_context *ctx) -{ - struct gallivm_state *gallivm = ctx->soa.bld_base.base.gallivm; - - ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context, - "invariant.load", 14); - ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context, - "range", 5); - ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context, - "amdgpu.uniform", 14); - - ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0); -} - static void declare_streamout_params(struct si_shader_context *ctx, struct pipe_stream_output_info *so, LLVMTypeRef *params, LLVMTypeRef i32, @@ -5436,7 +5196,7 @@ static void declare_streamout_params(struct si_shader_context *ctx, if (ctx->type != PIPE_SHADER_TESS_EVAL) params[ctx->param_streamout_config = (*num_params)++] = i32; else - ctx->param_streamout_config = ctx->param_tess_offchip; + ctx->param_streamout_config = *num_params - 1; params[ctx->param_streamout_write_index = (*num_params)++] = i32; } @@ -5463,6 +5223,9 @@ static unsigned llvm_get_type_size(LLVMTypeRef type) case LLVMVectorTypeKind: return LLVMGetVectorSize(type) * llvm_get_type_size(LLVMGetElementType(type)); + case LLVMArrayTypeKind: + return LLVMGetArrayLength(type) * + llvm_get_type_size(LLVMGetElementType(type)); default: assert(0); return 0; @@ -5472,7 +5235,7 @@ static unsigned llvm_get_type_size(LLVMTypeRef type) static void declare_tess_lds(struct si_shader_context *ctx) { struct gallivm_state *gallivm = &ctx->gallivm; - struct lp_build_tgsi_context *bld_base = &ctx->soa.bld_base; + struct lp_build_tgsi_context *bld_base = &ctx->bld_base; struct lp_build_context *uint = &bld_base->uint_bld; unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768; @@ -5481,15 +5244,33 @@ static void declare_tess_lds(struct si_shader_context *ctx) "tess_lds"); } +static unsigned si_get_max_workgroup_size(struct si_shader *shader) +{ + const unsigned *properties = shader->selector->info.properties; + unsigned max_work_group_size = + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; + + if (!max_work_group_size) { + /* This is a variable group size compute shader, + * compile it for the maximum possible group size. + */ + max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + } + return max_work_group_size; +} + static void create_function(struct si_shader_context *ctx) { - struct lp_build_tgsi_context *bld_base = &ctx->soa.bld_base; + struct lp_build_tgsi_context *bld_base = &ctx->bld_base; struct gallivm_state *gallivm = bld_base->base.gallivm; struct si_shader *shader = ctx->shader; LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32; LLVMTypeRef returns[16+32*4]; unsigned i, last_sgpr, num_params, num_return_sgprs; unsigned num_returns = 0; + unsigned num_prolog_vgprs = 0; v3i32 = LLVMVectorType(ctx->i32, 3); @@ -5507,13 +5288,13 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_DRAWID] = ctx->i32; num_params = SI_PARAM_DRAWID+1; - if (shader->key.vs.as_es) { + if (shader->key.as_es) { params[ctx->param_es2gs_offset = num_params++] = ctx->i32; - } else if (shader->key.vs.as_ls) { + } else if (shader->key.as_ls) { params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32; num_params = SI_PARAM_LS_OUT_LAYOUT+1; } else { - if (ctx->is_gs_copy_shader) { + if (shader->is_gs_copy_shader) { num_params = SI_PARAM_RW_BUFFERS+1; } else { params[SI_PARAM_VS_STATE_BITS] = ctx->i32; @@ -5533,19 +5314,17 @@ static void create_function(struct si_shader_context *ctx) params[ctx->param_vs_prim_id = num_params++] = ctx->i32; params[ctx->param_instance_id = num_params++] = ctx->i32; - if (!ctx->no_prolog && - !ctx->is_gs_copy_shader) { + if (!shader->is_gs_copy_shader) { /* Vertex load indices. */ ctx->param_vertex_index0 = num_params; for (i = 0; i < shader->selector->info.num_inputs; i++) params[num_params++] = ctx->i32; - } - if (!ctx->no_epilog && - !ctx->is_gs_copy_shader) { + num_prolog_vgprs += shader->selector->info.num_inputs; + /* PrimitiveID output. */ - if (!shader->key.vs.as_es && !shader->key.vs.as_ls) + if (!shader->key.as_es && !shader->key.as_ls) for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) returns[num_returns++] = ctx->f32; } @@ -5565,28 +5344,26 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_REL_IDS] = ctx->i32; num_params = SI_PARAM_REL_IDS+1; - if (!ctx->no_epilog) { - /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are - * placed after the user SGPRs. - */ - for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++) - returns[num_returns++] = ctx->i32; /* SGPRs */ + /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are + * placed after the user SGPRs. + */ + for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++) + returns[num_returns++] = ctx->i32; /* SGPRs */ - for (i = 0; i < 3; i++) - returns[num_returns++] = ctx->f32; /* VGPRs */ - } + for (i = 0; i < 3; i++) + returns[num_returns++] = ctx->f32; /* VGPRs */ break; case PIPE_SHADER_TESS_EVAL: params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32; num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1; - if (shader->key.tes.as_es) { + if (shader->key.as_es) { params[ctx->param_oc_lds = num_params++] = ctx->i32; - params[ctx->param_tess_offchip = num_params++] = ctx->i32; + params[num_params++] = ctx->i32; params[ctx->param_es2gs_offset = num_params++] = ctx->i32; } else { - params[ctx->param_tess_offchip = num_params++] = ctx->i32; + params[num_params++] = ctx->i32; declare_streamout_params(ctx, &shader->selector->so, params, ctx->i32, &num_params); params[ctx->param_oc_lds = num_params++] = ctx->i32; @@ -5600,7 +5377,7 @@ static void create_function(struct si_shader_context *ctx) params[ctx->param_tes_patch_id = num_params++] = ctx->i32; /* PrimitiveID output. */ - if (!ctx->no_epilog && !shader->key.tes.as_es) + if (!shader->key.as_es) for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) returns[num_returns++] = ctx->f32; break; @@ -5639,43 +5416,42 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_POS_Z_FLOAT] = ctx->f32; params[SI_PARAM_POS_W_FLOAT] = ctx->f32; params[SI_PARAM_FRONT_FACE] = ctx->i32; + shader->info.face_vgpr_index = 20; params[SI_PARAM_ANCILLARY] = ctx->i32; params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32; params[SI_PARAM_POS_FIXED_PT] = ctx->i32; num_params = SI_PARAM_POS_FIXED_PT+1; - if (!ctx->no_prolog) { - /* Color inputs from the prolog. */ - if (shader->selector->info.colors_read) { - unsigned num_color_elements = - util_bitcount(shader->selector->info.colors_read); + /* Color inputs from the prolog. */ + if (shader->selector->info.colors_read) { + unsigned num_color_elements = + util_bitcount(shader->selector->info.colors_read); - assert(num_params + num_color_elements <= ARRAY_SIZE(params)); - for (i = 0; i < num_color_elements; i++) - params[num_params++] = ctx->f32; - } - } + assert(num_params + num_color_elements <= ARRAY_SIZE(params)); + for (i = 0; i < num_color_elements; i++) + params[num_params++] = ctx->f32; - if (!ctx->no_epilog) { - /* Outputs for the epilog. */ - num_return_sgprs = SI_SGPR_ALPHA_REF + 1; - num_returns = - num_return_sgprs + - util_bitcount(shader->selector->info.colors_written) * 4 + - shader->selector->info.writes_z + - shader->selector->info.writes_stencil + - shader->selector->info.writes_samplemask + - 1 /* SampleMaskIn */; - - num_returns = MAX2(num_returns, - num_return_sgprs + - PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); - - for (i = 0; i < num_return_sgprs; i++) - returns[i] = ctx->i32; - for (; i < num_returns; i++) - returns[i] = ctx->f32; + num_prolog_vgprs += num_color_elements; } + + /* Outputs for the epilog. */ + num_return_sgprs = SI_SGPR_ALPHA_REF + 1; + num_returns = + num_return_sgprs + + util_bitcount(shader->selector->info.colors_written) * 4 + + shader->selector->info.writes_z + + shader->selector->info.writes_stencil + + shader->selector->info.writes_samplemask + + 1 /* SampleMaskIn */; + + num_returns = MAX2(num_returns, + num_return_sgprs + + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + for (i = 0; i < num_return_sgprs; i++) + returns[i] = ctx->i32; + for (; i < num_returns; i++) + returns[i] = ctx->f32; break; case PIPE_SHADER_COMPUTE: @@ -5694,7 +5470,7 @@ static void create_function(struct si_shader_context *ctx) assert(num_params <= ARRAY_SIZE(params)); - si_create_function(ctx, returns, num_returns, params, + si_create_function(ctx, "main", returns, num_returns, params, num_params, last_sgpr); /* Reserve register locations for VGPR inputs the PS prolog may need. */ @@ -5711,22 +5487,9 @@ static void create_function(struct si_shader_context *ctx) S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1)); } else if (ctx->type == PIPE_SHADER_COMPUTE) { - const unsigned *properties = shader->selector->info.properties; - unsigned max_work_group_size = - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; - - if (!max_work_group_size) { - /* This is a variable group size compute shader, - * compile it for the maximum possible group size. - */ - max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; - } - si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size", - max_work_group_size); + si_get_max_workgroup_size(shader)); } shader->info.num_input_sgprs = 0; @@ -5735,12 +5498,11 @@ static void create_function(struct si_shader_context *ctx) for (i = 0; i <= last_sgpr; ++i) shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4; - /* Unused fragment shader inputs are eliminated by the compiler, - * so we don't know yet how many there will be. - */ - if (ctx->type != PIPE_SHADER_FRAGMENT) - for (; i < num_params; ++i) - shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4; + for (; i < num_params; ++i) + shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4; + + assert(shader->info.num_input_vgprs >= num_prolog_vgprs); + shader->info.num_input_vgprs -= num_prolog_vgprs; if (!ctx->screen->has_ds_bpermute && bld_base->info && @@ -5756,9 +5518,8 @@ static void create_function(struct si_shader_context *ctx) "ddxy_lds", LOCAL_ADDR_SPACE); - if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) || - ctx->type == PIPE_SHADER_TESS_CTRL || - ctx->type == PIPE_SHADER_TESS_EVAL) + if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.as_ls) || + ctx->type == PIPE_SHADER_TESS_CTRL) declare_tess_lds(ctx); } @@ -5768,16 +5529,16 @@ static void create_function(struct si_shader_context *ctx) */ static void preload_ring_buffers(struct si_shader_context *ctx) { - struct gallivm_state *gallivm = - ctx->soa.bld_base.base.gallivm; + struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; + LLVMBuilderRef builder = gallivm->builder; LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); if ((ctx->type == PIPE_SHADER_VERTEX && - ctx->shader->key.vs.as_es) || + ctx->shader->key.as_es) || (ctx->type == PIPE_SHADER_TESS_EVAL && - ctx->shader->key.tes.as_es) || + ctx->shader->key.as_es) || ctx->type == PIPE_SHADER_GEOMETRY) { unsigned ring = ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS @@ -5785,22 +5546,84 @@ static void preload_ring_buffers(struct si_shader_context *ctx) LLVMValueRef offset = lp_build_const_int32(gallivm, ring); ctx->esgs_ring = - build_indexed_load_const(ctx, buf_ptr, offset); + ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset); } - if (ctx->is_gs_copy_shader) { - LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS); + if (ctx->shader->is_gs_copy_shader) { + LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS); ctx->gsvs_ring[0] = - build_indexed_load_const(ctx, buf_ptr, offset); - } - if (ctx->type == PIPE_SHADER_GEOMETRY) { - int i; - for (i = 0; i < 4; i++) { - LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i); + ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset); + } else if (ctx->type == PIPE_SHADER_GEOMETRY) { + const struct si_shader_selector *sel = ctx->shader->selector; + struct lp_build_context *uint = &ctx->bld_base.uint_bld; + LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS); + LLVMValueRef base_ring; + + base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset); + + /* The conceptual layout of the GSVS ring is + * v0c0 .. vLv0 v0c1 .. vLc1 .. + * but the real memory layout is swizzled across + * threads: + * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL + * t16v0c0 .. + * Override the buffer descriptor accordingly. + */ + LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2); + uint64_t stream_offset = 0; + + for (unsigned stream = 0; stream < 4; ++stream) { + unsigned num_components; + unsigned stride; + unsigned num_records; + LLVMValueRef ring, tmp; + + num_components = sel->info.num_stream_output_components[stream]; + if (!num_components) + continue; - ctx->gsvs_ring[i] = - build_indexed_load_const(ctx, buf_ptr, offset); + stride = 4 * num_components * sel->gs_max_out_vertices; + + /* Limit on the stride field for <= CIK. */ + assert(stride < (1 << 14)); + + num_records = 64; + + ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); + tmp = LLVMBuildExtractElement(builder, ring, uint->zero, ""); + tmp = LLVMBuildAdd(builder, tmp, + LLVMConstInt(ctx->i64, + stream_offset, 0), ""); + stream_offset += stride * 64; + + ring = LLVMBuildInsertElement(builder, ring, tmp, uint->zero, ""); + ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); + tmp = LLVMBuildExtractElement(builder, ring, uint->one, ""); + tmp = LLVMBuildOr(builder, tmp, + LLVMConstInt(ctx->i32, + S_008F04_STRIDE(stride) | + S_008F04_SWIZZLE_ENABLE(1), 0), ""); + ring = LLVMBuildInsertElement(builder, ring, tmp, uint->one, ""); + ring = LLVMBuildInsertElement(builder, ring, + LLVMConstInt(ctx->i32, num_records, 0), + LLVMConstInt(ctx->i32, 2, 0), ""); + ring = LLVMBuildInsertElement(builder, ring, + LLVMConstInt(ctx->i32, + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */ + S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ + S_008F0C_ADD_TID_ENABLE(1), + 0), + LLVMConstInt(ctx->i32, 3, 0), ""); + ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, ""); + + ctx->gsvs_ring[stream] = ring; } } } @@ -5809,8 +5632,7 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, LLVMValueRef param_rw_buffers, unsigned param_pos_fixed_pt) { - struct lp_build_tgsi_context *bld_base = - &ctx->soa.bld_base; + struct lp_build_tgsi_context *bld_base = &ctx->bld_base; struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMBuilderRef builder = gallivm->builder; LLVMValueRef slot, desc, offset, row, bit, address[2]; @@ -5824,7 +5646,7 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, /* Load the buffer descriptor. */ slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE); - desc = build_indexed_load_const(ctx, param_rw_buffers, slot); + desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot); /* The stipple pattern is 32x32, each row has 32 bits. */ offset = LLVMBuildMul(builder, address[1], @@ -5984,7 +5806,8 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) r600_resource_reference(&shader->bo, NULL); shader->bo = (struct r600_resource*) pipe_buffer_create(&sscreen->b.b, 0, - PIPE_USAGE_IMMUTABLE, bo_size); + PIPE_USAGE_IMMUTABLE, + align(bo_size, SI_CPDMA_ALIGNMENT)); if (!shader->bo) return -ENOMEM; @@ -6058,19 +5881,22 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary } static void si_shader_dump_stats(struct si_screen *sscreen, - struct si_shader_config *conf, - unsigned num_inputs, - unsigned code_size, + struct si_shader *shader, struct pipe_debug_callback *debug, unsigned processor, - FILE *file) + FILE *file, + bool check_debug_option) { + struct si_shader_config *conf = &shader->config; + unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0; + unsigned code_size = si_get_shader_binary_size(shader); unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256; unsigned lds_per_wave = 0; unsigned max_simd_waves = 10; /* Compute LDS usage for PS. */ - if (processor == PIPE_SHADER_FRAGMENT) { + switch (processor) { + case PIPE_SHADER_FRAGMENT: /* The minimum usage per wave is (num_inputs * 48). The maximum * usage is (num_inputs * 48 * 16). * We can get anything in between and it varies between waves. @@ -6083,6 +5909,15 @@ static void si_shader_dump_stats(struct si_screen *sscreen, */ lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment); + break; + case PIPE_SHADER_COMPUTE: + if (shader->selector) { + unsigned max_workgroup_size = + si_get_max_workgroup_size(shader); + lds_per_wave = (conf->lds_size * lds_increment) / + DIV_ROUND_UP(max_workgroup_size, 64); + } + break; } /* Compute the per-SIMD wave counts. */ @@ -6096,13 +5931,12 @@ static void si_shader_dump_stats(struct si_screen *sscreen, if (conf->num_vgprs) max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs); - /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD - * that PS can use. - */ + /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above + * 16KB makes some SIMDs unoccupied). */ if (lds_per_wave) max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); - if (file != stderr || + if (!check_debug_option || r600_can_dump_shader(&sscreen->b, processor)) { if (processor == PIPE_SHADER_FRAGMENT) { fprintf(file, "*** SHADER CONFIG ***\n" @@ -6116,13 +5950,15 @@ static void si_shader_dump_stats(struct si_screen *sscreen, "VGPRS: %d\n" "Spilled SGPRs: %d\n" "Spilled VGPRs: %d\n" + "Private memory VGPRs: %d\n" "Code Size: %d bytes\n" "LDS: %d blocks\n" "Scratch: %d bytes per wave\n" "Max Waves: %d\n" "********************\n\n\n", conf->num_sgprs, conf->num_vgprs, - conf->spilled_sgprs, conf->spilled_vgprs, code_size, + conf->spilled_sgprs, conf->spilled_vgprs, + conf->private_mem_vgprs, code_size, conf->lds_size, conf->scratch_bytes_per_wave, max_simd_waves); } @@ -6130,33 +5966,32 @@ static void si_shader_dump_stats(struct si_screen *sscreen, pipe_debug_message(debug, SHADER_INFO, "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " - "Spilled VGPRs: %d", + "Spilled VGPRs: %d PrivMem VGPRs: %d", conf->num_sgprs, conf->num_vgprs, code_size, conf->lds_size, conf->scratch_bytes_per_wave, max_simd_waves, conf->spilled_sgprs, - conf->spilled_vgprs); + conf->spilled_vgprs, conf->private_mem_vgprs); } -static const char *si_get_shader_name(struct si_shader *shader, - unsigned processor) +const char *si_get_shader_name(struct si_shader *shader, unsigned processor) { switch (processor) { case PIPE_SHADER_VERTEX: - if (shader->key.vs.as_es) + if (shader->key.as_es) return "Vertex Shader as ES"; - else if (shader->key.vs.as_ls) + else if (shader->key.as_ls) return "Vertex Shader as LS"; else return "Vertex Shader as VS"; case PIPE_SHADER_TESS_CTRL: return "Tessellation Control Shader"; case PIPE_SHADER_TESS_EVAL: - if (shader->key.tes.as_es) + if (shader->key.as_es) return "Tessellation Evaluation Shader as ES"; else return "Tessellation Evaluation Shader as VS"; case PIPE_SHADER_GEOMETRY: - if (shader->gs_copy_shader == NULL) + if (shader->is_gs_copy_shader) return "GS Copy Shader as VS"; else return "Geometry Shader"; @@ -6171,19 +6006,19 @@ static const char *si_get_shader_name(struct si_shader *shader, void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, struct pipe_debug_callback *debug, unsigned processor, - FILE *file) + FILE *file, bool check_debug_option) { - if (file != stderr || + if (!check_debug_option || r600_can_dump_shader(&sscreen->b, processor)) si_dump_shader_key(processor, &shader->key, file); - if (file != stderr && shader->binary.llvm_ir_string) { + if (!check_debug_option && shader->binary.llvm_ir_string) { fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader, processor)); fprintf(file, "%s\n", shader->binary.llvm_ir_string); } - if (file != stderr || + if (!check_debug_option || (r600_can_dump_shader(&sscreen->b, processor) && !(sscreen->b.debug_flags & DBG_NO_ASM))) { fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor)); @@ -6200,10 +6035,8 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, fprintf(file, "\n"); } - si_shader_dump_stats(sscreen, &shader->config, - shader->selector ? shader->selector->info.num_inputs : 0, - si_get_shader_binary_size(shader), debug, processor, - file); + si_shader_dump_stats(sscreen, shader, debug, processor, file, + check_debug_option); } int si_compile_llvm(struct si_screen *sscreen, @@ -6223,7 +6056,7 @@ int si_compile_llvm(struct si_screen *sscreen, if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) { fprintf(stderr, "%s LLVM IR:\n\n", name); - LLVMDumpModule(mod); + ac_dump_module(mod); fprintf(stderr, "\n"); } } @@ -6285,33 +6118,50 @@ static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) } /* Generate code for the hardware VS shader stage to go with a geometry shader */ -static int si_generate_gs_copy_shader(struct si_screen *sscreen, - struct si_shader_context *ctx, - struct si_shader *gs, - struct pipe_debug_callback *debug) +struct si_shader * +si_generate_gs_copy_shader(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader_selector *gs_selector, + struct pipe_debug_callback *debug) { - struct gallivm_state *gallivm = &ctx->gallivm; - struct lp_build_tgsi_context *bld_base = &ctx->soa.bld_base; + struct si_shader_context ctx; + struct si_shader *shader; + struct gallivm_state *gallivm = &ctx.gallivm; + LLVMBuilderRef builder; + struct lp_build_tgsi_context *bld_base = &ctx.bld_base; struct lp_build_context *uint = &bld_base->uint_bld; struct si_shader_output_values *outputs; - struct tgsi_shader_info *gsinfo = &gs->selector->info; + struct tgsi_shader_info *gsinfo = &gs_selector->info; LLVMValueRef args[9]; int i, r; outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0])); - si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm); - ctx->type = PIPE_SHADER_VERTEX; - ctx->is_gs_copy_shader = true; + if (!outputs) + return NULL; - create_meta_data(ctx); - create_function(ctx); - preload_ring_buffers(ctx); + shader = CALLOC_STRUCT(si_shader); + if (!shader) { + FREE(outputs); + return NULL; + } + + + shader->selector = gs_selector; + shader->is_gs_copy_shader = true; - args[0] = ctx->gsvs_ring[0]; + si_init_shader_ctx(&ctx, sscreen, shader, tm); + ctx.type = PIPE_SHADER_VERTEX; + + builder = gallivm->builder; + + create_function(&ctx); + preload_ring_buffers(&ctx); + + args[0] = ctx.gsvs_ring[0]; args[1] = lp_build_mul_imm(uint, - LLVMGetParam(ctx->main_fn, - ctx->param_vertex_id), + LLVMGetParam(ctx.main_fn, + ctx.param_vertex_id), 4); args[3] = uint->zero; args[4] = uint->one; /* OFFEN */ @@ -6320,60 +6170,120 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, args[7] = uint->one; /* SLC */ args[8] = uint->zero; /* TFE */ - /* Fetch vertex data from GSVS ring */ - for (i = 0; i < gsinfo->num_outputs; ++i) { - unsigned chan; + /* Fetch the vertex stream ID.*/ + LLVMValueRef stream_id; - outputs[i].name = gsinfo->output_semantic_name[i]; - outputs[i].sid = gsinfo->output_semantic_index[i]; + if (gs_selector->so.num_outputs) + stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2); + else + stream_id = uint->zero; - for (chan = 0; chan < 4; chan++) { - args[2] = lp_build_const_int32(gallivm, - (i * 4 + chan) * - gs->selector->gs_max_out_vertices * 16 * 4); + /* Fill in output information. */ + for (i = 0; i < gsinfo->num_outputs; ++i) { + outputs[i].semantic_name = gsinfo->output_semantic_name[i]; + outputs[i].semantic_index = gsinfo->output_semantic_index[i]; - outputs[i].values[chan] = - LLVMBuildBitCast(gallivm->builder, - lp_build_intrinsic(gallivm->builder, - "llvm.SI.buffer.load.dword.i32.i32", - ctx->i32, args, 9, - LLVMReadOnlyAttribute), - ctx->f32, ""); + for (int chan = 0; chan < 4; chan++) { + outputs[i].vertex_stream[chan] = + (gsinfo->output_streams[i] >> (2 * chan)) & 3; } } - si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs); + LLVMBasicBlockRef end_bb; + LLVMValueRef switch_inst; - LLVMBuildRetVoid(gallivm->builder); + end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end"); + switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); - /* Dump LLVM IR before any optimization passes */ - if (sscreen->b.debug_flags & DBG_PREOPT_IR && - r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY)) - LLVMDumpModule(bld_base->base.gallivm->module); + for (int stream = 0; stream < 4; stream++) { + LLVMBasicBlockRef bb; + unsigned offset; - si_llvm_finalize_module(ctx, - r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY)); + if (!gsinfo->num_stream_output_components[stream]) + continue; - r = si_compile_llvm(sscreen, &ctx->shader->binary, - &ctx->shader->config, ctx->tm, - bld_base->base.gallivm->module, - debug, PIPE_SHADER_GEOMETRY, - "GS Copy Shader"); - if (!r) { - if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY)) + if (stream > 0 && !gs_selector->so.num_outputs) + continue; + + bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out"); + LLVMAddCase(switch_inst, lp_build_const_int32(gallivm, stream), bb); + LLVMPositionBuilderAtEnd(builder, bb); + + /* Fetch vertex data from GSVS ring */ + offset = 0; + for (i = 0; i < gsinfo->num_outputs; ++i) { + for (unsigned chan = 0; chan < 4; chan++) { + if (!(gsinfo->output_usagemask[i] & (1 << chan)) || + outputs[i].vertex_stream[chan] != stream) { + outputs[i].values[chan] = ctx.bld_base.base.undef; + continue; + } + + args[2] = lp_build_const_int32( + gallivm, + offset * gs_selector->gs_max_out_vertices * 16 * 4); + offset++; + + outputs[i].values[chan] = + LLVMBuildBitCast(gallivm->builder, + lp_build_intrinsic(gallivm->builder, + "llvm.SI.buffer.load.dword.i32.i32", + ctx.i32, args, 9, + LP_FUNC_ATTR_READONLY), + ctx.f32, ""); + } + } + + /* Streamout and exports. */ + if (gs_selector->so.num_outputs) { + si_llvm_emit_streamout(&ctx, outputs, + gsinfo->num_outputs, + stream); + } + + if (stream == 0) + si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs); + + LLVMBuildBr(builder, end_bb); + } + + LLVMPositionBuilderAtEnd(builder, end_bb); + + LLVMBuildRetVoid(gallivm->builder); + + /* Dump LLVM IR before any optimization passes */ + if (sscreen->b.debug_flags & DBG_PREOPT_IR && + r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY)) + ac_dump_module(bld_base->base.gallivm->module); + + si_llvm_finalize_module(&ctx, + r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY)); + + r = si_compile_llvm(sscreen, &ctx.shader->binary, + &ctx.shader->config, ctx.tm, + bld_base->base.gallivm->module, + debug, PIPE_SHADER_GEOMETRY, + "GS Copy Shader"); + if (!r) { + if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY)) fprintf(stderr, "GS Copy Shader:\n"); - si_shader_dump(sscreen, ctx->shader, debug, - PIPE_SHADER_GEOMETRY, stderr); - r = si_shader_binary_upload(sscreen, ctx->shader); + si_shader_dump(sscreen, ctx.shader, debug, + PIPE_SHADER_GEOMETRY, stderr, true); + r = si_shader_binary_upload(sscreen, ctx.shader); } - si_llvm_dispose(ctx); + si_llvm_dispose(&ctx); FREE(outputs); - return r; + + if (r != 0) { + FREE(shader); + shader = NULL; + } + return shader; } -static void si_dump_shader_key(unsigned shader, union si_shader_key *key, +static void si_dump_shader_key(unsigned shader, struct si_shader_key *key, FILE *f) { int i; @@ -6382,51 +6292,69 @@ static void si_dump_shader_key(unsigned shader, union si_shader_key *key, switch (shader) { case PIPE_SHADER_VERTEX: - fprintf(f, " instance_divisors = {"); - for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++) + fprintf(f, " part.vs.prolog.instance_divisors = {"); + for (i = 0; i < ARRAY_SIZE(key->part.vs.prolog.instance_divisors); i++) fprintf(f, !i ? "%u" : ", %u", - key->vs.prolog.instance_divisors[i]); + key->part.vs.prolog.instance_divisors[i]); + fprintf(f, "}\n"); + fprintf(f, " part.vs.epilog.export_prim_id = %u\n", key->part.vs.epilog.export_prim_id); + fprintf(f, " as_es = %u\n", key->as_es); + fprintf(f, " as_ls = %u\n", key->as_ls); + + fprintf(f, " mono.vs.fix_fetch = {"); + for (i = 0; i < SI_MAX_ATTRIBS; i++) + fprintf(f, !i ? "%u" : ", %u", key->mono.vs.fix_fetch[i]); fprintf(f, "}\n"); - fprintf(f, " as_es = %u\n", key->vs.as_es); - fprintf(f, " as_ls = %u\n", key->vs.as_ls); - fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id); break; case PIPE_SHADER_TESS_CTRL: - fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode); + fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); + fprintf(f, " mono.tcs.inputs_to_copy = 0x%"PRIx64"\n", key->mono.tcs.inputs_to_copy); break; case PIPE_SHADER_TESS_EVAL: - fprintf(f, " as_es = %u\n", key->tes.as_es); - fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id); + fprintf(f, " part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id); + fprintf(f, " as_es = %u\n", key->as_es); break; case PIPE_SHADER_GEOMETRY: + fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix); + break; + case PIPE_SHADER_COMPUTE: break; case PIPE_SHADER_FRAGMENT: - fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side); - fprintf(f, " prolog.flatshade_colors = %u\n", key->ps.prolog.flatshade_colors); - fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple); - fprintf(f, " prolog.force_persp_sample_interp = %u\n", key->ps.prolog.force_persp_sample_interp); - fprintf(f, " prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp); - fprintf(f, " prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp); - fprintf(f, " prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp); - fprintf(f, " prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp); - fprintf(f, " prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear); - fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format); - fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8); - fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf); - fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func); - fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one); - fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing); - fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color); + fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); + fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); + fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); + fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp); + fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp); + fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp); + fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp); + fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp); + fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear); + fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format); + fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); + fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); + fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); + fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one); + fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing); + fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); break; default: assert(0); } + + if ((shader == PIPE_SHADER_GEOMETRY || + shader == PIPE_SHADER_TESS_EVAL || + shader == PIPE_SHADER_VERTEX) && + !key->as_es && !key->as_ls) { + fprintf(f, " opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs); + fprintf(f, " opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2); + fprintf(f, " opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable); + } } static void si_init_shader_ctx(struct si_shader_context *ctx, @@ -6437,34 +6365,11 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, struct lp_build_tgsi_context *bld_base; struct lp_build_tgsi_action tmpl = {}; - memset(ctx, 0, sizeof(*ctx)); - si_llvm_context_init( - ctx, "amdgcn--", + si_llvm_context_init(ctx, sscreen, shader, tm, (shader && shader->selector) ? &shader->selector->info : NULL, (shader && shader->selector) ? shader->selector->tokens : NULL); - si_shader_context_init_alu(&ctx->soa.bld_base); - ctx->tm = tm; - ctx->screen = sscreen; - if (shader && shader->selector) - ctx->type = shader->selector->info.processor; - else - ctx->type = -1; - ctx->shader = shader; - - ctx->voidt = LLVMVoidTypeInContext(ctx->gallivm.context); - ctx->i1 = LLVMInt1TypeInContext(ctx->gallivm.context); - ctx->i8 = LLVMInt8TypeInContext(ctx->gallivm.context); - ctx->i32 = LLVMInt32TypeInContext(ctx->gallivm.context); - ctx->i64 = LLVMInt64TypeInContext(ctx->gallivm.context); - ctx->i128 = LLVMIntTypeInContext(ctx->gallivm.context, 128); - ctx->f32 = LLVMFloatTypeInContext(ctx->gallivm.context); - ctx->v16i8 = LLVMVectorType(ctx->i8, 16); - ctx->v2i32 = LLVMVectorType(ctx->i32, 2); - ctx->v4i32 = LLVMVectorType(ctx->i32, 4); - ctx->v4f32 = LLVMVectorType(ctx->f32, 4); - ctx->v8i32 = LLVMVectorType(ctx->i32, 8); - - bld_base = &ctx->soa.bld_base; + + bld_base = &ctx->bld_base; bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action; @@ -6596,9 +6501,10 @@ static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx) exports.num = 0; - if ((ctx->type == PIPE_SHADER_VERTEX && - (shader->key.vs.as_es || shader->key.vs.as_ls)) || - (ctx->type == PIPE_SHADER_TESS_EVAL && shader->key.tes.as_es)) + if (ctx->type == PIPE_SHADER_FRAGMENT || + ctx->type == PIPE_SHADER_COMPUTE || + shader->key.as_es || + shader->key.as_ls) return; /* Process all LLVM instructions. */ @@ -6679,18 +6585,44 @@ static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx) } } +static void si_count_scratch_private_memory(struct si_shader_context *ctx) +{ + ctx->shader->config.private_mem_vgprs = 0; + + /* Process all LLVM instructions. */ + LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn); + while (bb) { + LLVMValueRef next = LLVMGetFirstInstruction(bb); + + while (next) { + LLVMValueRef inst = next; + next = LLVMGetNextInstruction(next); + + if (LLVMGetInstructionOpcode(inst) != LLVMAlloca) + continue; + + LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); + /* No idea why LLVM aligns allocas to 4 elements. */ + unsigned alignment = LLVMGetAlignment(inst); + unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment); + ctx->shader->config.private_mem_vgprs += dw_size; + } + bb = LLVMGetNextBasicBlock(bb); + } +} + static bool si_compile_tgsi_main(struct si_shader_context *ctx, struct si_shader *shader) { struct si_shader_selector *sel = shader->selector; - struct lp_build_tgsi_context *bld_base = &ctx->soa.bld_base; + struct lp_build_tgsi_context *bld_base = &ctx->bld_base; switch (ctx->type) { case PIPE_SHADER_VERTEX: ctx->load_input = declare_input_vs; - if (shader->key.vs.as_ls) + if (shader->key.as_ls) bld_base->emit_epilogue = si_llvm_emit_ls_epilogue; - else if (shader->key.vs.as_es) + else if (shader->key.as_es) bld_base->emit_epilogue = si_llvm_emit_es_epilogue; else bld_base->emit_epilogue = si_llvm_emit_vs_epilogue; @@ -6703,7 +6635,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx, break; case PIPE_SHADER_TESS_EVAL: bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes; - if (shader->key.tes.as_es) + if (shader->key.as_es) bld_base->emit_epilogue = si_llvm_emit_es_epilogue; else bld_base->emit_epilogue = si_llvm_emit_vs_epilogue; @@ -6714,10 +6646,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx, break; case PIPE_SHADER_FRAGMENT: ctx->load_input = declare_input_fs; - if (ctx->no_epilog) - bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; - else - bld_base->emit_epilogue = si_llvm_return_fs_outputs; + bld_base->emit_epilogue = si_llvm_return_fs_outputs; break; case PIPE_SHADER_COMPUTE: ctx->declare_memory_region = declare_compute_memory; @@ -6727,18 +6656,9 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx, return false; } - create_meta_data(ctx); create_function(ctx); preload_ring_buffers(ctx); - if (ctx->no_prolog && sel->type == PIPE_SHADER_FRAGMENT && - shader->key.ps.prolog.poly_stipple) { - LLVMValueRef list = LLVMGetParam(ctx->main_fn, - SI_PARAM_RW_BUFFERS); - si_llvm_emit_polygon_stipple(ctx, list, - SI_PARAM_POS_FIXED_PT); - } - if (ctx->type == PIPE_SHADER_GEOMETRY) { int i; for (i = 0; i < 4; i++) { @@ -6757,6 +6677,464 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx, return true; } +/** + * Compute the VS prolog key, which contains all the information needed to + * build the VS prolog function, and set shader->info bits where needed. + */ +static void si_get_vs_prolog_key(struct si_shader *shader, + union si_shader_part_key *key) +{ + struct tgsi_shader_info *info = &shader->selector->info; + + memset(key, 0, sizeof(*key)); + key->vs_prolog.states = shader->key.part.vs.prolog; + key->vs_prolog.num_input_sgprs = shader->info.num_input_sgprs; + key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; + + /* Set the instanceID flag. */ + for (unsigned i = 0; i < info->num_inputs; i++) + if (key->vs_prolog.states.instance_divisors[i]) + shader->info.uses_instanceid = true; +} + +/** + * Compute the VS epilog key, which contains all the information needed to + * build the VS epilog function, and set the PrimitiveID output offset. + */ +static void si_get_vs_epilog_key(struct si_shader *shader, + struct si_vs_epilog_bits *states, + union si_shader_part_key *key) +{ + memset(key, 0, sizeof(*key)); + key->vs_epilog.states = *states; + + /* Set up the PrimitiveID output. */ + if (shader->key.part.vs.epilog.export_prim_id) { + unsigned index = shader->selector->info.num_outputs; + unsigned offset = shader->info.nr_param_exports++; + + key->vs_epilog.prim_id_param_offset = offset; + assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[index] = offset; + } +} + +/** + * Compute the PS prolog key, which contains all the information needed to + * build the PS prolog function, and set related bits in shader->config. + */ +static void si_get_ps_prolog_key(struct si_shader *shader, + union si_shader_part_key *key, + bool separate_prolog) +{ + struct tgsi_shader_info *info = &shader->selector->info; + + memset(key, 0, sizeof(*key)); + key->ps_prolog.states = shader->key.part.ps.prolog; + key->ps_prolog.colors_read = info->colors_read; + key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; + key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; + key->ps_prolog.wqm = info->uses_derivatives && + (key->ps_prolog.colors_read || + key->ps_prolog.states.force_persp_sample_interp || + key->ps_prolog.states.force_linear_sample_interp || + key->ps_prolog.states.force_persp_center_interp || + key->ps_prolog.states.force_linear_center_interp || + key->ps_prolog.states.bc_optimize_for_persp || + key->ps_prolog.states.bc_optimize_for_linear); + + if (info->colors_read) { + unsigned *color = shader->selector->color_attr_index; + + if (shader->key.part.ps.prolog.color_two_side) { + /* BCOLORs are stored after the last input. */ + key->ps_prolog.num_interp_inputs = info->num_inputs; + key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; + shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); + } + + for (unsigned i = 0; i < 2; i++) { + unsigned interp = info->input_interpolate[color[i]]; + unsigned location = info->input_interpolate_loc[color[i]]; + + if (!(info->colors_read & (0xf << i*4))) + continue; + + key->ps_prolog.color_attr_index[i] = color[i]; + + if (shader->key.part.ps.prolog.flatshade_colors && + interp == TGSI_INTERPOLATE_COLOR) + interp = TGSI_INTERPOLATE_CONSTANT; + + switch (interp) { + case TGSI_INTERPOLATE_CONSTANT: + key->ps_prolog.color_interp_vgpr_index[i] = -1; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + case TGSI_INTERPOLATE_COLOR: + /* Force the interpolation location for colors here. */ + if (shader->key.part.ps.prolog.force_persp_sample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + if (shader->key.part.ps.prolog.force_persp_center_interp) + location = TGSI_INTERPOLATE_LOC_CENTER; + + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + key->ps_prolog.color_interp_vgpr_index[i] = 0; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_SAMPLE_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTER: + key->ps_prolog.color_interp_vgpr_index[i] = 2; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTER_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + key->ps_prolog.color_interp_vgpr_index[i] = 4; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTROID_ENA(1); + break; + default: + assert(0); + } + break; + case TGSI_INTERPOLATE_LINEAR: + /* Force the interpolation location for colors here. */ + if (shader->key.part.ps.prolog.force_linear_sample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + if (shader->key.part.ps.prolog.force_linear_center_interp) + location = TGSI_INTERPOLATE_LOC_CENTER; + + /* The VGPR assignment for non-monolithic shaders + * works because InitialPSInputAddr is set on the + * main shader and PERSP_PULL_MODEL is never used. + */ + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + key->ps_prolog.color_interp_vgpr_index[i] = + separate_prolog ? 6 : 9; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_SAMPLE_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTER: + key->ps_prolog.color_interp_vgpr_index[i] = + separate_prolog ? 8 : 11; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTER_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + key->ps_prolog.color_interp_vgpr_index[i] = + separate_prolog ? 10 : 13; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTROID_ENA(1); + break; + default: + assert(0); + } + break; + default: + assert(0); + } + } + } +} + +/** + * Check whether a PS prolog is required based on the key. + */ +static bool si_need_ps_prolog(const union si_shader_part_key *key) +{ + return key->ps_prolog.colors_read || + key->ps_prolog.states.force_persp_sample_interp || + key->ps_prolog.states.force_linear_sample_interp || + key->ps_prolog.states.force_persp_center_interp || + key->ps_prolog.states.force_linear_center_interp || + key->ps_prolog.states.bc_optimize_for_persp || + key->ps_prolog.states.bc_optimize_for_linear || + key->ps_prolog.states.poly_stipple; +} + +/** + * Compute the PS epilog key, which contains all the information needed to + * build the PS epilog function. + */ +static void si_get_ps_epilog_key(struct si_shader *shader, + union si_shader_part_key *key) +{ + struct tgsi_shader_info *info = &shader->selector->info; + memset(key, 0, sizeof(*key)); + key->ps_epilog.colors_written = info->colors_written; + key->ps_epilog.writes_z = info->writes_z; + key->ps_epilog.writes_stencil = info->writes_stencil; + key->ps_epilog.writes_samplemask = info->writes_samplemask; + key->ps_epilog.states = shader->key.part.ps.epilog; +} + +/** + * Build the GS prolog function. Rotate the input vertices for triangle strips + * with adjacency. + */ +static void si_build_gs_prolog_function(struct si_shader_context *ctx, + union si_shader_part_key *key) +{ + const unsigned num_sgprs = SI_GS_NUM_USER_SGPR + 2; + const unsigned num_vgprs = 8; + struct gallivm_state *gallivm = &ctx->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef params[32]; + LLVMTypeRef returns[32]; + LLVMValueRef func, ret; + + for (unsigned i = 0; i < num_sgprs; ++i) { + params[i] = ctx->i32; + returns[i] = ctx->i32; + } + + for (unsigned i = 0; i < num_vgprs; ++i) { + params[num_sgprs + i] = ctx->i32; + returns[num_sgprs + i] = ctx->f32; + } + + /* Create the function. */ + si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, + params, num_sgprs + num_vgprs, num_sgprs - 1); + func = ctx->main_fn; + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx->return_value; + for (unsigned i = 0; i < num_sgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(builder, ret, p, i, ""); + } + for (unsigned i = 0; i < num_vgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); + p = LLVMBuildBitCast(builder, p, ctx->f32, ""); + ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); + } + + if (key->gs_prolog.states.tri_strip_adj_fix) { + /* Remap the input vertices for every other primitive. */ + const unsigned vtx_params[6] = { + num_sgprs, + num_sgprs + 1, + num_sgprs + 3, + num_sgprs + 4, + num_sgprs + 5, + num_sgprs + 6 + }; + LLVMValueRef prim_id, rotate; + + prim_id = LLVMGetParam(func, num_sgprs + 2); + rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, ""); + + for (unsigned i = 0; i < 6; ++i) { + LLVMValueRef base, rotated, actual; + base = LLVMGetParam(func, vtx_params[i]); + rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]); + actual = LLVMBuildSelect(builder, rotate, rotated, base, ""); + actual = LLVMBuildBitCast(builder, actual, ctx->f32, ""); + ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], ""); + } + } + + LLVMBuildRet(builder, ret); +} + +/** + * Given a list of shader part functions, build a wrapper function that + * runs them in sequence to form a monolithic shader. + */ +static void si_build_wrapper_function(struct si_shader_context *ctx, + LLVMValueRef *parts, + unsigned num_parts, + unsigned main_part) +{ + struct gallivm_state *gallivm = &ctx->gallivm; + LLVMBuilderRef builder = ctx->gallivm.builder; + /* PS epilog has one arg per color component */ + LLVMTypeRef param_types[48]; + LLVMValueRef out[48]; + LLVMTypeRef function_type; + unsigned num_params; + unsigned num_out; + MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */ + unsigned num_sgprs, num_vgprs; + unsigned last_sgpr_param; + unsigned gprs; + + for (unsigned i = 0; i < num_parts; ++i) { + lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(parts[i], LLVMPrivateLinkage); + } + + /* The parameters of the wrapper function correspond to those of the + * first part in terms of SGPRs and VGPRs, but we use the types of the + * main part to get the right types. This is relevant for the + * dereferenceable attribute on descriptor table pointers. + */ + num_sgprs = 0; + num_vgprs = 0; + + function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); + num_params = LLVMCountParamTypes(function_type); + + for (unsigned i = 0; i < num_params; ++i) { + LLVMValueRef param = LLVMGetParam(parts[0], i); + + if (ac_is_sgpr_param(param)) { + assert(num_vgprs == 0); + num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4; + } else { + num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4; + } + } + assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types)); + + num_params = 0; + last_sgpr_param = 0; + gprs = 0; + while (gprs < num_sgprs + num_vgprs) { + LLVMValueRef param = LLVMGetParam(parts[main_part], num_params); + unsigned size; + + param_types[num_params] = LLVMTypeOf(param); + if (gprs < num_sgprs) + last_sgpr_param = num_params; + size = llvm_get_type_size(param_types[num_params]) / 4; + num_params++; + + assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); + assert(gprs + size <= num_sgprs + num_vgprs && + (gprs >= num_sgprs || gprs + size <= num_sgprs)); + + gprs += size; + } + + si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param); + + /* Record the arguments of the function as if they were an output of + * a previous part. + */ + num_out = 0; + num_out_sgpr = 0; + + for (unsigned i = 0; i < num_params; ++i) { + LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); + LLVMTypeRef param_type = LLVMTypeOf(param); + LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32; + unsigned size = llvm_get_type_size(param_type) / 4; + + if (size == 1) { + if (param_type != out_type) + param = LLVMBuildBitCast(builder, param, out_type, ""); + out[num_out++] = param; + } else { + LLVMTypeRef vector_type = LLVMVectorType(out_type, size); + + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + param = LLVMBuildPtrToInt(builder, param, ctx->i64, ""); + param_type = ctx->i64; + } + + if (param_type != vector_type) + param = LLVMBuildBitCast(builder, param, vector_type, ""); + + for (unsigned j = 0; j < size; ++j) + out[num_out++] = LLVMBuildExtractElement( + builder, param, LLVMConstInt(ctx->i32, j, 0), ""); + } + + if (i <= last_sgpr_param) + num_out_sgpr = num_out; + } + + /* Now chain the parts. */ + for (unsigned part = 0; part < num_parts; ++part) { + LLVMValueRef in[48]; + LLVMValueRef ret; + LLVMTypeRef ret_type; + unsigned out_idx = 0; + + num_params = LLVMCountParams(parts[part]); + assert(num_params <= ARRAY_SIZE(param_types)); + + /* Derive arguments for the next part from outputs of the + * previous one. + */ + for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { + LLVMValueRef param; + LLVMTypeRef param_type; + bool is_sgpr; + unsigned param_size; + LLVMValueRef arg = NULL; + + param = LLVMGetParam(parts[part], param_idx); + param_type = LLVMTypeOf(param); + param_size = llvm_get_type_size(param_type) / 4; + is_sgpr = ac_is_sgpr_param(param); + + if (is_sgpr) { +#if HAVE_LLVM < 0x0400 + LLVMRemoveAttribute(param, LLVMByValAttribute); +#else + unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5); + LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id); +#endif + lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG); + } + + assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); + assert(is_sgpr || out_idx >= num_out_sgpr); + + if (param_size == 1) + arg = out[out_idx]; + else + arg = lp_build_gather_values(gallivm, &out[out_idx], param_size); + + if (LLVMTypeOf(arg) != param_type) { + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + arg = LLVMBuildBitCast(builder, arg, ctx->i64, ""); + arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); + } else { + arg = LLVMBuildBitCast(builder, arg, param_type, ""); + } + } + + in[param_idx] = arg; + out_idx += param_size; + } + + ret = LLVMBuildCall(builder, parts[part], in, num_params, ""); + ret_type = LLVMTypeOf(ret); + + /* Extract the returned GPRs. */ + num_out = 0; + num_out_sgpr = 0; + + if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { + assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); + + unsigned ret_size = LLVMCountStructElementTypes(ret_type); + + for (unsigned i = 0; i < ret_size; ++i) { + LLVMValueRef val = + LLVMBuildExtractValue(builder, ret, i, ""); + + out[num_out++] = val; + + if (LLVMTypeOf(val) == ctx->i32) { + assert(num_out_sgpr + 1 == num_out); + num_out_sgpr = num_out; + } + } + } + } + + LLVMBuildRetVoid(builder); +} + int si_compile_tgsi_shader(struct si_screen *sscreen, LLVMTargetMachineRef tm, struct si_shader *shader, @@ -6778,16 +7156,14 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, } si_init_shader_ctx(&ctx, sscreen, shader, tm); - ctx.no_prolog = is_monolithic; - ctx.no_epilog = is_monolithic; ctx.separate_prolog = !is_monolithic; - memset(shader->info.vs_output_param_offset, 0xff, + memset(shader->info.vs_output_param_offset, EXP_PARAM_UNDEFINED, sizeof(shader->info.vs_output_param_offset)); shader->info.uses_instanceid = sel->info.uses_instanceid; - bld_base = &ctx.soa.bld_base; + bld_base = &ctx.bld_base; ctx.load_system_value = declare_system_value; if (!si_compile_tgsi_main(&ctx, shader)) { @@ -6795,19 +7171,108 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, return -1; } + if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { + LLVMValueRef parts[3]; + bool need_prolog; + bool need_epilog; + + need_prolog = sel->info.num_inputs; + need_epilog = !shader->key.as_es && !shader->key.as_ls; + + parts[need_prolog ? 1 : 0] = ctx.main_fn; + + if (need_prolog) { + union si_shader_part_key prolog_key; + si_get_vs_prolog_key(shader, &prolog_key); + si_build_vs_prolog_function(&ctx, &prolog_key); + parts[0] = ctx.main_fn; + } + + if (need_epilog) { + union si_shader_part_key epilog_key; + si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key); + si_build_vs_epilog_function(&ctx, &epilog_key); + parts[need_prolog ? 2 : 1] = ctx.main_fn; + } + + si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog, + need_prolog ? 1 : 0); + } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { + LLVMValueRef parts[2]; + union si_shader_part_key epilog_key; + + parts[0] = ctx.main_fn; + + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + si_build_tcs_epilog_function(&ctx, &epilog_key); + parts[1] = ctx.main_fn; + + si_build_wrapper_function(&ctx, parts, 2, 0); + } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL && + !shader->key.as_es) { + LLVMValueRef parts[2]; + union si_shader_part_key epilog_key; + + parts[0] = ctx.main_fn; + + si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key); + si_build_vs_epilog_function(&ctx, &epilog_key); + parts[1] = ctx.main_fn; + + si_build_wrapper_function(&ctx, parts, 2, 0); + } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) { + LLVMValueRef parts[2]; + union si_shader_part_key prolog_key; + + parts[1] = ctx.main_fn; + + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.gs_prolog.states = shader->key.part.gs.prolog; + si_build_gs_prolog_function(&ctx, &prolog_key); + parts[0] = ctx.main_fn; + + si_build_wrapper_function(&ctx, parts, 2, 1); + } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) { + LLVMValueRef parts[3]; + union si_shader_part_key prolog_key; + union si_shader_part_key epilog_key; + bool need_prolog; + + si_get_ps_prolog_key(shader, &prolog_key, false); + need_prolog = si_need_ps_prolog(&prolog_key); + + parts[need_prolog ? 1 : 0] = ctx.main_fn; + + if (need_prolog) { + si_build_ps_prolog_function(&ctx, &prolog_key); + parts[0] = ctx.main_fn; + } + + si_get_ps_epilog_key(shader, &epilog_key); + si_build_ps_epilog_function(&ctx, &epilog_key); + parts[need_prolog ? 2 : 1] = ctx.main_fn; + + si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0); + } + mod = bld_base->base.gallivm->module; /* Dump LLVM IR before any optimization passes */ if (sscreen->b.debug_flags & DBG_PREOPT_IR && r600_can_dump_shader(&sscreen->b, ctx.type)) - LLVMDumpModule(mod); + ac_dump_module(mod); si_llvm_finalize_module(&ctx, r600_extra_shader_checks(&sscreen->b, ctx.type)); - /* Post-optimization transformations. */ + /* Post-optimization transformations and analysis. */ si_eliminate_const_vs_outputs(&ctx); + if ((debug && debug->debug_message) || + r600_can_dump_shader(&sscreen->b, ctx.type)) + si_count_scratch_private_memory(&ctx); + /* Compile to bytecode. */ r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm, mod, debug, ctx.type, "TGSI shader"); @@ -6821,20 +7286,11 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, * LLVM 3.9svn has this bug. */ if (sel->type == PIPE_SHADER_COMPUTE) { - unsigned *props = sel->info.properties; unsigned wave_size = 64; unsigned max_vgprs = 256; unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512; unsigned max_sgprs_per_wave = 128; - unsigned max_block_threads; - - if (props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]) - max_block_threads = props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * - props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * - props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; - else - max_block_threads = SI_MAX_VARIABLE_THREADS_PER_BLOCK; - + unsigned max_block_threads = si_get_max_workgroup_size(shader); unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size); unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4); @@ -6902,19 +7358,6 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, shader->info.num_input_vgprs += 1; } - if (ctx.type == PIPE_SHADER_GEOMETRY) { - shader->gs_copy_shader = CALLOC_STRUCT(si_shader); - shader->gs_copy_shader->selector = shader->selector; - ctx.shader = shader->gs_copy_shader; - r = si_generate_gs_copy_shader(sscreen, &ctx, - shader, debug); - if (r) { - free(shader->gs_copy_shader); - shader->gs_copy_shader = NULL; - return r; - } - } - return 0; } @@ -6923,22 +7366,25 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, * * \param sscreen screen * \param list list of shader parts of the same category + * \param type shader type * \param key shader part key + * \param prolog whether the part being requested is a prolog * \param tm LLVM target machine * \param debug debug callback - * \param compile the callback responsible for compilation + * \param build the callback responsible for building the main function * \return non-NULL on success */ static struct si_shader_part * si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, + enum pipe_shader_type type, + bool prolog, union si_shader_part_key *key, LLVMTargetMachineRef tm, struct pipe_debug_callback *debug, - bool (*compile)(struct si_screen *, - LLVMTargetMachineRef, - struct pipe_debug_callback *, - struct si_shader_part *)) + void (*build)(struct si_shader_context *, + union si_shader_part_key *), + const char *name) { struct si_shader_part *result; @@ -6955,24 +7401,62 @@ si_get_shader_part(struct si_screen *sscreen, /* Compile a new one. */ result = CALLOC_STRUCT(si_shader_part); result->key = *key; - if (!compile(sscreen, tm, debug, result)) { + + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.gallivm; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = type; + + switch (type) { + case PIPE_SHADER_VERTEX: + break; + case PIPE_SHADER_TESS_CTRL: + assert(!prolog); + shader.key.part.tcs.epilog = key->tcs_epilog.states; + break; + case PIPE_SHADER_GEOMETRY: + assert(prolog); + break; + case PIPE_SHADER_FRAGMENT: + if (prolog) + shader.key.part.ps.prolog = key->ps_prolog.states; + else + shader.key.part.ps.epilog = key->ps_epilog.states; + break; + default: + unreachable("bad shader part"); + } + + build(&ctx, key); + + /* Compile. */ + si_llvm_finalize_module(&ctx, + r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT)); + + if (si_compile_llvm(sscreen, &result->binary, &result->config, tm, + gallivm->module, debug, ctx.type, name)) { FREE(result); - pipe_mutex_unlock(sscreen->shader_parts_mutex); - return NULL; + result = NULL; + goto out; } result->next = *list; *list = result; + +out: + si_llvm_dispose(&ctx); pipe_mutex_unlock(sscreen->shader_parts_mutex); return result; } /** - * Create a vertex shader prolog. + * Build the vertex shader prolog function. * * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). * All inputs are returned unmodified. The vertex load indices are - * stored after them, which will used by the API VS for fetching inputs. + * stored after them, which will be used by the API VS for fetching inputs. * * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: * input_v0, @@ -6983,24 +7467,16 @@ si_get_shader_part(struct si_screen *sscreen, * (InstanceID + StartInstance), * (InstanceID / 2 + StartInstance) */ -static bool si_compile_vs_prolog(struct si_screen *sscreen, - LLVMTargetMachineRef tm, - struct pipe_debug_callback *debug, - struct si_shader_part *out) +static void si_build_vs_prolog_function(struct si_shader_context *ctx, + union si_shader_part_key *key) { - union si_shader_part_key *key = &out->key; - struct si_shader shader = {}; - struct si_shader_context ctx; - struct gallivm_state *gallivm = &ctx.gallivm; + struct gallivm_state *gallivm = &ctx->gallivm; LLVMTypeRef *params, *returns; LLVMValueRef ret, func; int last_sgpr, num_params, num_returns, i; - bool status = true; - si_init_shader_ctx(&ctx, sscreen, &shader, tm); - ctx.type = PIPE_SHADER_VERTEX; - ctx.param_vertex_id = key->vs_prolog.num_input_sgprs; - ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3; + ctx->param_vertex_id = key->vs_prolog.num_input_sgprs; + ctx->param_instance_id = key->vs_prolog.num_input_sgprs + 3; /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ params = alloca((key->vs_prolog.num_input_sgprs + 4) * @@ -7014,37 +7490,37 @@ static bool si_compile_vs_prolog(struct si_screen *sscreen, /* Declare input and output SGPRs. */ num_params = 0; for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { - params[num_params++] = ctx.i32; - returns[num_returns++] = ctx.i32; + params[num_params++] = ctx->i32; + returns[num_returns++] = ctx->i32; } last_sgpr = num_params - 1; /* 4 preloaded VGPRs (outputs must be floats) */ for (i = 0; i < 4; i++) { - params[num_params++] = ctx.i32; - returns[num_returns++] = ctx.f32; + params[num_params++] = ctx->i32; + returns[num_returns++] = ctx->f32; } /* Vertex load indices. */ for (i = 0; i <= key->vs_prolog.last_input; i++) - returns[num_returns++] = ctx.f32; + returns[num_returns++] = ctx->f32; /* Create the function. */ - si_create_function(&ctx, returns, num_returns, params, + si_create_function(ctx, "vs_prolog", returns, num_returns, params, num_params, last_sgpr); - func = ctx.main_fn; + func = ctx->main_fn; /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. */ - ret = ctx.return_value; + ret = ctx->return_value; for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { LLVMValueRef p = LLVMGetParam(func, i); ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); } for (i = num_params - 4; i < num_params; i++) { LLVMValueRef p = LLVMGetParam(func, i); - p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, ""); + p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, ""); ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); } @@ -7055,37 +7531,26 @@ static bool si_compile_vs_prolog(struct si_screen *sscreen, if (divisor) { /* InstanceID / Divisor + StartInstance */ - index = get_instance_index_for_fetch(&ctx, + index = get_instance_index_for_fetch(ctx, SI_SGPR_START_INSTANCE, divisor); } else { /* VertexID + BaseVertex */ index = LLVMBuildAdd(gallivm->builder, - LLVMGetParam(func, ctx.param_vertex_id), + LLVMGetParam(func, ctx->param_vertex_id), LLVMGetParam(func, SI_SGPR_BASE_VERTEX), ""); } - index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, ""); + index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, ""); ret = LLVMBuildInsertValue(gallivm->builder, ret, index, num_params++, ""); } - /* Compile. */ - si_llvm_build_ret(&ctx, ret); - si_llvm_finalize_module(&ctx, - r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_VERTEX)); - - if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, - gallivm->module, debug, ctx.type, - "Vertex Shader Prolog")) - status = false; - - si_llvm_dispose(&ctx); - return status; + si_llvm_build_ret(ctx, ret); } /** - * Compile the vertex shader epilog. This is also used by the tessellation + * Build the vertex shader epilog function. This is also used by the tessellation * evaluation shader compiled as VS. * * The input is PrimitiveID. @@ -7093,21 +7558,13 @@ static bool si_compile_vs_prolog(struct si_screen *sscreen, * If PrimitiveID is required by the pixel shader, export it. * Otherwise, do nothing. */ -static bool si_compile_vs_epilog(struct si_screen *sscreen, - LLVMTargetMachineRef tm, - struct pipe_debug_callback *debug, - struct si_shader_part *out) +static void si_build_vs_epilog_function(struct si_shader_context *ctx, + union si_shader_part_key *key) { - union si_shader_part_key *key = &out->key; - struct si_shader_context ctx; - struct gallivm_state *gallivm = &ctx.gallivm; - struct lp_build_tgsi_context *bld_base = &ctx.soa.bld_base; + struct gallivm_state *gallivm = &ctx->gallivm; + struct lp_build_tgsi_context *bld_base = &ctx->bld_base; LLVMTypeRef params[5]; int num_params, i; - bool status = true; - - si_init_shader_ctx(&ctx, sscreen, NULL, tm); - ctx.type = PIPE_SHADER_VERTEX; /* Declare input VGPRs. */ num_params = key->vs_epilog.states.export_prim_id ? @@ -7115,10 +7572,10 @@ static bool si_compile_vs_epilog(struct si_screen *sscreen, assert(num_params <= ARRAY_SIZE(params)); for (i = 0; i < num_params; i++) - params[i] = ctx.f32; + params[i] = ctx->f32; /* Create the function. */ - si_create_function(&ctx, NULL, 0, params, num_params, -1); + si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1); /* Emit exports. */ if (key->vs_epilog.states.export_prim_id) { @@ -7132,7 +7589,7 @@ static bool si_compile_vs_epilog(struct si_screen *sscreen, args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM + key->vs_epilog.prim_id_param_offset); args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */ - args[5] = LLVMGetParam(ctx.main_fn, + args[5] = LLVMGetParam(ctx->main_fn, VS_EPILOG_PRIMID_LOC); /* X */ args[6] = base->undef; /* Y */ args[7] = base->undef; /* Z */ @@ -7143,18 +7600,7 @@ static bool si_compile_vs_epilog(struct si_screen *sscreen, args, 9, 0); } - /* Compile. */ LLVMBuildRetVoid(gallivm->builder); - si_llvm_finalize_module(&ctx, - r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_VERTEX)); - - if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, - gallivm->module, debug, ctx.type, - "Vertex Shader Epilog")) - status = false; - - si_llvm_dispose(&ctx); - return status; } /** @@ -7168,22 +7614,13 @@ static bool si_get_vs_epilog(struct si_screen *sscreen, { union si_shader_part_key epilog_key; - memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.vs_epilog.states = *states; - - /* Set up the PrimitiveID output. */ - if (shader->key.vs.epilog.export_prim_id) { - unsigned index = shader->selector->info.num_outputs; - unsigned offset = shader->info.nr_param_exports++; - - epilog_key.vs_epilog.prim_id_param_offset = offset; - assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset)); - shader->info.vs_output_param_offset[index] = offset; - } + si_get_vs_epilog_key(shader, states, &epilog_key); shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs, + PIPE_SHADER_VERTEX, true, &epilog_key, tm, debug, - si_compile_vs_epilog); + si_build_vs_epilog_function, + "Vertex Shader Epilog"); return shader->epilog != NULL; } @@ -7197,35 +7634,28 @@ static bool si_shader_select_vs_parts(struct si_screen *sscreen, { struct tgsi_shader_info *info = &shader->selector->info; union si_shader_part_key prolog_key; - unsigned i; /* Get the prolog. */ - memset(&prolog_key, 0, sizeof(prolog_key)); - prolog_key.vs_prolog.states = shader->key.vs.prolog; - prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs; - prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; + si_get_vs_prolog_key(shader, &prolog_key); /* The prolog is a no-op if there are no inputs. */ if (info->num_inputs) { shader->prolog = si_get_shader_part(sscreen, &sscreen->vs_prologs, + PIPE_SHADER_VERTEX, true, &prolog_key, tm, debug, - si_compile_vs_prolog); + si_build_vs_prolog_function, + "Vertex Shader Prolog"); if (!shader->prolog) return false; } /* Get the epilog. */ - if (!shader->key.vs.as_es && !shader->key.vs.as_ls && + if (!shader->key.as_es && !shader->key.as_ls && !si_get_vs_epilog(sscreen, tm, shader, debug, - &shader->key.vs.epilog)) + &shader->key.part.vs.epilog)) return false; - /* Set the instanceID flag. */ - for (i = 0; i < info->num_inputs; i++) - if (prolog_key.vs_prolog.states.instance_divisors[i]) - shader->info.uses_instanceid = true; - return true; } @@ -7237,78 +7667,57 @@ static bool si_shader_select_tes_parts(struct si_screen *sscreen, struct si_shader *shader, struct pipe_debug_callback *debug) { - if (shader->key.tes.as_es) + if (shader->key.as_es) return true; /* TES compiled as VS. */ return si_get_vs_epilog(sscreen, tm, shader, debug, - &shader->key.tes.epilog); + &shader->key.part.tes.epilog); } /** - * Compile the TCS epilog. This writes tesselation factors to memory based on - * the output primitive type of the tesselator (determined by TES). + * Compile the TCS epilog function. This writes tesselation factors to memory + * based on the output primitive type of the tesselator (determined by TES). */ -static bool si_compile_tcs_epilog(struct si_screen *sscreen, - LLVMTargetMachineRef tm, - struct pipe_debug_callback *debug, - struct si_shader_part *out) +static void si_build_tcs_epilog_function(struct si_shader_context *ctx, + union si_shader_part_key *key) { - union si_shader_part_key *key = &out->key; - struct si_shader shader = {}; - struct si_shader_context ctx; - struct gallivm_state *gallivm = &ctx.gallivm; - struct lp_build_tgsi_context *bld_base = &ctx.soa.bld_base; + struct gallivm_state *gallivm = &ctx->gallivm; + struct lp_build_tgsi_context *bld_base = &ctx->bld_base; LLVMTypeRef params[16]; LLVMValueRef func; int last_sgpr, num_params; - bool status = true; - - si_init_shader_ctx(&ctx, sscreen, &shader, tm); - ctx.type = PIPE_SHADER_TESS_CTRL; - shader.key.tcs.epilog = key->tcs_epilog.states; /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */ - params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS); - params[SI_PARAM_CONST_BUFFERS] = ctx.i64; - params[SI_PARAM_SAMPLERS] = ctx.i64; - params[SI_PARAM_IMAGES] = ctx.i64; - params[SI_PARAM_SHADER_BUFFERS] = ctx.i64; - params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32; - params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32; - params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32; - params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32; - params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32; - params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32; + params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS); + params[SI_PARAM_CONST_BUFFERS] = ctx->i64; + params[SI_PARAM_SAMPLERS] = ctx->i64; + params[SI_PARAM_IMAGES] = ctx->i64; + params[SI_PARAM_SHADER_BUFFERS] = ctx->i64; + params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32; + params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32; + params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32; + params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32; + params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32; + params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32; last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; num_params = last_sgpr + 1; - params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */ - params[num_params++] = ctx.i32; /* invocation ID within the patch */ - params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */ + params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */ + params[num_params++] = ctx->i32; /* invocation ID within the patch */ + params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */ /* Create the function. */ - si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr); - declare_tess_lds(&ctx); - func = ctx.main_fn; + si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr); + declare_tess_lds(ctx); + func = ctx->main_fn; si_write_tess_factors(bld_base, LLVMGetParam(func, last_sgpr + 1), LLVMGetParam(func, last_sgpr + 2), LLVMGetParam(func, last_sgpr + 3)); - /* Compile. */ LLVMBuildRetVoid(gallivm->builder); - si_llvm_finalize_module(&ctx, - r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_TESS_CTRL)); - - if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, - gallivm->module, debug, ctx.type, - "Tessellation Control Shader Epilog")) - status = false; - - si_llvm_dispose(&ctx); - return status; } /** @@ -7323,16 +7732,42 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen, /* Get the epilog. */ memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.tcs_epilog.states = shader->key.tcs.epilog; + epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, + PIPE_SHADER_TESS_CTRL, false, &epilog_key, tm, debug, - si_compile_tcs_epilog); + si_build_tcs_epilog_function, + "Tessellation Control Shader Epilog"); return shader->epilog != NULL; } /** - * Compile the pixel shader prolog. This handles: + * Select and compile (or reuse) GS parts (prolog). + */ +static bool si_shader_select_gs_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + union si_shader_part_key prolog_key; + + if (!shader->key.part.gs.prolog.tri_strip_adj_fix) + return true; + + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.gs_prolog.states = shader->key.part.gs.prolog; + + shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs, + PIPE_SHADER_GEOMETRY, true, + &prolog_key, tm, debug, + si_build_gs_prolog_function, + "Geometry Shader Prolog"); + return shader->prolog != NULL; +} + +/** + * Build the pixel shader prolog function. This handles: * - two-side color selection and interpolation * - overriding interpolation parameters for the API PS * - polygon stippling @@ -7341,23 +7776,15 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen, * overriden by other states. (e.g. per-sample interpolation) * Interpolated colors are stored after the preloaded VGPRs. */ -static bool si_compile_ps_prolog(struct si_screen *sscreen, - LLVMTargetMachineRef tm, - struct pipe_debug_callback *debug, - struct si_shader_part *out) +static void si_build_ps_prolog_function(struct si_shader_context *ctx, + union si_shader_part_key *key) { - union si_shader_part_key *key = &out->key; - struct si_shader shader = {}; - struct si_shader_context ctx; - struct gallivm_state *gallivm = &ctx.gallivm; + struct gallivm_state *gallivm = &ctx->gallivm; LLVMTypeRef *params; LLVMValueRef ret, func; int last_sgpr, num_params, num_returns, i, num_color_channels; - bool status = true; - si_init_shader_ctx(&ctx, sscreen, &shader, tm); - ctx.type = PIPE_SHADER_FRAGMENT; - shader.key.ps.prolog = key->ps_prolog.states; + assert(si_need_ps_prolog(key)); /* Number of inputs + 8 color elements. */ params = alloca((key->ps_prolog.num_input_sgprs + @@ -7367,27 +7794,27 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen, /* Declare inputs. */ num_params = 0; for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) - params[num_params++] = ctx.i32; + params[num_params++] = ctx->i32; last_sgpr = num_params - 1; for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) - params[num_params++] = ctx.f32; + params[num_params++] = ctx->f32; /* Declare outputs (same as inputs + add colors if needed) */ num_returns = num_params; num_color_channels = util_bitcount(key->ps_prolog.colors_read); for (i = 0; i < num_color_channels; i++) - params[num_returns++] = ctx.f32; + params[num_returns++] = ctx->f32; /* Create the function. */ - si_create_function(&ctx, params, num_returns, params, + si_create_function(ctx, "ps_prolog", params, num_returns, params, num_params, last_sgpr); - func = ctx.main_fn; + func = ctx->main_fn; /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. */ - ret = ctx.return_value; + ret = ctx->return_value; for (i = 0; i < num_params; i++) { LLVMValueRef p = LLVMGetParam(func, i); ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); @@ -7404,11 +7831,11 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen, ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS); ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI); list = lp_build_gather_values(gallivm, ptr, 2); - list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, ""); + list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, ""); list = LLVMBuildIntToPtr(gallivm->builder, list, - const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), ""); + const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), ""); - si_llvm_emit_polygon_stipple(&ctx, list, pos); + si_llvm_emit_polygon_stipple(ctx, list, pos); } if (key->ps_prolog.states.bc_optimize_for_persp || @@ -7424,9 +7851,9 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen, */ bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize, - LLVMConstInt(ctx.i32, 31, 0), ""); + LLVMConstInt(ctx->i32, 31, 0), ""); bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize, - ctx.i1, ""); + ctx->i1, ""); if (key->ps_prolog.states.bc_optimize_for_persp) { /* Read PERSP_CENTER. */ @@ -7550,8 +7977,6 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen, interp[1] = LLVMBuildExtractValue(gallivm->builder, ret, interp_vgpr + 1, ""); interp_ij = lp_build_gather_values(gallivm, interp, 2); - interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij, - ctx.v2i32, ""); } /* Use the absolute location of the input. */ @@ -7559,10 +7984,10 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen, if (key->ps_prolog.states.color_two_side) { face = LLVMGetParam(func, face_vgpr); - face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, ""); + face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, ""); } - interp_fs_input(&ctx, + interp_fs_input(ctx, key->ps_prolog.color_attr_index[i], TGSI_SEMANTIC_COLOR, i, key->ps_prolog.num_interp_inputs, @@ -7582,51 +8007,30 @@ static bool si_compile_ps_prolog(struct si_screen *sscreen, "amdgpu-ps-wqm-outputs", ""); } - /* Compile. */ - si_llvm_build_ret(&ctx, ret); - si_llvm_finalize_module(&ctx, - r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT)); - - if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, - gallivm->module, debug, ctx.type, - "Fragment Shader Prolog")) - status = false; - - si_llvm_dispose(&ctx); - return status; + si_llvm_build_ret(ctx, ret); } /** - * Compile the pixel shader epilog. This handles everything that must be + * Build the pixel shader epilog function. This handles everything that must be * emulated for pixel shader exports. (alpha-test, format conversions, etc) */ -static bool si_compile_ps_epilog(struct si_screen *sscreen, - LLVMTargetMachineRef tm, - struct pipe_debug_callback *debug, - struct si_shader_part *out) +static void si_build_ps_epilog_function(struct si_shader_context *ctx, + union si_shader_part_key *key) { - union si_shader_part_key *key = &out->key; - struct si_shader shader = {}; - struct si_shader_context ctx; - struct gallivm_state *gallivm = &ctx.gallivm; - struct lp_build_tgsi_context *bld_base = &ctx.soa.bld_base; + struct gallivm_state *gallivm = &ctx->gallivm; + struct lp_build_tgsi_context *bld_base = &ctx->bld_base; LLVMTypeRef params[16+8*4+3]; LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; int last_sgpr, num_params, i; - bool status = true; struct si_ps_exports exp = {}; - si_init_shader_ctx(&ctx, sscreen, &shader, tm); - ctx.type = PIPE_SHADER_FRAGMENT; - shader.key.ps.epilog = key->ps_epilog.states; - /* Declare input SGPRs. */ - params[SI_PARAM_RW_BUFFERS] = ctx.i64; - params[SI_PARAM_CONST_BUFFERS] = ctx.i64; - params[SI_PARAM_SAMPLERS] = ctx.i64; - params[SI_PARAM_IMAGES] = ctx.i64; - params[SI_PARAM_SHADER_BUFFERS] = ctx.i64; - params[SI_PARAM_ALPHA_REF] = ctx.f32; + params[SI_PARAM_RW_BUFFERS] = ctx->i64; + params[SI_PARAM_CONST_BUFFERS] = ctx->i64; + params[SI_PARAM_SAMPLERS] = ctx->i64; + params[SI_PARAM_IMAGES] = ctx->i64; + params[SI_PARAM_SHADER_BUFFERS] = ctx->i64; + params[SI_PARAM_ALPHA_REF] = ctx->f32; last_sgpr = SI_PARAM_ALPHA_REF; /* Declare input VGPRs. */ @@ -7642,12 +8046,12 @@ static bool si_compile_ps_epilog(struct si_screen *sscreen, assert(num_params <= ARRAY_SIZE(params)); for (i = last_sgpr + 1; i < num_params; i++) - params[i] = ctx.f32; + params[i] = ctx->f32; /* Create the function. */ - si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr); + si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr); /* Disable elimination of unused inputs. */ - si_llvm_add_attribute(ctx.main_fn, + si_llvm_add_attribute(ctx->main_fn, "InitialPSInputAddr", 0xffffff); /* Process colors. */ @@ -7680,7 +8084,7 @@ static bool si_compile_ps_epilog(struct si_screen *sscreen, int mrt = u_bit_scan(&colors_written); for (i = 0; i < 4; i++) - color[i] = LLVMGetParam(ctx.main_fn, vgpr++); + color[i] = LLVMGetParam(ctx->main_fn, vgpr++); si_export_mrt_color(bld_base, color, mrt, num_params - 1, @@ -7689,11 +8093,11 @@ static bool si_compile_ps_epilog(struct si_screen *sscreen, /* Process depth, stencil, samplemask. */ if (key->ps_epilog.writes_z) - depth = LLVMGetParam(ctx.main_fn, vgpr++); + depth = LLVMGetParam(ctx->main_fn, vgpr++); if (key->ps_epilog.writes_stencil) - stencil = LLVMGetParam(ctx.main_fn, vgpr++); + stencil = LLVMGetParam(ctx->main_fn, vgpr++); if (key->ps_epilog.writes_samplemask) - samplemask = LLVMGetParam(ctx.main_fn, vgpr++); + samplemask = LLVMGetParam(ctx->main_fn, vgpr++); if (depth || stencil || samplemask) si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp); @@ -7701,20 +8105,10 @@ static bool si_compile_ps_epilog(struct si_screen *sscreen, si_export_null(bld_base); if (exp.num) - si_emit_ps_exports(&ctx, &exp); + si_emit_ps_exports(ctx, &exp); /* Compile. */ LLVMBuildRetVoid(gallivm->builder); - si_llvm_finalize_module(&ctx, - r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT)); - - if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, - gallivm->module, debug, ctx.type, - "Fragment Shader Epilog")) - status = false; - - si_llvm_dispose(&ctx); - return status; } /** @@ -7725,175 +8119,65 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct si_shader *shader, struct pipe_debug_callback *debug) { - struct tgsi_shader_info *info = &shader->selector->info; union si_shader_part_key prolog_key; union si_shader_part_key epilog_key; - unsigned i; /* Get the prolog. */ - memset(&prolog_key, 0, sizeof(prolog_key)); - prolog_key.ps_prolog.states = shader->key.ps.prolog; - prolog_key.ps_prolog.colors_read = info->colors_read; - prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; - prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; - prolog_key.ps_prolog.wqm = info->uses_derivatives && - (prolog_key.ps_prolog.colors_read || - prolog_key.ps_prolog.states.force_persp_sample_interp || - prolog_key.ps_prolog.states.force_linear_sample_interp || - prolog_key.ps_prolog.states.force_persp_center_interp || - prolog_key.ps_prolog.states.force_linear_center_interp || - prolog_key.ps_prolog.states.bc_optimize_for_persp || - prolog_key.ps_prolog.states.bc_optimize_for_linear); - - if (info->colors_read) { - unsigned *color = shader->selector->color_attr_index; - - if (shader->key.ps.prolog.color_two_side) { - /* BCOLORs are stored after the last input. */ - prolog_key.ps_prolog.num_interp_inputs = info->num_inputs; - prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; - shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); - } - - for (i = 0; i < 2; i++) { - unsigned interp = info->input_interpolate[color[i]]; - unsigned location = info->input_interpolate_loc[color[i]]; - - if (!(info->colors_read & (0xf << i*4))) - continue; - - prolog_key.ps_prolog.color_attr_index[i] = color[i]; - - if (shader->key.ps.prolog.flatshade_colors && - interp == TGSI_INTERPOLATE_COLOR) - interp = TGSI_INTERPOLATE_CONSTANT; - - switch (interp) { - case TGSI_INTERPOLATE_CONSTANT: - prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1; - break; - case TGSI_INTERPOLATE_PERSPECTIVE: - case TGSI_INTERPOLATE_COLOR: - /* Force the interpolation location for colors here. */ - if (shader->key.ps.prolog.force_persp_sample_interp) - location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.ps.prolog.force_persp_center_interp) - location = TGSI_INTERPOLATE_LOC_CENTER; - - switch (location) { - case TGSI_INTERPOLATE_LOC_SAMPLE: - prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0; - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_SAMPLE_ENA(1); - break; - case TGSI_INTERPOLATE_LOC_CENTER: - prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2; - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_CENTER_ENA(1); - break; - case TGSI_INTERPOLATE_LOC_CENTROID: - prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4; - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_CENTROID_ENA(1); - break; - default: - assert(0); - } - break; - case TGSI_INTERPOLATE_LINEAR: - /* Force the interpolation location for colors here. */ - if (shader->key.ps.prolog.force_linear_sample_interp) - location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.ps.prolog.force_linear_center_interp) - location = TGSI_INTERPOLATE_LOC_CENTER; - - switch (location) { - case TGSI_INTERPOLATE_LOC_SAMPLE: - prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6; - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_SAMPLE_ENA(1); - break; - case TGSI_INTERPOLATE_LOC_CENTER: - prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8; - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_CENTER_ENA(1); - break; - case TGSI_INTERPOLATE_LOC_CENTROID: - prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10; - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_CENTROID_ENA(1); - break; - default: - assert(0); - } - break; - default: - assert(0); - } - } - } + si_get_ps_prolog_key(shader, &prolog_key, true); /* The prolog is a no-op if these aren't set. */ - if (prolog_key.ps_prolog.colors_read || - prolog_key.ps_prolog.states.force_persp_sample_interp || - prolog_key.ps_prolog.states.force_linear_sample_interp || - prolog_key.ps_prolog.states.force_persp_center_interp || - prolog_key.ps_prolog.states.force_linear_center_interp || - prolog_key.ps_prolog.states.bc_optimize_for_persp || - prolog_key.ps_prolog.states.bc_optimize_for_linear || - prolog_key.ps_prolog.states.poly_stipple) { + if (si_need_ps_prolog(&prolog_key)) { shader->prolog = si_get_shader_part(sscreen, &sscreen->ps_prologs, + PIPE_SHADER_FRAGMENT, true, &prolog_key, tm, debug, - si_compile_ps_prolog); + si_build_ps_prolog_function, + "Fragment Shader Prolog"); if (!shader->prolog) return false; } /* Get the epilog. */ - memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.ps_epilog.colors_written = info->colors_written; - epilog_key.ps_epilog.writes_z = info->writes_z; - epilog_key.ps_epilog.writes_stencil = info->writes_stencil; - epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask; - epilog_key.ps_epilog.states = shader->key.ps.epilog; + si_get_ps_epilog_key(shader, &epilog_key); shader->epilog = si_get_shader_part(sscreen, &sscreen->ps_epilogs, + PIPE_SHADER_FRAGMENT, false, &epilog_key, tm, debug, - si_compile_ps_epilog); + si_build_ps_epilog_function, + "Fragment Shader Epilog"); if (!shader->epilog) return false; /* Enable POS_FIXED_PT if polygon stippling is enabled. */ - if (shader->key.ps.prolog.poly_stipple) { + if (shader->key.part.ps.prolog.poly_stipple) { shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); } /* Set up the enable bits for per-sample shading if needed. */ - if (shader->key.ps.prolog.force_persp_sample_interp && + if (shader->key.part.ps.prolog.force_persp_sample_interp && (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); } - if (shader->key.ps.prolog.force_linear_sample_interp && + if (shader->key.part.ps.prolog.force_linear_sample_interp && (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); } - if (shader->key.ps.prolog.force_persp_center_interp && + if (shader->key.part.ps.prolog.force_persp_center_interp && (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); } - if (shader->key.ps.prolog.force_linear_center_interp && + if (shader->key.part.ps.prolog.force_linear_center_interp && (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; @@ -7917,18 +8201,38 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, /* The sample mask input is always enabled, because the API shader always * passes it through to the epilog. Disable it here if it's unused. */ - if (!shader->key.ps.epilog.poly_line_smoothing && + if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask) shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; return true; } -static void si_fix_num_sgprs(struct si_shader *shader) +void si_multiwave_lds_size_workaround(struct si_screen *sscreen, + unsigned *lds_size) +{ + /* SPI barrier management bug: + * Make sure we have at least 4k of LDS in use to avoid the bug. + * It applies to workgroup sizes of more than one wavefront. + */ + if (sscreen->b.family == CHIP_BONAIRE || + sscreen->b.family == CHIP_KABINI || + sscreen->b.family == CHIP_MULLINS) + *lds_size = MAX2(*lds_size, 8); +} + +static void si_fix_resource_usage(struct si_screen *sscreen, + struct si_shader *shader) { unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); + + if (shader->selector->type == PIPE_SHADER_COMPUTE && + si_get_max_workgroup_size(shader) > 64) { + si_multiwave_lds_size_workaround(sscreen, + &shader->config.lds_size); + } } int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, @@ -7936,21 +8240,16 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, struct pipe_debug_callback *debug) { struct si_shader_selector *sel = shader->selector; - struct si_shader *mainp = sel->main_shader_part; + struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); int r; /* LS, ES, VS are compiled on demand if the main part hasn't been * compiled for that stage. + * + * Vertex shaders are compiled on demand when a vertex fetch + * workaround must be applied. */ - if (!mainp || - (sel->type == PIPE_SHADER_VERTEX && - (shader->key.vs.as_es != mainp->key.vs.as_es || - shader->key.vs.as_ls != mainp->key.vs.as_ls)) || - (sel->type == PIPE_SHADER_TESS_EVAL && - shader->key.tes.as_es != mainp->key.tes.as_es) || - (sel->type == PIPE_SHADER_TESS_CTRL && - shader->key.tcs.epilog.inputs_to_copy) || - sel->type == PIPE_SHADER_COMPUTE) { + if (shader->is_monolithic) { /* Monolithic shader (compiled as a whole, has many variants, * may take a long time to compile). */ @@ -7997,6 +8296,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, if (!si_shader_select_tes_parts(sscreen, tm, shader, debug)) return -1; break; + case PIPE_SHADER_GEOMETRY: + if (!si_shader_select_gs_parts(sscreen, tm, shader, debug)) + return -1; + break; case PIPE_SHADER_FRAGMENT: if (!si_shader_select_ps_parts(sscreen, tm, shader, debug)) return -1; @@ -8024,9 +8327,9 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, } } - si_fix_num_sgprs(shader); + si_fix_resource_usage(sscreen, shader); si_shader_dump(sscreen, shader, debug, sel->info.processor, - stderr); + stderr, true); /* Upload. */ r = si_shader_binary_upload(sscreen, shader); @@ -8040,11 +8343,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, void si_shader_destroy(struct si_shader *shader) { - if (shader->gs_copy_shader) { - si_shader_destroy(shader->gs_copy_shader); - FREE(shader->gs_copy_shader); - } - if (shader->scratch_bo) r600_resource_reference(&shader->scratch_bo, NULL);