X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_shader.c;h=e525a1807633ad46a5dc73ec0611b5270ec89dce;hb=7ef1e42c14fb23592e8e003f7a80db9a43cb9bc9;hp=e514d6158b0fb3095aff37ef316478a57c7aeee2;hpb=29adaa19acafb76ca6b1c2131be3cf1aef958409;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index e514d6158b0..c3b5f58cd26 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -19,11 +19,6 @@ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * Authors: - * Tom Stellard - * Michel Dänzer - * Christian König */ #include "gallivm/lp_bld_const.h" @@ -41,10 +36,12 @@ #include "ac_binary.h" #include "ac_llvm_util.h" +#include "ac_exp_param.h" #include "si_shader_internal.h" #include "si_pipe.h" #include "sid.h" +#include "compiler/nir/nir.h" static const char *scratch_rsrc_dword0_symbol = "SCRATCH_RSRC_DWORD0"; @@ -60,22 +57,35 @@ struct si_shader_output_values ubyte vertex_stream[4]; }; +/** + * Used to collect types and other info about arguments of the LLVM function + * before the function is created. 
+ */ +struct si_function_info { + LLVMTypeRef types[100]; + LLVMValueRef *assign[100]; + unsigned num_sgpr_params; + unsigned num_params; +}; + +enum si_arg_regfile { + ARG_SGPR, + ARG_VGPR +}; + static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, - struct si_shader *shader, LLVMTargetMachineRef tm); static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data); -static void si_dump_shader_key(unsigned shader, struct si_shader_key *key, +static void si_dump_shader_key(unsigned processor, const struct si_shader *shader, FILE *f); static void si_build_vs_prolog_function(struct si_shader_context *ctx, union si_shader_part_key *key); -static void si_build_vs_epilog_function(struct si_shader_context *ctx, - union si_shader_part_key *key); static void si_build_tcs_epilog_function(struct si_shader_context *ctx, union si_shader_part_key *key); static void si_build_ps_prolog_function(struct si_shader_context *ctx, @@ -83,49 +93,85 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, static void si_build_ps_epilog_function(struct si_shader_context *ctx, union si_shader_part_key *key); -/* Ideally pass the sample mask input to the PS epilog as v13, which +/* Ideally pass the sample mask input to the PS epilog as v14, which * is its usual location, so that the shader doesn't have to add v_mov. */ -#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13 - -/* The VS location of the PrimitiveID input is the same in the epilog, - * so that the main shader part doesn't have to move it. 
- */ -#define VS_EPILOG_PRIMID_LOC 2 +#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14 enum { CONST_ADDR_SPACE = 2, LOCAL_ADDR_SPACE = 3, }; +static bool is_merged_shader(struct si_shader *shader) +{ + if (shader->selector->screen->info.chip_class <= VI) + return false; + + return shader->key.as_ls || + shader->key.as_es || + shader->selector->type == PIPE_SHADER_TESS_CTRL || + shader->selector->type == PIPE_SHADER_GEOMETRY; +} + +static void si_init_function_info(struct si_function_info *fninfo) +{ + fninfo->num_params = 0; + fninfo->num_sgpr_params = 0; +} + +static unsigned add_arg_assign(struct si_function_info *fninfo, + enum si_arg_regfile regfile, LLVMTypeRef type, + LLVMValueRef *assign) +{ + assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params); + + unsigned idx = fninfo->num_params++; + assert(idx < ARRAY_SIZE(fninfo->types)); + + if (regfile == ARG_SGPR) + fninfo->num_sgpr_params = fninfo->num_params; + + fninfo->types[idx] = type; + fninfo->assign[idx] = assign; + return idx; +} + +static unsigned add_arg(struct si_function_info *fninfo, + enum si_arg_regfile regfile, LLVMTypeRef type) +{ + return add_arg_assign(fninfo, regfile, type, NULL); +} + +static void add_arg_assign_checked(struct si_function_info *fninfo, + enum si_arg_regfile regfile, LLVMTypeRef type, + LLVMValueRef *assign, unsigned idx) +{ + MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign); + assert(actual == idx); +} + +static void add_arg_checked(struct si_function_info *fninfo, + enum si_arg_regfile regfile, LLVMTypeRef type, + unsigned idx) +{ + add_arg_assign_checked(fninfo, regfile, type, NULL, idx); +} + /** - * Returns a unique index for a semantic name and index. The index must be - * less than 64, so that a 64-bit bitmask of used inputs or outputs can be - * calculated. + * Returns a unique index for a per-patch semantic name and index. 
The index + * must be less than 32, so that a 32-bit bitmask of used inputs or outputs + * can be calculated. */ -unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) +unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index) { switch (semantic_name) { - case TGSI_SEMANTIC_POSITION: - return 0; - case TGSI_SEMANTIC_PSIZE: - return 1; - case TGSI_SEMANTIC_CLIPDIST: - assert(index <= 1); - return 2 + index; - case TGSI_SEMANTIC_GENERIC: - if (index <= 63-4) - return 4 + index; - - assert(!"invalid generic index"); - return 0; - - /* patch indices are completely separate and thus start from 0 */ case TGSI_SEMANTIC_TESSOUTER: return 0; case TGSI_SEMANTIC_TESSINNER: return 1; case TGSI_SEMANTIC_PATCH: + assert(index < 30); return 2 + index; default: @@ -134,22 +180,48 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) } } -unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index) +/** + * Returns a unique index for a semantic name and index. The index must be + * less than 64, so that a 64-bit bitmask of used inputs or outputs can be + * calculated. + */ +unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) { - switch (name) { - case TGSI_SEMANTIC_FOG: + switch (semantic_name) { + case TGSI_SEMANTIC_POSITION: + return 0; + case TGSI_SEMANTIC_GENERIC: + /* Since some shader stages use the highest used IO index + * to determine the size to allocate for inputs/outputs + * (in LDS, tess and GS rings), GENERIC should be placed right + * after POSITION to make that size as small as possible. 
+ */ + if (index < SI_MAX_IO_GENERIC) + return 1 + index; + + assert(!"invalid generic index"); return 0; + case TGSI_SEMANTIC_PSIZE: + return SI_MAX_IO_GENERIC + 1; + case TGSI_SEMANTIC_CLIPDIST: + assert(index <= 1); + return SI_MAX_IO_GENERIC + 2 + index; + case TGSI_SEMANTIC_FOG: + return SI_MAX_IO_GENERIC + 4; case TGSI_SEMANTIC_LAYER: - return 1; + return SI_MAX_IO_GENERIC + 5; case TGSI_SEMANTIC_VIEWPORT_INDEX: - return 2; + return SI_MAX_IO_GENERIC + 6; case TGSI_SEMANTIC_PRIMID: - return 3; + return SI_MAX_IO_GENERIC + 7; case TGSI_SEMANTIC_COLOR: /* these alias */ case TGSI_SEMANTIC_BCOLOR: - return 4 + index; + assert(index < 2); + return SI_MAX_IO_GENERIC + 8 + index; case TGSI_SEMANTIC_TEXCOORD: - return 6 + index; + assert(index < 8); + assert(SI_MAX_IO_GENERIC + 10 + index < 64); + return SI_MAX_IO_GENERIC + 10 + index; default: assert(!"invalid semantic name"); return 0; @@ -163,21 +235,19 @@ static LLVMValueRef unpack_param(struct si_shader_context *ctx, unsigned param, unsigned rshift, unsigned bitwidth) { - struct gallivm_state *gallivm = &ctx->gallivm; LLVMValueRef value = LLVMGetParam(ctx->main_fn, param); if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) - value = bitcast(&ctx->bld_base, - TGSI_TYPE_UNSIGNED, value); + value = ac_to_integer(&ctx->ac, value); if (rshift) - value = LLVMBuildLShr(gallivm->builder, value, + value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->i32, rshift, 0), ""); if (rshift + bitwidth < 32) { unsigned mask = (1 << bitwidth) - 1; - value = LLVMBuildAnd(gallivm->builder, value, + value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->i32, mask, 0), ""); } @@ -188,7 +258,7 @@ static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) { switch (ctx->type) { case PIPE_SHADER_TESS_CTRL: - return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8); + return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8); case PIPE_SHADER_TESS_EVAL: return LLVMGetParam(ctx->main_fn, @@ -224,20 +294,38 @@ 
static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) static LLVMValueRef get_tcs_in_patch_stride(struct si_shader_context *ctx) { - if (ctx->type == PIPE_SHADER_VERTEX) - return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13); - else if (ctx->type == PIPE_SHADER_TESS_CTRL) - return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13); - else { - assert(0); - return NULL; - } + return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13); } -static LLVMValueRef -get_tcs_out_patch_stride(struct si_shader_context *ctx) +static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx) +{ + assert(ctx->type == PIPE_SHADER_TESS_CTRL); + + if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) + return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4; + + return util_last_bit64(ctx->shader->selector->outputs_written) * 4; +} + +static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx) +{ + unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx); + + return LLVMConstInt(ctx->i32, stride, 0); +} + +static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx) { - return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13); + if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) + return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13); + + const struct tgsi_shader_info *info = &ctx->shader->selector->info; + unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; + unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx); + unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written); + unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + + num_patch_outputs * 4; + return LLVMConstInt(ctx->i32, patch_dw_stride, 0); } static LLVMValueRef @@ -245,7 +333,7 @@ get_tcs_out_patch0_offset(struct si_shader_context *ctx) { return lp_build_mul_imm(&ctx->bld_base.uint_bld, unpack_param(ctx, - SI_PARAM_TCS_OUT_OFFSETS, + 
ctx->param_tcs_out_lds_offsets, 0, 16), 4); } @@ -255,7 +343,7 @@ get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) { return lp_build_mul_imm(&ctx->bld_base.uint_bld, unpack_param(ctx, - SI_PARAM_TCS_OUT_OFFSETS, + ctx->param_tcs_out_lds_offsets, 16, 16), 4); } @@ -263,23 +351,21 @@ get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) static LLVMValueRef get_tcs_in_current_patch_offset(struct si_shader_context *ctx) { - struct gallivm_state *gallivm = &ctx->gallivm; LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx); LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, ""); + return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, ""); } static LLVMValueRef get_tcs_out_current_patch_offset(struct si_shader_context *ctx) { - struct gallivm_state *gallivm = &ctx->gallivm; LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx); LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return LLVMBuildAdd(gallivm->builder, patch0_offset, - LLVMBuildMul(gallivm->builder, patch_stride, + return LLVMBuildAdd(ctx->ac.builder, patch0_offset, + LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, ""), ""); } @@ -287,33 +373,64 @@ get_tcs_out_current_patch_offset(struct si_shader_context *ctx) static LLVMValueRef get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) { - struct gallivm_state *gallivm = &ctx->gallivm; LLVMValueRef patch0_patch_data_offset = get_tcs_out_patch0_patch_data_offset(ctx); LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset, - LLVMBuildMul(gallivm->builder, patch_stride, + return LLVMBuildAdd(ctx->ac.builder, patch0_patch_data_offset, + LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, ""), ""); } +static LLVMValueRef 
get_num_tcs_out_vertices(struct si_shader_context *ctx) +{ + unsigned tcs_out_vertices = + ctx->shader->selector ? + ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0; + + /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */ + if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices) + return LLVMConstInt(ctx->i32, tcs_out_vertices, 0); + + return unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6); +} + +static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) +{ + unsigned stride; + + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + stride = util_last_bit64(ctx->shader->selector->outputs_written); + return LLVMConstInt(ctx->i32, stride * 4, 0); + + case PIPE_SHADER_TESS_CTRL: + if (ctx->screen->info.chip_class >= GFX9 && + ctx->shader->is_monolithic) { + stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written); + return LLVMConstInt(ctx->i32, stride * 4, 0); + } + return unpack_param(ctx, ctx->param_vs_state_bits, 24, 8); + + default: + assert(0); + return NULL; + } +} + static LLVMValueRef get_instance_index_for_fetch( struct si_shader_context *ctx, - unsigned param_start_instance, unsigned divisor) + unsigned param_start_instance, LLVMValueRef divisor) { - struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; - - LLVMValueRef result = LLVMGetParam(ctx->main_fn, - ctx->param_instance_id); + LLVMValueRef result = ctx->abi.instance_id; /* The division must be done before START_INSTANCE is added. 
*/ - if (divisor > 1) - result = LLVMBuildUDiv(gallivm->builder, result, - LLVMConstInt(ctx->i32, divisor, 0), ""); + if (divisor != ctx->i32_1) + result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, ""); - return LLVMBuildAdd(gallivm->builder, result, + return LLVMBuildAdd(ctx->ac.builder, result, LLVMGetParam(ctx->main_fn, param_start_instance), ""); } @@ -323,8 +440,8 @@ static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx, LLVMValueRef vec4, unsigned double_index) { - LLVMBuilderRef builder = ctx->gallivm.builder; - LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context); LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4, LLVMVectorType(f64, 2), ""); LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0); @@ -332,14 +449,96 @@ static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx, return LLVMBuildFPTrunc(builder, value, ctx->f32, ""); } -static void declare_input_vs( +static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, + LLVMValueRef i32, unsigned index) +{ + assert(index <= 1); + + if (index == 1) + return LLVMBuildAShr(ctx->ac.builder, i32, + LLVMConstInt(ctx->i32, 16, 0), ""); + + return LLVMBuildSExt(ctx->ac.builder, + LLVMBuildTrunc(ctx->ac.builder, i32, + ctx->ac.i16, ""), + ctx->i32, ""); +} + +void si_llvm_load_input_vs( struct si_shader_context *ctx, unsigned input_index, - const struct tgsi_full_declaration *decl, LLVMValueRef out[4]) { - struct lp_build_context *base = &ctx->bld_base.base; - struct gallivm_state *gallivm = base->gallivm; + unsigned vs_blit_property = + ctx->shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; + + if (vs_blit_property) { + LLVMValueRef vertex_id = ctx->abi.vertex_id; + LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder, + LLVMIntULE, vertex_id, + ctx->i32_1, ""); + /* Use LLVMIntNE, because we have 3 vertices and only + * the 
middle one should use y2. + */ + LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, + LLVMIntNE, vertex_id, + ctx->i32_1, ""); + + if (input_index == 0) { + /* Position: */ + LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, + ctx->param_vs_blit_inputs); + LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, + ctx->param_vs_blit_inputs + 1); + + LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); + LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); + LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); + LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); + + LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, + x1, x2, ""); + LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, + y1, y2, ""); + + out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, ""); + out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, ""); + out[2] = LLVMGetParam(ctx->main_fn, + ctx->param_vs_blit_inputs + 2); + out[3] = ctx->ac.f32_1; + return; + } + + /* Color or texture coordinates: */ + assert(input_index == 1); + + if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { + for (int i = 0; i < 4; i++) { + out[i] = LLVMGetParam(ctx->main_fn, + ctx->param_vs_blit_inputs + 3 + i); + } + } else { + assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); + LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, + ctx->param_vs_blit_inputs + 3); + LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, + ctx->param_vs_blit_inputs + 4); + LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, + ctx->param_vs_blit_inputs + 5); + LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, + ctx->param_vs_blit_inputs + 6); + + out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, + x1, x2, ""); + out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, + y1, y2, ""); + out[2] = LLVMGetParam(ctx->main_fn, + ctx->param_vs_blit_inputs + 7); + out[3] = LLVMGetParam(ctx->main_fn, + ctx->param_vs_blit_inputs + 8); + } + return; + } unsigned chan; unsigned fix_fetch; @@ -353,17 +552,17 @@ static void declare_input_vs( LLVMValueRef input[3]; /* Load the T list */ - 
t_list_ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_VERTEX_BUFFERS); + t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers); t_offset = LLVMConstInt(ctx->i32, input_index, 0); - t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset); + t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); vertex_index = LLVMGetParam(ctx->main_fn, ctx->param_vertex_index0 + input_index); - fix_fetch = ctx->shader->key.mono.vs.fix_fetch[input_index]; + fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index]; /* Do multiple loads for special formats. */ switch (fix_fetch) { @@ -401,7 +600,7 @@ static void declare_input_vs( /* Break up the vec4 into individual components */ for (chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0); - out[chan] = LLVMBuildExtractElement(gallivm->builder, + out[chan] = LLVMBuildExtractElement(ctx->ac.builder, input[0], llvm_chan, ""); } @@ -417,9 +616,9 @@ static void declare_input_vs( /* First, recover the sign-extended signed integer value. */ if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) - tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, ""); + tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, ""); else - tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, ""); + tmp = ac_to_integer(&ctx->ac, tmp); /* For the integer-like cases, do a natural sign extension. * @@ -427,20 +626,20 @@ static void declare_input_vs( * and happen to contain 0, 1, 2, 3 as the two LSBs of the * exponent. */ - tmp = LLVMBuildShl(gallivm->builder, tmp, + tmp = LLVMBuildShl(ctx->ac.builder, tmp, fix_fetch == SI_FIX_FETCH_A2_SNORM ? LLVMConstInt(ctx->i32, 7, 0) : c30, ""); - tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, ""); + tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); /* Convert back to the right type. 
*/ if (fix_fetch == SI_FIX_FETCH_A2_SNORM) { LLVMValueRef clamp; LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); - tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, ""); - clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, ""); - tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, ""); + tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); + clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); + tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) { - tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, ""); + tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); } out[3] = tmp; @@ -449,11 +648,10 @@ static void declare_input_vs( case SI_FIX_FETCH_RGBA_32_UNORM: case SI_FIX_FETCH_RGBX_32_UNORM: for (chan = 0; chan < 4; chan++) { - out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], - ctx->i32, ""); - out[chan] = LLVMBuildUIToFP(gallivm->builder, + out[chan] = ac_to_integer(&ctx->ac, out[chan]); + out[chan] = LLVMBuildUIToFP(ctx->ac.builder, out[chan], ctx->f32, ""); - out[chan] = LLVMBuildFMul(gallivm->builder, out[chan], + out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan], LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), ""); } /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */ @@ -471,11 +669,10 @@ static void declare_input_vs( scale = 1.0 / INT_MAX; for (chan = 0; chan < 4; chan++) { - out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], - ctx->i32, ""); - out[chan] = LLVMBuildSIToFP(gallivm->builder, + out[chan] = ac_to_integer(&ctx->ac, out[chan]); + out[chan] = LLVMBuildSIToFP(ctx->ac.builder, out[chan], ctx->f32, ""); - out[chan] = LLVMBuildFMul(gallivm->builder, out[chan], + out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan], LLVMConstReal(ctx->f32, scale), ""); } /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. 
*/ @@ -486,17 +683,15 @@ static void declare_input_vs( } case SI_FIX_FETCH_RGBA_32_USCALED: for (chan = 0; chan < 4; chan++) { - out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], - ctx->i32, ""); - out[chan] = LLVMBuildUIToFP(gallivm->builder, + out[chan] = ac_to_integer(&ctx->ac, out[chan]); + out[chan] = LLVMBuildUIToFP(ctx->ac.builder, out[chan], ctx->f32, ""); } break; case SI_FIX_FETCH_RGBA_32_SSCALED: for (chan = 0; chan < 4; chan++) { - out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], - ctx->i32, ""); - out[chan] = LLVMBuildSIToFP(gallivm->builder, + out[chan] = ac_to_integer(&ctx->ac, out[chan]); + out[chan] = LLVMBuildSIToFP(ctx->ac.builder, out[chan], ctx->f32, ""); } break; @@ -524,7 +719,7 @@ static void declare_input_vs( case SI_FIX_FETCH_RGB_16: case SI_FIX_FETCH_RGB_16_INT: for (chan = 0; chan < 3; chan++) { - out[chan] = LLVMBuildExtractElement(gallivm->builder, + out[chan] = LLVMBuildExtractElement(ctx->ac.builder, input[chan], ctx->i32_0, ""); } @@ -532,20 +727,26 @@ static void declare_input_vs( fix_fetch == SI_FIX_FETCH_RGB_16) { out[3] = LLVMConstReal(ctx->f32, 1); } else { - out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1, - ctx->f32, ""); + out[3] = ac_to_float(&ctx->ac, ctx->i32_1); } break; } } -static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base, - unsigned swizzle) +static void declare_input_vs( + struct si_shader_context *ctx, + unsigned input_index, + const struct tgsi_full_declaration *decl, + LLVMValueRef out[4]) { - struct si_shader_context *ctx = si_shader_context(bld_base); + si_llvm_load_input_vs(ctx, input_index, out); +} +static LLVMValueRef get_primitive_id(struct si_shader_context *ctx, + unsigned swizzle) +{ if (swizzle > 0) - return bld_base->uint_bld.zero; + return ctx->i32_0; switch (ctx->type) { case PIPE_SHADER_VERTEX: @@ -553,16 +754,15 @@ static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base, ctx->param_vs_prim_id); case PIPE_SHADER_TESS_CTRL: return 
LLVMGetParam(ctx->main_fn, - SI_PARAM_PATCH_ID); + ctx->param_tcs_patch_id); case PIPE_SHADER_TESS_EVAL: return LLVMGetParam(ctx->main_fn, ctx->param_tes_patch_id); case PIPE_SHADER_GEOMETRY: - return LLVMGetParam(ctx->main_fn, - SI_PARAM_PRIMITIVE_ID); + return ctx->abi.gs_prim_id; default: assert(0); - return bld_base->uint_bld.zero; + return ctx->i32_0; } } @@ -570,36 +770,49 @@ static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base, * Return the value of tgsi_ind_register for indexing. * This is the indirect index with the constant offset added to it. */ -static LLVMValueRef get_indirect_index(struct si_shader_context *ctx, - const struct tgsi_ind_register *ind, - int rel_index) +LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx, + const struct tgsi_ind_register *ind, + unsigned addr_mul, + int rel_index) { - struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; LLVMValueRef result; - result = ctx->addrs[ind->Index][ind->Swizzle]; - result = LLVMBuildLoad(gallivm->builder, result, ""); - result = LLVMBuildAdd(gallivm->builder, result, + if (ind->File == TGSI_FILE_ADDRESS) { + result = ctx->addrs[ind->Index][ind->Swizzle]; + result = LLVMBuildLoad(ctx->ac.builder, result, ""); + } else { + struct tgsi_full_src_register src = {}; + + src.Register.File = ind->File; + src.Register.Index = ind->Index; + + /* Set the second index to 0 for constants. 
*/ + if (ind->File == TGSI_FILE_CONSTANT) + src.Register.Dimension = 1; + + result = ctx->bld_base.emit_fetch_funcs[ind->File](&ctx->bld_base, &src, + TGSI_TYPE_SIGNED, + ind->Swizzle); + result = ac_to_integer(&ctx->ac, result); + } + + if (addr_mul != 1) + result = LLVMBuildMul(ctx->ac.builder, result, + LLVMConstInt(ctx->i32, addr_mul, 0), ""); + result = LLVMBuildAdd(ctx->ac.builder, result, LLVMConstInt(ctx->i32, rel_index, 0), ""); return result; } /** - * Like get_indirect_index, but restricts the return value to a (possibly + * Like si_get_indirect_index, but restricts the return value to a (possibly * undefined) value inside [0..num). */ -static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx, - const struct tgsi_ind_register *ind, - int rel_index, unsigned num) +LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx, + const struct tgsi_ind_register *ind, + int rel_index, unsigned num) { - LLVMValueRef result = get_indirect_index(ctx, ind, rel_index); - - /* LLVM 3.8: If indirect resource indexing is used: - * - SI & CIK hang - * - VI crashes - */ - if (HAVE_LLVM == 0x0308) - return LLVMGetUndef(ctx->i32); + LLVMValueRef result = si_get_indirect_index(ctx, ind, 1, rel_index); return si_llvm_bound_index(ctx, result, num); } @@ -614,7 +827,6 @@ static LLVMValueRef get_dw_address(struct si_shader_context *ctx, LLVMValueRef vertex_dw_stride, LLVMValueRef base_addr) { - struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; struct tgsi_shader_info *info = &ctx->shader->selector->info; ubyte *name, *index, *array_first; int first, param; @@ -639,13 +851,13 @@ static LLVMValueRef get_dw_address(struct si_shader_context *ctx, LLVMValueRef index; if (reg.Dimension.Indirect) - index = get_indirect_index(ctx, ®.DimIndirect, - reg.Dimension.Index); + index = si_get_indirect_index(ctx, ®.DimIndirect, + 1, reg.Dimension.Index); else index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0); - base_addr = 
LLVMBuildAdd(gallivm->builder, base_addr, - LLVMBuildMul(gallivm->builder, index, + base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, + LLVMBuildMul(ctx->ac.builder, index, vertex_dw_stride, ""), ""); } @@ -672,21 +884,26 @@ static LLVMValueRef get_dw_address(struct si_shader_context *ctx, else first = reg.Register.Index; - ind_index = get_indirect_index(ctx, ®.Indirect, - reg.Register.Index - first); + ind_index = si_get_indirect_index(ctx, ®.Indirect, + 1, reg.Register.Index - first); - base_addr = LLVMBuildAdd(gallivm->builder, base_addr, - LLVMBuildMul(gallivm->builder, ind_index, + base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, + LLVMBuildMul(ctx->ac.builder, ind_index, LLVMConstInt(ctx->i32, 4, 0), ""), ""); - param = si_shader_io_get_unique_index(name[first], index[first]); + param = reg.Register.Dimension ? + si_shader_io_get_unique_index(name[first], index[first]) : + si_shader_io_get_unique_index_patch(name[first], index[first]); } else { - param = si_shader_io_get_unique_index(name[reg.Register.Index], - index[reg.Register.Index]); + param = reg.Register.Dimension ? + si_shader_io_get_unique_index(name[reg.Register.Index], + index[reg.Register.Index]) : + si_shader_io_get_unique_index_patch(name[reg.Register.Index], + index[reg.Register.Index]); } /* Add the base address of the element. 
*/ - return LLVMBuildAdd(gallivm->builder, base_addr, + return LLVMBuildAdd(ctx->ac.builder, base_addr, LLVMConstInt(ctx->i32, param * 4, 0), ""); } @@ -713,21 +930,20 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, LLVMValueRef vertex_index, LLVMValueRef param_index) { - struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; LLVMValueRef param_stride, constant16; - vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6); - num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9); - total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch, + vertices_per_patch = get_num_tcs_out_vertices(ctx); + num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6); + total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, num_patches, ""); constant16 = LLVMConstInt(ctx->i32, 16, 0); if (vertex_index) { - base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id, + base_addr = LLVMBuildMul(ctx->ac.builder, rel_patch_id, vertices_per_patch, ""); - base_addr = LLVMBuildAdd(gallivm->builder, base_addr, + base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, vertex_index, ""); param_stride = total_vertices; @@ -736,17 +952,17 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, param_stride = num_patches; } - base_addr = LLVMBuildAdd(gallivm->builder, base_addr, - LLVMBuildMul(gallivm->builder, param_index, + base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, + LLVMBuildMul(ctx->ac.builder, param_index, param_stride, ""), ""); - base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, ""); + base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); if (!vertex_index) { LLVMValueRef patch_data_offset = - unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16); + unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20); - base_addr = LLVMBuildAdd(gallivm->builder, 
base_addr, + base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, patch_data_offset, ""); } return base_addr; @@ -757,7 +973,6 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg( const struct tgsi_full_dst_register *dst, const struct tgsi_full_src_register *src) { - struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; struct tgsi_shader_info *info = &ctx->shader->selector->info; ubyte *name, *index, *array_first; struct tgsi_full_src_register reg; @@ -770,8 +985,8 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg( if (reg.Register.Dimension) { if (reg.Dimension.Indirect) - vertex_index = get_indirect_index(ctx, ®.DimIndirect, - reg.Dimension.Index); + vertex_index = si_get_indirect_index(ctx, ®.DimIndirect, + 1, reg.Dimension.Index); else vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0); } @@ -796,18 +1011,19 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg( else param_base = reg.Register.Index; - param_index = get_indirect_index(ctx, ®.Indirect, - reg.Register.Index - param_base); + param_index = si_get_indirect_index(ctx, ®.Indirect, + 1, reg.Register.Index - param_base); } else { param_base = reg.Register.Index; - param_index = LLVMConstInt(ctx->i32, 0, 0); + param_index = ctx->i32_0; } - param_index_base = si_shader_io_get_unique_index(name[param_base], - index[param_base]); + param_index_base = reg.Register.Dimension ? 
+ si_shader_io_get_unique_index(name[param_base], index[param_base]) : + si_shader_io_get_unique_index_patch(name[param_base], index[param_base]); - param_index = LLVMBuildAdd(gallivm->builder, param_index, + param_index = LLVMBuildAdd(ctx->ac.builder, param_index, LLVMConstInt(ctx->i32, param_index_base, 0), ""); @@ -818,37 +1034,37 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg( static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base, enum tgsi_opcode_type type, unsigned swizzle, LLVMValueRef buffer, LLVMValueRef offset, - LLVMValueRef base, bool readonly_memory) + LLVMValueRef base, bool can_speculate) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef value, value2; LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type); LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4); if (swizzle == ~0) { value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, - 0, 1, 0, readonly_memory); + 0, 1, 0, can_speculate, false); - return LLVMBuildBitCast(gallivm->builder, value, vec_type, ""); + return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); } if (!tgsi_type_is_64bit(type)) { value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, - 0, 1, 0, readonly_memory); + 0, 1, 0, can_speculate, false); - value = LLVMBuildBitCast(gallivm->builder, value, vec_type, ""); - return LLVMBuildExtractElement(gallivm->builder, value, + value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); + return LLVMBuildExtractElement(ctx->ac.builder, value, LLVMConstInt(ctx->i32, swizzle, 0), ""); } value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, - swizzle * 4, 1, 0, readonly_memory); + swizzle * 4, 1, 0, can_speculate, false); value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, - swizzle * 4 + 4, 1, 0, readonly_memory); + swizzle * 4 + 4, 1, 0, can_speculate, false); - return si_llvm_emit_fetch_64bit(bld_base, 
type, value, value2); + return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), + value, value2); } /** @@ -863,7 +1079,6 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, LLVMValueRef dw_addr) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef value; if (swizzle == ~0) { @@ -872,24 +1087,26 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) values[chan] = lds_load(bld_base, type, chan, dw_addr); - return lp_build_gather_values(bld_base->base.gallivm, values, + return lp_build_gather_values(&ctx->gallivm, values, TGSI_NUM_CHANNELS); } + /* Split 64-bit loads. */ + if (tgsi_type_is_64bit(type)) { + LLVMValueRef lo, hi; + + lo = lds_load(bld_base, TGSI_TYPE_UNSIGNED, swizzle, dw_addr); + hi = lds_load(bld_base, TGSI_TYPE_UNSIGNED, swizzle + 1, dw_addr); + return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), + lo, hi); + } + dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, LLVMConstInt(ctx->i32, swizzle, 0)); - value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false); - if (tgsi_type_is_64bit(type)) { - LLVMValueRef value2; - dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, - LLVMConstInt(ctx->i32, 1, 0)); - value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false); - return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); - } + value = ac_lds_load(&ctx->ac, dw_addr); - return LLVMBuildBitCast(gallivm->builder, value, - tgsi2llvmtype(bld_base, type), ""); + return bitcast(bld_base, type, value); } /** @@ -899,19 +1116,38 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, * \param dw_addr address in dwords * \param value value to store */ -static void lds_store(struct lp_build_tgsi_context *bld_base, - unsigned swizzle, LLVMValueRef dw_addr, +static void lds_store(struct si_shader_context *ctx, + 
unsigned dw_offset_imm, LLVMValueRef dw_addr, LLVMValueRef value) { - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; + dw_addr = lp_build_add(&ctx->bld_base.uint_bld, dw_addr, + LLVMConstInt(ctx->i32, dw_offset_imm, 0)); - dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, - LLVMConstInt(ctx->i32, swizzle, 0)); + ac_lds_store(&ctx->ac, dw_addr, value); +} + +static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx, + unsigned param) +{ + LLVMBuilderRef builder = ctx->ac.builder; + + LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param); + addr = LLVMBuildZExt(builder, addr, ctx->i64, ""); + addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), ""); - value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, ""); - ac_build_indexed_store(&ctx->ac, ctx->lds, - dw_addr, value); + uint64_t desc2 = 0xffffffff; + uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0); + + LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2)); + desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, ""); + desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, ""); + return LLVMBuildBitCast(builder, desc, ctx->v4i32, ""); } static LLVMValueRef fetch_input_tcs( @@ -922,7 +1158,7 @@ static LLVMValueRef fetch_input_tcs( struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef dw_addr, stride; - stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8); + stride = get_tcs_in_vertex_dw_stride(ctx); dw_addr = get_tcs_in_current_patch_offset(ctx); dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); @@ -938,7 +1174,7 @@ static LLVMValueRef 
fetch_output_tcs( LLVMValueRef dw_addr, stride; if (reg->Register.Dimension) { - stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); + stride = get_tcs_out_vertex_dw_stride(ctx); dw_addr = get_tcs_out_current_patch_offset(ctx); dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); } else { @@ -955,14 +1191,11 @@ static LLVMValueRef fetch_input_tes( enum tgsi_opcode_type type, unsigned swizzle) { struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef rw_buffers, buffer, base, addr; + LLVMValueRef buffer, base, addr; - rw_buffers = LLVMGetParam(ctx->main_fn, - SI_PARAM_RW_BUFFERS); - buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, - LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0)); + buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); - base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); + base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg); return buffer_load(bld_base, type, swizzle, buffer, base, addr, true); @@ -971,30 +1204,30 @@ static LLVMValueRef fetch_input_tes( static void store_output_tcs(struct lp_build_tgsi_context *bld_base, const struct tgsi_full_instruction *inst, const struct tgsi_opcode_info *info, + unsigned index, LLVMValueRef dst[4]) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - const struct tgsi_full_dst_register *reg = &inst->Dst[0]; + const struct tgsi_full_dst_register *reg = &inst->Dst[index]; const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info; unsigned chan_index; LLVMValueRef dw_addr, stride; - LLVMValueRef rw_buffers, buffer, base, buf_addr; + LLVMValueRef buffer, base, buf_addr; LLVMValueRef values[4]; bool skip_lds_store; - bool is_tess_factor = false; + bool is_tess_factor = false, is_tess_inner = false; /* Only handle per-patch and per-vertex outputs here. 
* Vectors will be lowered to scalars and this function will be called again. */ if (reg->Register.File != TGSI_FILE_OUTPUT || (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) { - si_llvm_emit_store(bld_base, inst, info, dst); + si_llvm_emit_store(bld_base, inst, info, index, dst); return; } if (reg->Register.Dimension) { - stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); + stride = get_tcs_out_vertex_dw_stride(ctx); dw_addr = get_tcs_out_current_patch_offset(ctx); dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr); skip_lds_store = !sh_info->reads_pervertex_outputs; @@ -1009,22 +1242,23 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base, /* Always write tess factors into LDS for the TCS epilog. */ if (name == TGSI_SEMANTIC_TESSINNER || name == TGSI_SEMANTIC_TESSOUTER) { - skip_lds_store = false; + /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ + skip_lds_store = !sh_info->reads_tessfactor_outputs && + ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs; is_tess_factor = true; + is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; } } } - rw_buffers = LLVMGetParam(ctx->main_fn, - SI_PARAM_RW_BUFFERS); - buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, - LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0)); + buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); - base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); + base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL); - - TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) { + uint32_t writemask = reg->Register.WriteMask; + while (writemask) { + chan_index = u_bit_scan(&writemask); LLVMValueRef value = dst[chan_index]; if (inst->Instruction.Saturate) @@ -1032,20 +1266,32 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base, /* Skip LDS stores if there is no LDS read of this output. 
*/ if (!skip_lds_store) - lds_store(bld_base, chan_index, dw_addr, value); + lds_store(ctx, chan_index, dw_addr, value); - value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, ""); + value = ac_to_integer(&ctx->ac, value); values[chan_index] = value; - if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) { + if (reg->Register.WriteMask != 0xF && !is_tess_factor) { ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, buf_addr, base, 4 * chan_index, 1, 0, true, false); } + + /* Write tess factors into VGPRs for the epilog. */ + if (is_tess_factor && + ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { + if (!is_tess_inner) { + LLVMBuildStore(ctx->ac.builder, value, /* outer */ + ctx->invoc0_tess_factors[chan_index]); + } else if (chan_index < 2) { + LLVMBuildStore(ctx->ac.builder, value, /* inner */ + ctx->invoc0_tess_factors[4 + chan_index]); + } + } } - if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) { - LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm, + if (reg->Register.WriteMask == 0xF && !is_tess_factor) { + LLVMValueRef value = lp_build_gather_values(&ctx->gallivm, values, 4); ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr, base, 0, 1, 0, true, false); @@ -1058,13 +1304,10 @@ static LLVMValueRef fetch_input_gs( enum tgsi_opcode_type type, unsigned swizzle) { - struct lp_build_context *base = &bld_base->base; struct si_shader_context *ctx = si_shader_context(bld_base); struct si_shader *shader = ctx->shader; struct lp_build_context *uint = &ctx->bld_base.uint_bld; - struct gallivm_state *gallivm = base->gallivm; LLVMValueRef vtx_offset, soffset; - unsigned vtx_offset_param; struct tgsi_shader_info *info = &shader->selector->info; unsigned semantic_name = info->input_semantic_name[reg->Register.Index]; unsigned semantic_index = info->input_semantic_index[reg->Register.Index]; @@ -1072,52 +1315,72 @@ static LLVMValueRef fetch_input_gs( LLVMValueRef value; if (swizzle != ~0 
&& semantic_name == TGSI_SEMANTIC_PRIMID) - return get_primitive_id(bld_base, swizzle); + return get_primitive_id(ctx, swizzle); if (!reg->Register.Dimension) return NULL; + param = si_shader_io_get_unique_index(semantic_name, semantic_index); + + /* GFX9 has the ESGS ring in LDS. */ + if (ctx->screen->info.chip_class >= GFX9) { + unsigned index = reg->Dimension.Index; + + switch (index / 2) { + case 0: + vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset, + index % 2 ? 16 : 0, 16); + break; + case 1: + vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset, + index % 2 ? 16 : 0, 16); + break; + case 2: + vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset, + index % 2 ? 16 : 0, 16); + break; + default: + assert(0); + return NULL; + } + + vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset, + LLVMConstInt(ctx->i32, param * 4, 0), ""); + return lds_load(bld_base, type, swizzle, vtx_offset); + } + + /* GFX6: input load from the ESGS ring in memory. */ if (swizzle == ~0) { LLVMValueRef values[TGSI_NUM_CHANNELS]; unsigned chan; for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { values[chan] = fetch_input_gs(bld_base, reg, type, chan); } - return lp_build_gather_values(bld_base->base.gallivm, values, + return lp_build_gather_values(&ctx->gallivm, values, TGSI_NUM_CHANNELS); } - /* Get the vertex offset parameter */ - vtx_offset_param = reg->Dimension.Index; - if (vtx_offset_param < 2) { - vtx_offset_param += SI_PARAM_VTX0_OFFSET; - } else { - assert(vtx_offset_param < 6); - vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2; - } - vtx_offset = lp_build_mul_imm(uint, - LLVMGetParam(ctx->main_fn, - vtx_offset_param), - 4); + /* Get the vertex offset parameter on GFX6. 
*/ + unsigned vtx_offset_param = reg->Dimension.Index; + LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param]; + + vtx_offset = lp_build_mul_imm(uint, gs_vtx_offset, 4); - param = si_shader_io_get_unique_index(semantic_name, semantic_index); soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0); - value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, uint->zero, - vtx_offset, soffset, 0, 1, 0, true); + value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0, + vtx_offset, soffset, 0, 1, 0, true, false); if (tgsi_type_is_64bit(type)) { LLVMValueRef value2; soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0); value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, - uint->zero, vtx_offset, soffset, - 0, 1, 0, true); - return si_llvm_emit_fetch_64bit(bld_base, type, + ctx->i32_0, vtx_offset, soffset, + 0, 1, 0, true, false); + return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), value, value2); } - return LLVMBuildBitCast(gallivm->builder, - value, - tgsi2llvmtype(bld_base, type), ""); + return bitcast(bld_base, type, value); } static int lookup_interp_param_index(unsigned interpolate, unsigned location) @@ -1149,6 +1412,24 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location) } } +static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, + unsigned attr_index, unsigned chan, + LLVMValueRef prim_mask, + LLVMValueRef i, LLVMValueRef j) +{ + if (i || j) { + return ac_build_fs_interp(&ctx->ac, + LLVMConstInt(ctx->i32, chan, 0), + LLVMConstInt(ctx->i32, attr_index, 0), + prim_mask, i, j); + } + return ac_build_fs_interp_mov(&ctx->ac, + LLVMConstInt(ctx->i32, 2, 0), /* P0 */ + LLVMConstInt(ctx->i32, chan, 0), + LLVMConstInt(ctx->i32, attr_index, 0), + prim_mask); +} + /** * Interpolate a fragment shader input. 
* @@ -1174,13 +1455,7 @@ static void interp_fs_input(struct si_shader_context *ctx, LLVMValueRef face, LLVMValueRef result[4]) { - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - struct lp_build_context *base = &bld_base->base; - struct lp_build_context *uint = &bld_base->uint_bld; - struct gallivm_state *gallivm = base->gallivm; - LLVMValueRef attr_number; - LLVMValueRef i, j; - + LLVMValueRef i = NULL, j = NULL; unsigned chan; /* fs.constant returns the param from the middle vertex, so it's not @@ -1198,22 +1473,19 @@ static void interp_fs_input(struct si_shader_context *ctx, */ bool interp = interp_param != NULL; - attr_number = LLVMConstInt(ctx->i32, input_index, 0); - if (interp) { - interp_param = LLVMBuildBitCast(gallivm->builder, interp_param, + interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, LLVMVectorType(ctx->f32, 2), ""); - i = LLVMBuildExtractElement(gallivm->builder, interp_param, - uint->zero, ""); - j = LLVMBuildExtractElement(gallivm->builder, interp_param, - uint->one, ""); + i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, + ctx->i32_0, ""); + j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, + ctx->i32_1, ""); } if (semantic_name == TGSI_SEMANTIC_COLOR && ctx->shader->key.part.ps.prolog.color_two_side) { LLVMValueRef is_face_positive; - LLVMValueRef back_attr_number; /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", * otherwise it's at offset "num_inputs". 
@@ -1222,84 +1494,62 @@ static void interp_fs_input(struct si_shader_context *ctx, if (semantic_index == 1 && colors_read_mask & 0xf) back_attr_offset += 1; - back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0); - - is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE, - face, uint->zero, ""); + is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, + face, ctx->i32_0, ""); for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0); LLVMValueRef front, back; - if (interp) { - front = ac_build_fs_interp(&ctx->ac, llvm_chan, - attr_number, prim_mask, - i, j); - back = ac_build_fs_interp(&ctx->ac, llvm_chan, - back_attr_number, prim_mask, - i, j); - } else { - front = ac_build_fs_interp_mov(&ctx->ac, - LLVMConstInt(ctx->i32, 2, 0), /* P0 */ - llvm_chan, attr_number, prim_mask); - back = ac_build_fs_interp_mov(&ctx->ac, - LLVMConstInt(ctx->i32, 2, 0), /* P0 */ - llvm_chan, back_attr_number, prim_mask); - } + front = si_build_fs_interp(ctx, + input_index, chan, + prim_mask, i, j); + back = si_build_fs_interp(ctx, + back_attr_offset, chan, + prim_mask, i, j); - result[chan] = LLVMBuildSelect(gallivm->builder, + result[chan] = LLVMBuildSelect(ctx->ac.builder, is_face_positive, front, back, ""); } } else if (semantic_name == TGSI_SEMANTIC_FOG) { - if (interp) { - result[0] = ac_build_fs_interp(&ctx->ac, uint->zero, - attr_number, prim_mask, i, j); - } else { - result[0] = ac_build_fs_interp_mov(&ctx->ac, uint->zero, - LLVMConstInt(ctx->i32, 2, 0), /* P0 */ - attr_number, prim_mask); - } + result[0] = si_build_fs_interp(ctx, input_index, + 0, prim_mask, i, j); result[1] = result[2] = LLVMConstReal(ctx->f32, 0.0f); result[3] = LLVMConstReal(ctx->f32, 1.0f); } else { for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0); - - if (interp) { - result[chan] = ac_build_fs_interp(&ctx->ac, - llvm_chan, attr_number, prim_mask, i, j); - } else { - 
result[chan] = ac_build_fs_interp_mov(&ctx->ac, - LLVMConstInt(ctx->i32, 2, 0), /* P0 */ - llvm_chan, attr_number, prim_mask); - } + result[chan] = si_build_fs_interp(ctx, + input_index, chan, + prim_mask, i, j); } } } -static void declare_input_fs( +void si_llvm_load_input_fs( struct si_shader_context *ctx, unsigned input_index, - const struct tgsi_full_declaration *decl, LLVMValueRef out[4]) { struct lp_build_context *base = &ctx->bld_base.base; struct si_shader *shader = ctx->shader; + struct tgsi_shader_info *info = &shader->selector->info; LLVMValueRef main_fn = ctx->main_fn; LLVMValueRef interp_param = NULL; int interp_param_idx; + enum tgsi_semantic semantic_name = info->input_semantic_name[input_index]; + unsigned semantic_index = info->input_semantic_index[input_index]; + enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index]; + enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index]; /* Get colors from input VGPRs (set by the prolog). */ - if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) { - unsigned i = decl->Semantic.Index; + if (semantic_name == TGSI_SEMANTIC_COLOR) { unsigned colors_read = shader->selector->info.colors_read; - unsigned mask = colors_read >> (i * 4); + unsigned mask = colors_read >> (semantic_index * 4); unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + - (i ? util_bitcount(colors_read & 0xf) : 0); + (semantic_index ? util_bitcount(colors_read & 0xf) : 0); out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef; out[1] = mask & 0x2 ? 
LLVMGetParam(main_fn, offset++) : base->undef; @@ -1308,27 +1558,30 @@ static void declare_input_fs( return; } - interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate, - decl->Interp.Location); + interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc); if (interp_param_idx == -1) return; else if (interp_param_idx) { interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); } - if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR && - decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR && - ctx->shader->key.part.ps.prolog.flatshade_colors) - interp_param = NULL; /* load the constant color */ - - interp_fs_input(ctx, input_index, decl->Semantic.Name, - decl->Semantic.Index, shader->selector->info.num_inputs, + interp_fs_input(ctx, input_index, semantic_name, + semantic_index, 0, /* this param is unused */ shader->selector->info.colors_read, interp_param, LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK), LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE), &out[0]); } +static void declare_input_fs( + struct si_shader_context *ctx, + unsigned input_index, + const struct tgsi_full_declaration *decl, + LLVMValueRef out[4]) +{ + si_llvm_load_input_fs(ctx, input_index, out); +} + static LLVMValueRef get_sample_id(struct si_shader_context *ctx) { return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4); @@ -1342,26 +1595,20 @@ static LLVMValueRef buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource, LLVMValueRef offset) { - LLVMBuilderRef builder = ctx->gallivm.builder; - LLVMValueRef args[2] = {resource, offset}; - - return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2, - LP_FUNC_ATTR_READNONE | - LP_FUNC_ATTR_LEGACY); + return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, + 0, 0, 0, true, true); } static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id) { struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld; - struct gallivm_state *gallivm = &ctx->gallivm; - 
LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef desc = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); + LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0); - LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index); + LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index); /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8); - LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), ""); + LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), ""); LLVMValueRef pos[4] = { buffer_load_const(ctx, resource, offset0), @@ -1370,57 +1617,65 @@ static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValu LLVMConstReal(ctx->f32, 0) }; - return lp_build_gather_values(gallivm, pos, 4); + return lp_build_gather_values(&ctx->gallivm, pos, 4); } -static void declare_system_value(struct si_shader_context *ctx, - unsigned index, - const struct tgsi_full_declaration *decl) +void si_load_system_value(struct si_shader_context *ctx, + unsigned index, + const struct tgsi_full_declaration *decl) { struct lp_build_context *bld = &ctx->bld_base.base; - struct gallivm_state *gallivm = &ctx->gallivm; LLVMValueRef value = 0; + assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES); + switch (decl->Semantic.Name) { case TGSI_SEMANTIC_INSTANCEID: - value = LLVMGetParam(ctx->main_fn, - ctx->param_instance_id); + value = ctx->abi.instance_id; break; case TGSI_SEMANTIC_VERTEXID: - value = LLVMBuildAdd(gallivm->builder, - LLVMGetParam(ctx->main_fn, - ctx->param_vertex_id), - LLVMGetParam(ctx->main_fn, - SI_PARAM_BASE_VERTEX), ""); + value = LLVMBuildAdd(ctx->ac.builder, + ctx->abi.vertex_id, + ctx->abi.base_vertex, ""); break; case TGSI_SEMANTIC_VERTEXID_NOBASE: - value = 
LLVMGetParam(ctx->main_fn, - ctx->param_vertex_id); + /* Unused. Clarify the meaning in indexed vs. non-indexed + * draws if this is ever used again. */ + assert(false); break; case TGSI_SEMANTIC_BASEVERTEX: - value = LLVMGetParam(ctx->main_fn, - SI_PARAM_BASE_VERTEX); + { + /* For non-indexed draws, the base vertex set by the driver + * (for direct draws) or the CP (for indirect draws) is the + * first vertex ID, but GLSL expects 0 to be returned. + */ + LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits); + LLVMValueRef indexed; + + indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, ""); + indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, ""); + + value = LLVMBuildSelect(ctx->ac.builder, indexed, + ctx->abi.base_vertex, ctx->i32_0, ""); break; + } case TGSI_SEMANTIC_BASEINSTANCE: - value = LLVMGetParam(ctx->main_fn, - SI_PARAM_START_INSTANCE); + value = ctx->abi.start_instance; break; case TGSI_SEMANTIC_DRAWID: - value = LLVMGetParam(ctx->main_fn, - SI_PARAM_DRAWID); + value = ctx->abi.draw_id; break; case TGSI_SEMANTIC_INVOCATIONID: if (ctx->type == PIPE_SHADER_TESS_CTRL) - value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); + value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5); else if (ctx->type == PIPE_SHADER_GEOMETRY) - value = LLVMGetParam(ctx->main_fn, - SI_PARAM_GS_INSTANCE_ID); + value = ctx->abi.gs_invocation_id; else assert(!"INVOCATIONID not implemented"); break; @@ -1435,12 +1690,12 @@ static void declare_system_value(struct si_shader_context *ctx, LLVMGetParam(ctx->main_fn, SI_PARAM_POS_W_FLOAT)), }; - value = lp_build_gather_values(gallivm, pos, 4); + value = lp_build_gather_values(&ctx->gallivm, pos, 4); break; } case TGSI_SEMANTIC_FACE: - value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE); + value = ctx->abi.front_face; break; case TGSI_SEMANTIC_SAMPLEID: @@ -1458,7 +1713,7 @@ static void declare_system_value(struct si_shader_context *ctx, TGSI_OPCODE_FRC, pos[0]); pos[1] = 
lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_FRC, pos[1]); - value = lp_build_gather_values(gallivm, pos, 4); + value = lp_build_gather_values(&ctx->gallivm, pos, 4); break; } @@ -1474,25 +1729,25 @@ static void declare_system_value(struct si_shader_context *ctx, LLVMValueRef coord[4] = { LLVMGetParam(ctx->main_fn, ctx->param_tes_u), LLVMGetParam(ctx->main_fn, ctx->param_tes_v), - bld->zero, - bld->zero + ctx->ac.f32_0, + ctx->ac.f32_0 }; /* For triangles, the vector should be (u, v, 1-u-v). */ if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_TRIANGLES) - coord[2] = lp_build_sub(bld, bld->one, + coord[2] = lp_build_sub(bld, ctx->ac.f32_1, lp_build_add(bld, coord[0], coord[1])); - value = lp_build_gather_values(gallivm, coord, 4); + value = lp_build_gather_values(&ctx->gallivm, coord, 4); break; } case TGSI_SEMANTIC_VERTICESIN: if (ctx->type == PIPE_SHADER_TESS_CTRL) - value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6); + value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6); else if (ctx->type == PIPE_SHADER_TESS_EVAL) - value = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 7); + value = get_num_tcs_out_vertices(ctx); else assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN"); break; @@ -1500,15 +1755,12 @@ static void declare_system_value(struct si_shader_context *ctx, case TGSI_SEMANTIC_TESSINNER: case TGSI_SEMANTIC_TESSOUTER: { - LLVMValueRef rw_buffers, buffer, base, addr; - int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0); + LLVMValueRef buffer, base, addr; + int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0); - rw_buffers = LLVMGetParam(ctx->main_fn, - SI_PARAM_RW_BUFFERS); - buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, - LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0)); + buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); - base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); + base = 
LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL, LLVMConstInt(ctx->i32, param, 0)); @@ -1525,23 +1777,23 @@ static void declare_system_value(struct si_shader_context *ctx, int i, offset; slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0); - buf = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); - buf = ac_build_indexed_load_const(&ctx->ac, buf, slot); + buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); + buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot); offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0; for (i = 0; i < 4; i++) val[i] = buffer_load_const(ctx, buf, LLVMConstInt(ctx->i32, (offset + i) * 4, 0)); - value = lp_build_gather_values(gallivm, val, 4); + value = lp_build_gather_values(&ctx->gallivm, val, 4); break; } case TGSI_SEMANTIC_PRIMID: - value = get_primitive_id(&ctx->bld_base, 0); + value = get_primitive_id(ctx, 0); break; case TGSI_SEMANTIC_GRID_SIZE: - value = LLVMGetParam(ctx->main_fn, SI_PARAM_GRID_SIZE); + value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size); break; case TGSI_SEMANTIC_BLOCK_SIZE: @@ -1560,34 +1812,80 @@ static void declare_system_value(struct si_shader_context *ctx, for (i = 0; i < 3; ++i) values[i] = LLVMConstInt(ctx->i32, sizes[i], 0); - value = lp_build_gather_values(gallivm, values, 3); + value = lp_build_gather_values(&ctx->gallivm, values, 3); } else { - value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_SIZE); + value = LLVMGetParam(ctx->main_fn, ctx->param_block_size); } break; } case TGSI_SEMANTIC_BLOCK_ID: - value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_ID); + { + LLVMValueRef values[3]; + + for (int i = 0; i < 3; i++) { + values[i] = ctx->i32_0; + if (ctx->param_block_id[i] >= 0) { + values[i] = LLVMGetParam(ctx->main_fn, + ctx->param_block_id[i]); + } + } + value = lp_build_gather_values(&ctx->gallivm, values, 3); break; + } case TGSI_SEMANTIC_THREAD_ID: - value = 
LLVMGetParam(ctx->main_fn, SI_PARAM_THREAD_ID); + value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id); break; case TGSI_SEMANTIC_HELPER_INVOCATION: - if (HAVE_LLVM >= 0x0309) { - value = lp_build_intrinsic(gallivm->builder, - "llvm.amdgcn.ps.live", - ctx->i1, NULL, 0, - LP_FUNC_ATTR_READNONE); - value = LLVMBuildNot(gallivm->builder, value, ""); - value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, ""); + value = lp_build_intrinsic(ctx->ac.builder, + "llvm.amdgcn.ps.live", + ctx->i1, NULL, 0, + LP_FUNC_ATTR_READNONE); + value = LLVMBuildNot(ctx->ac.builder, value, ""); + value = LLVMBuildSExt(ctx->ac.builder, value, ctx->i32, ""); + break; + + case TGSI_SEMANTIC_SUBGROUP_SIZE: + value = LLVMConstInt(ctx->i32, 64, 0); + break; + + case TGSI_SEMANTIC_SUBGROUP_INVOCATION: + value = ac_get_thread_id(&ctx->ac); + break; + + case TGSI_SEMANTIC_SUBGROUP_EQ_MASK: + { + LLVMValueRef id = ac_get_thread_id(&ctx->ac); + id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, ""); + value = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->i64, 1, 0), id, ""); + value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, ""); + break; + } + + case TGSI_SEMANTIC_SUBGROUP_GE_MASK: + case TGSI_SEMANTIC_SUBGROUP_GT_MASK: + case TGSI_SEMANTIC_SUBGROUP_LE_MASK: + case TGSI_SEMANTIC_SUBGROUP_LT_MASK: + { + LLVMValueRef id = ac_get_thread_id(&ctx->ac); + if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK || + decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) { + /* All bits set except LSB */ + value = LLVMConstInt(ctx->i64, -2, 0); } else { - assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported"); - return; + /* All bits set */ + value = LLVMConstInt(ctx->i64, -1, 0); } + id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, ""); + value = LLVMBuildShl(ctx->ac.builder, value, id, ""); + if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK || + decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK) + value = LLVMBuildNot(ctx->ac.builder, value, ""); + value = 
LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, ""); break; + } default: assert(!"unknown system value"); @@ -1597,35 +1895,61 @@ static void declare_system_value(struct si_shader_context *ctx, ctx->system_values[index] = value; } -static void declare_compute_memory(struct si_shader_context *ctx, - const struct tgsi_full_declaration *decl) +void si_declare_compute_memory(struct si_shader_context *ctx, + const struct tgsi_full_declaration *decl) { struct si_shader_selector *sel = ctx->shader->selector; - struct gallivm_state *gallivm = &ctx->gallivm; LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE); LLVMValueRef var; assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED); assert(decl->Range.First == decl->Range.Last); - assert(!ctx->shared_memory); + assert(!ctx->ac.lds); - var = LLVMAddGlobalInAddressSpace(gallivm->module, + var = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->i8, sel->local_size), "compute_lds", LOCAL_ADDR_SPACE); LLVMSetAlignment(var, 4); - ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, ""); + ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); } static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i) { LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn, - SI_PARAM_CONST_BUFFERS); + ctx->param_const_and_shader_buffers); + + return ac_build_load_to_sgpr(&ctx->ac, list_ptr, + LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0)); +} + +static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); + + index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers); + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), ""); + + return ac_build_load_to_sgpr(&ctx->ac, ptr, index); +} + +static LLVMValueRef +load_ssbo(struct ac_shader_abi *abi, 
LLVMValueRef index, bool write) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn, + ctx->param_const_and_shader_buffers); - return ac_build_indexed_load_const(&ctx->ac, list_ptr, - LLVMConstInt(ctx->i32, i, 0)); + index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); + index = LLVMBuildSub(ctx->ac.builder, + LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0), + index, ""); + + return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index); } static LLVMValueRef fetch_constant( @@ -1635,12 +1959,11 @@ static LLVMValueRef fetch_constant( unsigned swizzle) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct lp_build_context *base = &bld_base->base; + struct si_shader_selector *sel = ctx->shader->selector; const struct tgsi_ind_register *ireg = &reg->Indirect; unsigned buf, idx; LLVMValueRef addr, bufp; - LLVMValueRef result; if (swizzle == LP_CHAN_ALL) { unsigned chan; @@ -1648,55 +1971,102 @@ static LLVMValueRef fetch_constant( for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) values[chan] = fetch_constant(bld_base, reg, type, chan); - return lp_build_gather_values(bld_base->base.gallivm, values, 4); + return lp_build_gather_values(&ctx->gallivm, values, 4); } - buf = reg->Register.Dimension ? reg->Dimension.Index : 0; - idx = reg->Register.Index * 4 + swizzle; + /* Split 64-bit loads. 
*/ + if (tgsi_type_is_64bit(type)) { + LLVMValueRef lo, hi; - if (reg->Register.Dimension && reg->Dimension.Indirect) { - LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_CONST_BUFFERS); - LLVMValueRef index; - index = get_bounded_indirect_index(ctx, &reg->DimIndirect, - reg->Dimension.Index, - SI_NUM_CONST_BUFFERS); - bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index); - } else - bufp = load_const_buffer_desc(ctx, buf); + lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle); + hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle + 1); + return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), + lo, hi); + } + idx = reg->Register.Index * 4 + swizzle; if (reg->Register.Indirect) { - addr = ctx->addrs[ireg->Index][ireg->Swizzle]; - addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg"); - addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16); - addr = lp_build_add(&bld_base->uint_bld, addr, - LLVMConstInt(ctx->i32, idx * 4, 0)); + addr = si_get_indirect_index(ctx, ireg, 16, idx * 4); } else { addr = LLVMConstInt(ctx->i32, idx * 4, 0); } - result = buffer_load_const(ctx, bufp, addr); - - if (!tgsi_type_is_64bit(type)) - result = bitcast(bld_base, type, result); - else { - LLVMValueRef addr2, result2; + /* Fast path when user data SGPRs point to constant buffer 0 directly. */ + if (sel->info.const_buffers_declared == 1 && + sel->info.shader_buffers_declared == 0) { + LLVMValueRef ptr = + LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); - addr2 = lp_build_add(&bld_base->uint_bld, addr, - LLVMConstInt(ctx->i32, 4, 0)); - result2 = buffer_load_const(ctx, bufp, addr2); + /* This enables use of s_load_dword and flat_load_dword for const buffer 0 + * loads, and up to x4 load opcode merging. However, it leads to horrible + * code reducing SIMD wave occupancy from 8 to 2 in many cases. + * + * Using s_buffer_load_dword (x1) seems to be the best option right now. 
+ * + * LLVM 5.0 on SI doesn't insert a required s_nop between SALU setting + * a descriptor and s_buffer_load_dword using it, so we can't expand + * the pointer into a full descriptor like below. We have to use + * s_load_dword instead. The only case when LLVM 5.0 would select + * s_buffer_load_dword (that we have to prevent) is when we use use + * a literal offset where we don't need bounds checking. + */ + if (ctx->screen->info.chip_class == SI && + HAVE_LLVM < 0x0600 && + !reg->Register.Indirect) { + addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), ""); + LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr); + return bitcast(bld_base, type, result); + } - result = si_llvm_emit_fetch_64bit(bld_base, type, - result, result2); + /* Do the bounds checking with a descriptor, because + * doing computation and manual bounds checking of 64-bit + * addresses generates horrible VALU code with very high + * VGPR usage and very low SIMD occupancy. + */ + ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64, ""); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, ""); + + LLVMValueRef desc_elems[] = { + LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""), + LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""), + LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0), + LLVMConstInt(ctx->i32, + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0) + }; + LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4); + LLVMValueRef result = buffer_load_const(ctx, desc, addr); + return bitcast(bld_base, type, result); } - return result; + + assert(reg->Register.Dimension); + buf = reg->Dimension.Index; + + if (reg->Dimension.Indirect) { + LLVMValueRef ptr = 
LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); + LLVMValueRef index; + index = si_get_bounded_indirect_index(ctx, ®->DimIndirect, + reg->Dimension.Index, + ctx->num_const_buffers); + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), ""); + bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index); + } else + bufp = load_const_buffer_desc(ctx, buf); + + return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr)); } /* Upper 16 bits must be zero. */ static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx, LLVMValueRef val[2]) { - return LLVMBuildOr(ctx->gallivm.builder, val[0], - LLVMBuildShl(ctx->gallivm.builder, val[1], + return LLVMBuildOr(ctx->ac.builder, val[0], + LLVMBuildShl(ctx->ac.builder, val[1], LLVMConstInt(ctx->i32, 16, 0), ""), ""); } @@ -1706,7 +2076,7 @@ static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ct LLVMValueRef val[2]) { LLVMValueRef v[2] = { - LLVMBuildAnd(ctx->gallivm.builder, val[0], + LLVMBuildAnd(ctx->ac.builder, val[0], LLVMConstInt(ctx->i32, 0xffff, 0), ""), val[1], }; @@ -1714,14 +2084,13 @@ static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ct } /* Initialize arguments for the shader export intrinsic */ -static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, +static void si_llvm_init_export_args(struct si_shader_context *ctx, LLVMValueRef *values, unsigned target, struct ac_export_args *args) { - struct si_shader_context *ctx = si_shader_context(bld_base); - struct lp_build_context *base = &bld_base->base; - LLVMBuilderRef builder = base->gallivm->builder; + LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32); + LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef val[4]; unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR; unsigned chan; @@ -1751,10 +2120,10 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, } args->compr = false; - 
args->out[0] = base->undef; - args->out[1] = base->undef; - args->out[2] = base->undef; - args->out[3] = base->undef; + args->out[0] = f32undef; + args->out[1] = f32undef; + args->out[2] = f32undef; + args->out[3] = f32undef; switch (spi_shader_col_format) { case V_028714_SPI_SHADER_ZERO: @@ -1790,9 +2159,7 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, LLVMValueRef packed; packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args); - args->out[chan] = - LLVMBuildBitCast(base->gallivm->builder, - packed, ctx->f32, ""); + args->out[chan] = ac_to_float(&ctx->ac, packed); } break; @@ -1808,19 +2175,17 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, } args->compr = 1; /* COMPR flag */ - args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, - si_llvm_pack_two_int16(ctx, val)); - args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT, - si_llvm_pack_two_int16(ctx, val+2)); + args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val)); + args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val+2)); break; case V_028714_SPI_SHADER_SNORM16_ABGR: for (chan = 0; chan < 4; chan++) { /* Clamp between [-1, 1]. */ - val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN, + val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_MIN, values[chan], LLVMConstReal(ctx->f32, 1)); - val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX, + val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_MAX, val[chan], LLVMConstReal(ctx->f32, -1)); /* Convert to a signed integer in [-32767, 32767]. 
*/ @@ -1830,17 +2195,15 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, val[chan] = LLVMBuildFAdd(builder, val[chan], LLVMBuildSelect(builder, LLVMBuildFCmp(builder, LLVMRealOGE, - val[chan], base->zero, ""), + val[chan], ctx->ac.f32_0, ""), LLVMConstReal(ctx->f32, 0.5), LLVMConstReal(ctx->f32, -0.5), ""), ""); val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, ""); } args->compr = 1; /* COMPR flag */ - args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, - si_llvm_pack_two_int32_as_int16(ctx, val)); - args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT, - si_llvm_pack_two_int32_as_int16(ctx, val+2)); + args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val)); + args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val+2)); break; case V_028714_SPI_SHADER_UINT16_ABGR: { @@ -1851,17 +2214,15 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, /* Clamp. */ for (chan = 0; chan < 4; chan++) { - val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]); - val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN, + val[chan] = ac_to_integer(&ctx->ac, values[chan]); + val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_UMIN, val[chan], chan == 3 ? max_alpha : max_rgb); } args->compr = 1; /* COMPR flag */ - args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, - si_llvm_pack_two_int16(ctx, val)); - args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT, - si_llvm_pack_two_int16(ctx, val+2)); + args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val)); + args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val+2)); break; } @@ -1871,26 +2232,24 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, is_int8 ? -128 : is_int10 ? -512 : -32768, 0); LLVMValueRef max_alpha = - !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 1, 0); + !is_int10 ? 
max_rgb : ctx->i32_1; LLVMValueRef min_alpha = !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0); /* Clamp. */ for (chan = 0; chan < 4; chan++) { - val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]); - val[chan] = lp_build_emit_llvm_binary(bld_base, + val[chan] = ac_to_integer(&ctx->ac, values[chan]); + val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_IMIN, val[chan], chan == 3 ? max_alpha : max_rgb); - val[chan] = lp_build_emit_llvm_binary(bld_base, + val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_IMAX, val[chan], chan == 3 ? min_alpha : min_rgb); } args->compr = 1; /* COMPR flag */ - args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, - si_llvm_pack_two_int32_as_int16(ctx, val)); - args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT, - si_llvm_pack_two_int32_as_int16(ctx, val+2)); + args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val)); + args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val+2)); break; } @@ -1906,22 +2265,24 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, struct si_shader_context *ctx = si_shader_context(bld_base); if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { + static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = { + [PIPE_FUNC_LESS] = LLVMRealOLT, + [PIPE_FUNC_EQUAL] = LLVMRealOEQ, + [PIPE_FUNC_LEQUAL] = LLVMRealOLE, + [PIPE_FUNC_GREATER] = LLVMRealOGT, + [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, + [PIPE_FUNC_GEQUAL] = LLVMRealOGE, + }; + LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func]; + assert(cond); + LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF); - LLVMValueRef alpha_pass = - lp_build_cmp(&bld_base->base, - ctx->shader->key.part.ps.epilog.alpha_func, - alpha, alpha_ref); - LLVMValueRef arg = - lp_build_select(&bld_base->base, - alpha_pass, - LLVMConstReal(ctx->f32, 1.0f), - LLVMConstReal(ctx->f32, -1.0f)); - - ac_build_kill(&ctx->ac, arg); + 
LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); + ac_build_kill_if_false(&ctx->ac, alpha_pass); } else { - ac_build_kill(&ctx->ac, NULL); + ac_build_kill_if_false(&ctx->ac, LLVMConstInt(ctx->i1, 0, 0)); } } @@ -1930,41 +2291,38 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context * unsigned samplemask_param) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef coverage; /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ coverage = LLVMGetParam(ctx->main_fn, samplemask_param); - coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage); + coverage = ac_to_integer(&ctx->ac, coverage); - coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32", + coverage = lp_build_intrinsic(ctx->ac.builder, "llvm.ctpop.i32", ctx->i32, &coverage, 1, LP_FUNC_ATTR_READNONE); - coverage = LLVMBuildUIToFP(gallivm->builder, coverage, + coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, ctx->f32, ""); - coverage = LLVMBuildFMul(gallivm->builder, coverage, + coverage = LLVMBuildFMul(ctx->ac.builder, coverage, LLVMConstReal(ctx->f32, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); - return LLVMBuildFMul(gallivm->builder, alpha, coverage, ""); + return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, ""); } -static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base, +static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, struct ac_export_args *pos, LLVMValueRef *out_elts) { - struct si_shader_context *ctx = si_shader_context(bld_base); - struct lp_build_context *base = &bld_base->base; unsigned reg_index; unsigned chan; unsigned const_chan; LLVMValueRef base_elt; - LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); + LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32, SI_VS_CONST_CLIP_PLANES, 0); - LLVMValueRef const_resource = 
ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index); + LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); for (reg_index = 0; reg_index < 2; reg_index ++) { struct ac_export_args *args = &pos[2 + reg_index]; @@ -1983,8 +2341,8 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base, base_elt = buffer_load_const(ctx, const_resource, addr); args->out[chan] = - lp_build_add(base, args->out[chan], - lp_build_mul(base, base_elt, + lp_build_add(&ctx->bld_base.base, args->out[chan], + lp_build_mul(&ctx->bld_base.base, base_elt, out_elts[const_chan])); } } @@ -2024,8 +2382,6 @@ static void emit_streamout_output(struct si_shader_context *ctx, struct pipe_stream_output *stream_out, struct si_shader_output_values *shader_out) { - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMBuilderRef builder = gallivm->builder; unsigned buf_idx = stream_out->output_buffer; unsigned start = stream_out->start_component; unsigned num_comps = stream_out->num_components; @@ -2039,9 +2395,7 @@ static void emit_streamout_output(struct si_shader_context *ctx, for (int j = 0; j < num_comps; j++) { assert(stream_out->stream == shader_out->vertex_stream[start + j]); - out[j] = LLVMBuildBitCast(builder, - shader_out->values[start + j], - ctx->i32, ""); + out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); } /* Pack the output. 
*/ @@ -2056,7 +2410,7 @@ static void emit_streamout_output(struct si_shader_context *ctx, case 4: /* as v4i32 */ vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps))); for (int j = 0; j < num_comps; j++) { - vdata = LLVMBuildInsertElement(builder, vdata, out[j], + vdata = LLVMBuildInsertElement(ctx->ac.builder, vdata, out[j], LLVMConstInt(ctx->i32, j, 0), ""); } break; @@ -2065,7 +2419,7 @@ static void emit_streamout_output(struct si_shader_context *ctx, ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, num_comps, so_write_offsets[buf_idx], - LLVMConstInt(ctx->i32, 0, 0), + ctx->i32_0, stream_out->dst_offset * 4, 1, 1, true, false); } @@ -2079,8 +2433,7 @@ static void si_llvm_emit_streamout(struct si_shader_context *ctx, { struct si_shader_selector *sel = ctx->shader->selector; struct pipe_stream_output_info *so = &sel->so; - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMBuilderRef builder = gallivm->builder; + LLVMBuilderRef builder = ctx->ac.builder; int i; struct lp_build_if_state if_ctx; @@ -2097,7 +2450,7 @@ static void si_llvm_emit_streamout(struct si_shader_context *ctx, /* Emit the streamout code conditionally. This actually avoids * out-of-bounds buffer access. The hw tells us via the SGPR * (so_vtx_count) which threads are allowed to emit streamout data. 
*/ - lp_build_if(&if_ctx, gallivm, can_emit); + lp_build_if(&if_ctx, &ctx->gallivm, can_emit); { /* The buffer offset is computed as follows: * ByteOffset = streamout_offset[buffer_id]*4 + @@ -2117,7 +2470,7 @@ static void si_llvm_emit_streamout(struct si_shader_context *ctx, LLVMValueRef so_write_offset[4] = {}; LLVMValueRef so_buffers[4]; LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, - SI_PARAM_RW_BUFFERS); + ctx->param_rw_buffers); for (i = 0; i < 4; i++) { if (!so->stride[i]) @@ -2126,7 +2479,7 @@ static void si_llvm_emit_streamout(struct si_shader_context *ctx, LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_VS_STREAMOUT_BUF0 + i, 0); - so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset); + so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn, ctx->param_streamout_offset[i]); @@ -2154,122 +2507,109 @@ static void si_llvm_emit_streamout(struct si_shader_context *ctx, lp_build_endif(&if_ctx); } +static void si_export_param(struct si_shader_context *ctx, unsigned index, + LLVMValueRef *values) +{ + struct ac_export_args args; -/* Generate export instructions for hardware VS shader stage */ -static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base, - struct si_shader_output_values *outputs, - unsigned noutput) + si_llvm_init_export_args(ctx, values, + V_008DFC_SQ_EXP_PARAM + index, &args); + ac_build_export(&ctx->ac, &args); +} + +static void si_build_param_exports(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput) { - struct si_shader_context *ctx = si_shader_context(bld_base); struct si_shader *shader = ctx->shader; - struct lp_build_context *base = &bld_base->base; - struct ac_export_args args, pos_args[4] = {}; - LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; - unsigned semantic_name, semantic_index; - unsigned target; unsigned param_count = 0; - unsigned pos_idx; 
- int i; - - for (i = 0; i < noutput; i++) { - semantic_name = outputs[i].semantic_name; - semantic_index = outputs[i].semantic_index; - bool export_param = true; - switch (semantic_name) { - case TGSI_SEMANTIC_POSITION: /* ignore these */ - case TGSI_SEMANTIC_PSIZE: - case TGSI_SEMANTIC_CLIPVERTEX: - case TGSI_SEMANTIC_EDGEFLAG: - break; - case TGSI_SEMANTIC_GENERIC: - case TGSI_SEMANTIC_CLIPDIST: - if (shader->key.opt.hw_vs.kill_outputs & - (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index))) - export_param = false; - break; - default: - if (shader->key.opt.hw_vs.kill_outputs2 & - (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index))) - export_param = false; - break; - } + for (unsigned i = 0; i < noutput; i++) { + unsigned semantic_name = outputs[i].semantic_name; + unsigned semantic_index = outputs[i].semantic_index; if (outputs[i].vertex_stream[0] != 0 && outputs[i].vertex_stream[1] != 0 && outputs[i].vertex_stream[2] != 0 && outputs[i].vertex_stream[3] != 0) - export_param = false; - -handle_semantic: - /* Select the correct target */ - switch(semantic_name) { - case TGSI_SEMANTIC_PSIZE: - psize_value = outputs[i].values[0]; - continue; - case TGSI_SEMANTIC_EDGEFLAG: - edgeflag_value = outputs[i].values[0]; continue; + + switch (semantic_name) { case TGSI_SEMANTIC_LAYER: - layer_value = outputs[i].values[0]; - semantic_name = TGSI_SEMANTIC_GENERIC; - goto handle_semantic; case TGSI_SEMANTIC_VIEWPORT_INDEX: - viewport_index_value = outputs[i].values[0]; - semantic_name = TGSI_SEMANTIC_GENERIC; - goto handle_semantic; - case TGSI_SEMANTIC_POSITION: - target = V_008DFC_SQ_EXP_POS; - break; case TGSI_SEMANTIC_CLIPDIST: - if (shader->key.opt.hw_vs.clip_disable) { - semantic_name = TGSI_SEMANTIC_GENERIC; - goto handle_semantic; - } - target = V_008DFC_SQ_EXP_POS + 2 + semantic_index; - break; - case TGSI_SEMANTIC_CLIPVERTEX: - if (shader->key.opt.hw_vs.clip_disable) - continue; - si_llvm_emit_clipvertex(bld_base, pos_args, 
outputs[i].values); - continue; case TGSI_SEMANTIC_COLOR: case TGSI_SEMANTIC_BCOLOR: case TGSI_SEMANTIC_PRIMID: case TGSI_SEMANTIC_FOG: case TGSI_SEMANTIC_TEXCOORD: case TGSI_SEMANTIC_GENERIC: - if (!export_param) - continue; - target = V_008DFC_SQ_EXP_PARAM + param_count; - assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); - shader->info.vs_output_param_offset[i] = param_count; - param_count++; break; default: - target = 0; - fprintf(stderr, - "Warning: SI unhandled vs output type:%d\n", - semantic_name); + continue; } - si_llvm_init_export_args(bld_base, outputs[i].values, target, &args); + if ((semantic_name != TGSI_SEMANTIC_GENERIC || + semantic_index < SI_MAX_IO_GENERIC) && + shader->key.opt.kill_outputs & + (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index))) + continue; - if (target >= V_008DFC_SQ_EXP_POS && - target <= (V_008DFC_SQ_EXP_POS + 3)) { - memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS], - &args, sizeof(args)); - } else { - ac_build_export(&ctx->ac, &args); - } + si_export_param(ctx, param_count, outputs[i].values); - if (semantic_name == TGSI_SEMANTIC_CLIPDIST) { - semantic_name = TGSI_SEMANTIC_GENERIC; - goto handle_semantic; - } + assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[i] = param_count++; } shader->info.nr_param_exports = param_count; +} + +/* Generate export instructions for hardware VS shader stage */ +static void si_llvm_export_vs(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput) +{ + struct si_shader *shader = ctx->shader; + struct ac_export_args pos_args[4] = {}; + LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; + unsigned pos_idx; + int i; + + /* Build position exports. 
*/ + for (i = 0; i < noutput; i++) { + switch (outputs[i].semantic_name) { + case TGSI_SEMANTIC_POSITION: + si_llvm_init_export_args(ctx, outputs[i].values, + V_008DFC_SQ_EXP_POS, &pos_args[0]); + break; + case TGSI_SEMANTIC_PSIZE: + psize_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_LAYER: + layer_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + viewport_index_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_EDGEFLAG: + edgeflag_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_CLIPDIST: + if (!shader->key.opt.clip_disable) { + unsigned index = 2 + outputs[i].semantic_index; + si_llvm_init_export_args(ctx, outputs[i].values, + V_008DFC_SQ_EXP_POS + index, + &pos_args[index]); + } + break; + case TGSI_SEMANTIC_CLIPVERTEX: + if (!shader->key.opt.clip_disable) { + si_llvm_emit_clipvertex(ctx, pos_args, + outputs[i].values); + } + break; + } + } /* We need to add the position output manually if it's missing. */ if (!pos_args[0].out[0]) { @@ -2278,10 +2618,10 @@ handle_semantic: pos_args[0].done = 0; /* last export? */ pos_args[0].target = V_008DFC_SQ_EXP_POS; pos_args[0].compr = 0; /* COMPR flag */ - pos_args[0].out[0] = base->zero; /* X */ - pos_args[0].out[1] = base->zero; /* Y */ - pos_args[0].out[2] = base->zero; /* Z */ - pos_args[0].out[3] = base->one; /* W */ + pos_args[0].out[0] = ctx->ac.f32_0; /* X */ + pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ + pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ + pos_args[0].out[3] = ctx->ac.f32_1; /* W */ } /* Write the misc vector (point size, edgeflag, layer, viewport). 
*/ @@ -2291,16 +2631,16 @@ handle_semantic: shader->selector->info.writes_layer) { pos_args[1].enabled_channels = shader->selector->info.writes_psize | (shader->selector->info.writes_edgeflag << 1) | - (shader->selector->info.writes_layer << 2) | - (shader->selector->info.writes_viewport_index << 3); + (shader->selector->info.writes_layer << 2); + pos_args[1].valid_mask = 0; /* EXEC mask */ pos_args[1].done = 0; /* last export? */ pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; pos_args[1].compr = 0; /* COMPR flag */ - pos_args[1].out[0] = base->zero; /* X */ - pos_args[1].out[1] = base->zero; /* Y */ - pos_args[1].out[2] = base->zero; /* Z */ - pos_args[1].out[3] = base->zero; /* W */ + pos_args[1].out[0] = ctx->ac.f32_0; /* X */ + pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ + pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ + pos_args[1].out[3] = ctx->ac.f32_0; /* W */ if (shader->selector->info.writes_psize) pos_args[1].out[0] = psize_value; @@ -2308,24 +2648,44 @@ handle_semantic: if (shader->selector->info.writes_edgeflag) { /* The output is a float, but the hw expects an integer * with the first bit containing the edge flag. */ - edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder, + edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->i32, ""); - edgeflag_value = lp_build_min(&bld_base->int_bld, + edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, - bld_base->int_bld.one); + ctx->i32_1); /* The LLVM intrinsic expects a float. */ - pos_args[1].out[1] = LLVMBuildBitCast(base->gallivm->builder, - edgeflag_value, - ctx->f32, ""); + pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); } - if (shader->selector->info.writes_layer) - pos_args[1].out[2] = layer_value; + if (ctx->screen->info.chip_class >= GFX9) { + /* GFX9 has the layer in out.z[10:0] and the viewport + * index in out.z[19:16]. 
+ */ + if (shader->selector->info.writes_layer) + pos_args[1].out[2] = layer_value; + + if (shader->selector->info.writes_viewport_index) { + LLVMValueRef v = viewport_index_value; + + v = ac_to_integer(&ctx->ac, v); + v = LLVMBuildShl(ctx->ac.builder, v, + LLVMConstInt(ctx->i32, 16, 0), ""); + v = LLVMBuildOr(ctx->ac.builder, v, + ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); + pos_args[1].out[2] = ac_to_float(&ctx->ac, v); + pos_args[1].enabled_channels |= 1 << 2; + } + } else { + if (shader->selector->info.writes_layer) + pos_args[1].out[2] = layer_value; - if (shader->selector->info.writes_viewport_index) - pos_args[1].out[3] = viewport_index_value; + if (shader->selector->info.writes_viewport_index) { + pos_args[1].out[3] = viewport_index_value; + pos_args[1].enabled_channels |= 1 << 3; + } + } } for (i = 0; i < 4; i++) @@ -2346,6 +2706,9 @@ handle_semantic: ac_build_export(&ctx->ac, &pos_args[i]); } + + /* Build parameter exports. */ + si_build_param_exports(ctx, outputs, noutput); } /** @@ -2355,30 +2718,25 @@ handle_semantic: static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset; + LLVMValueRef invocation_id, buffer, buffer_offset; LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base; uint64_t inputs; - invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); + invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5); + buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); + buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); - rw_buffers = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); - buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, - LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0)); - - buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); - - lds_vertex_stride = 
unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8); - lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id, + lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx); + lds_vertex_offset = LLVMBuildMul(ctx->ac.builder, invocation_id, lds_vertex_stride, ""); lds_base = get_tcs_in_current_patch_offset(ctx); - lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, ""); + lds_base = LLVMBuildAdd(ctx->ac.builder, lds_base, lds_vertex_offset, ""); - inputs = ctx->shader->key.mono.tcs.inputs_to_copy; + inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; while (inputs) { unsigned i = u_bit_scan64(&inputs); - LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base, + LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base, LLVMConstInt(ctx->i32, 4 * i, 0), ""); @@ -2398,18 +2756,21 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, LLVMValueRef rel_patch_id, LLVMValueRef invocation_id, - LLVMValueRef tcs_out_current_patch_data_offset) + LLVMValueRef tcs_out_current_patch_data_offset, + LLVMValueRef invoc0_tf_outer[4], + LLVMValueRef invoc0_tf_inner[2]) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; struct si_shader *shader = ctx->shader; unsigned tess_inner_index, tess_outer_index; LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; - LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base, inner[4], outer[4]; - unsigned stride, outer_comps, inner_comps, i; + LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; + unsigned stride, outer_comps, inner_comps, i, offset; struct lp_build_if_state if_ctx, inner_if_ctx; - si_llvm_emit_barrier(NULL, bld_base, NULL); + /* Add a barrier before loading tess factors from LDS. 
*/ + if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) + si_llvm_emit_barrier(NULL, bld_base, NULL); /* Do this only for invocation 0, because the tess levels are per-patch, * not per-vertex. @@ -2417,9 +2778,9 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, * This can't jump, because invocation 0 executes this. It should * at least mask out the loads and stores for other invocations. */ - lp_build_if(&if_ctx, gallivm, - LLVMBuildICmp(gallivm->builder, LLVMIntEQ, - invocation_id, bld_base->uint_bld.zero, "")); + lp_build_if(&if_ctx, &ctx->gallivm, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, + invocation_id, ctx->i32_0, "")); /* Determine the layout of one tess factor element in the buffer. */ switch (shader->key.part.tcs.epilog.prim_mode) { @@ -2443,32 +2804,32 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, return; } - /* Load tess_inner and tess_outer from LDS. - * Any invocation can write them, so we can't get them from a temporary. - */ - tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0); - tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0); - - lds_base = tcs_out_current_patch_data_offset; - lds_inner = LLVMBuildAdd(gallivm->builder, lds_base, - LLVMConstInt(ctx->i32, - tess_inner_index * 4, 0), ""); - lds_outer = LLVMBuildAdd(gallivm->builder, lds_base, - LLVMConstInt(ctx->i32, - tess_outer_index * 4, 0), ""); - for (i = 0; i < 4; i++) { inner[i] = LLVMGetUndef(ctx->i32); outer[i] = LLVMGetUndef(ctx->i32); } - if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { - /* For isolines, the hardware expects tess factors in the - * reverse order from what GLSL / TGSI specify. - */ - outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer); - outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer); + if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { + /* Tess factors are in VGPRs. 
*/ + for (i = 0; i < outer_comps; i++) + outer[i] = out[i] = invoc0_tf_outer[i]; + for (i = 0; i < inner_comps; i++) + inner[i] = out[outer_comps+i] = invoc0_tf_inner[i]; } else { + /* Load tess_inner and tess_outer from LDS. + * Any invocation can write them, so we can't get them from a temporary. + */ + tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); + tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); + + lds_base = tcs_out_current_patch_data_offset; + lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base, + LLVMConstInt(ctx->i32, + tess_inner_index * 4, 0), ""); + lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base, + LLVMConstInt(ctx->i32, + tess_outer_index * 4, 0), ""); + for (i = 0; i < outer_comps; i++) { outer[i] = out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer); @@ -2479,45 +2840,56 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, } } + if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { + /* For isolines, the hardware expects tess factors in the + * reverse order from what GLSL / TGSI specify. + */ + LLVMValueRef tmp = out[0]; + out[0] = out[1]; + out[1] = tmp; + } + /* Convert the outputs to vectors for stores. */ - vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4)); + vec0 = lp_build_gather_values(&ctx->gallivm, out, MIN2(stride, 4)); vec1 = NULL; if (stride > 4) - vec1 = lp_build_gather_values(gallivm, out+4, stride - 4); + vec1 = lp_build_gather_values(&ctx->gallivm, out+4, stride - 4); /* Get the buffer. */ - rw_buffers = LLVMGetParam(ctx->main_fn, - SI_PARAM_RW_BUFFERS); - buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, - LLVMConstInt(ctx->i32, SI_HS_RING_TESS_FACTOR, 0)); + buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k); /* Get the offset. 
*/ tf_base = LLVMGetParam(ctx->main_fn, - SI_PARAM_TESS_FACTOR_OFFSET); - byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id, + ctx->param_tcs_factor_offset); + byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->i32, 4 * stride, 0), ""); - lp_build_if(&inner_if_ctx, gallivm, - LLVMBuildICmp(gallivm->builder, LLVMIntEQ, - rel_patch_id, bld_base->uint_bld.zero, "")); + lp_build_if(&inner_if_ctx, &ctx->gallivm, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, + rel_patch_id, ctx->i32_0, "")); /* Store the dynamic HS control word. */ - ac_build_buffer_store_dword(&ctx->ac, buffer, - LLVMConstInt(ctx->i32, 0x80000000, 0), - 1, LLVMConstInt(ctx->i32, 0, 0), tf_base, - 0, 1, 0, true, false); + offset = 0; + if (ctx->screen->info.chip_class <= VI) { + ac_build_buffer_store_dword(&ctx->ac, buffer, + LLVMConstInt(ctx->i32, 0x80000000, 0), + 1, ctx->i32_0, tf_base, + offset, 1, 0, true, false); + offset += 4; + } lp_build_endif(&inner_if_ctx); /* Store the tessellation factors. */ ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, - 4, 1, 0, true, false); + offset, 1, 0, true, false); + offset += 16; if (vec1) ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, - 20, 1, 0, true, false); + offset, 1, 0, true, false); /* Store the tess factors into the offchip buffer if TES reads them. 
*/ if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { @@ -2525,29 +2897,28 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, LLVMValueRef tf_inner_offset; unsigned param_outer, param_inner; - buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers, - LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0)); - base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); + buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); + base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); - param_outer = si_shader_io_get_unique_index( + param_outer = si_shader_io_get_unique_index_patch( TGSI_SEMANTIC_TESSOUTER, 0); tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, LLVMConstInt(ctx->i32, param_outer, 0)); - outer_vec = lp_build_gather_values(gallivm, outer, + outer_vec = lp_build_gather_values(&ctx->gallivm, outer, util_next_power_of_two(outer_comps)); ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, outer_comps, tf_outer_offset, base, 0, 1, 0, true, false); if (inner_comps) { - param_inner = si_shader_io_get_unique_index( + param_inner = si_shader_io_get_unique_index_patch( TGSI_SEMANTIC_TESSINNER, 0); tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, LLVMConstInt(ctx->i32, param_inner, 0)); inner_vec = inner_comps == 1 ? 
inner[0] : - lp_build_gather_values(gallivm, inner, inner_comps); + lp_build_gather_values(&ctx->gallivm, inner, inner_comps); ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, inner_comps, tf_inner_offset, base, 0, 1, 0, true, false); @@ -2557,145 +2928,366 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, lp_build_endif(&if_ctx); } +static LLVMValueRef +si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, + unsigned param, unsigned return_index) +{ + return LLVMBuildInsertValue(ctx->ac.builder, ret, + LLVMGetParam(ctx->main_fn, param), + return_index, ""); +} + +static LLVMValueRef +si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, + unsigned param, unsigned return_index) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef p = LLVMGetParam(ctx->main_fn, param); + + return LLVMBuildInsertValue(builder, ret, + ac_to_float(&ctx->ac, p), + return_index, ""); +} + +static LLVMValueRef +si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret, + unsigned param, unsigned return_index) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef ptr, lo, hi; + + ptr = LLVMGetParam(ctx->main_fn, param); + ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, ""); + ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, ""); + lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, ""); + hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, ""); + ret = LLVMBuildInsertValue(builder, ret, lo, return_index, ""); + return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, ""); +} + /* This only writes the tessellation factor levels. 
*/ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); + LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; - LLVMValueRef offchip_soffset, offchip_layout; si_copy_tcs_inputs(bld_base); rel_patch_id = get_rel_patch_id(ctx); - invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); + invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5); tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); + if (ctx->screen->info.chip_class >= GFX9) { + LLVMBasicBlockRef blocks[2] = { + LLVMGetInsertBlock(builder), + ctx->merged_wrap_if_state.entry_block + }; + LLVMValueRef values[2]; + + lp_build_endif(&ctx->merged_wrap_if_state); + + values[0] = rel_patch_id; + values[1] = LLVMGetUndef(ctx->i32); + rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); + + values[0] = tf_lds_offset; + values[1] = LLVMGetUndef(ctx->i32); + tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); + + values[0] = invocation_id; + values[1] = ctx->i32_1; /* cause the epilog to skip threads */ + invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); + } + /* Return epilog parameters from this function. 
*/ - LLVMBuilderRef builder = bld_base->base.gallivm->builder; LLVMValueRef ret = ctx->return_value; - LLVMValueRef rw_buffers, rw0, rw1, tf_soffset; unsigned vgpr; - /* RW_BUFFERS pointer */ - rw_buffers = LLVMGetParam(ctx->main_fn, - SI_PARAM_RW_BUFFERS); - rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, ""); - rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, ""); - rw0 = LLVMBuildExtractElement(builder, rw_buffers, - bld_base->uint_bld.zero, ""); - rw1 = LLVMBuildExtractElement(builder, rw_buffers, - bld_base->uint_bld.one, ""); - ret = LLVMBuildInsertValue(builder, ret, rw0, 0, ""); - ret = LLVMBuildInsertValue(builder, ret, rw1, 1, ""); - - /* Tess offchip and factor buffer soffset are after user SGPRs. */ - offchip_layout = LLVMGetParam(ctx->main_fn, - SI_PARAM_TCS_OFFCHIP_LAYOUT); - offchip_soffset = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); - tf_soffset = LLVMGetParam(ctx->main_fn, - SI_PARAM_TESS_FACTOR_OFFSET); - ret = LLVMBuildInsertValue(builder, ret, offchip_layout, - SI_SGPR_TCS_OFFCHIP_LAYOUT, ""); - ret = LLVMBuildInsertValue(builder, ret, offchip_soffset, - SI_TCS_NUM_USER_SGPR, ""); - ret = LLVMBuildInsertValue(builder, ret, tf_soffset, - SI_TCS_NUM_USER_SGPR + 1, ""); + if (ctx->screen->info.chip_class >= GFX9) { + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, + 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k, + 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K); + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k, + 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K); + /* Tess offchip and tess factor offsets are at the beginning. 
*/ + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4); + vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1; + } else { + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, + GFX6_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k, + GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K); + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k, + GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K); + /* Tess offchip and tess factor offsets are after user SGPRs. */ + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, + GFX6_TCS_NUM_USER_SGPR); + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, + GFX6_TCS_NUM_USER_SGPR + 1); + vgpr = GFX6_TCS_NUM_USER_SGPR + 2; + } /* VGPRs */ - rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id); - invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id); - tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset); + rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id); + invocation_id = ac_to_float(&ctx->ac, invocation_id); + tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset); + + /* Leave a hole corresponding to the two input VGPRs. This ensures that + * the invocation_id output does not alias the param_tcs_rel_ids input, + * which saves a V_MOV on gfx9. 
+ */ + vgpr += 2; - vgpr = SI_TCS_NUM_USER_SGPR + 2; ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); - ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + + if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { + vgpr++; /* skip the tess factor LDS offset */ + for (unsigned i = 0; i < 6; i++) { + LLVMValueRef value = + LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], ""); + value = ac_to_float(&ctx->ac, value); + ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, ""); + } + } else { + ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + } ctx->return_value = ret; } -static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base) +/* Pass TCS inputs from LS to TCS on GFX9. */ +static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) { - struct si_shader_context *ctx = si_shader_context(bld_base); + LLVMValueRef ret = ctx->return_value; + + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3); + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4); + ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5); + + ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, + 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr_as_2xi32(ctx, ret, + ctx->param_bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + + ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits, + 8 + SI_SGPR_VS_STATE_BITS); + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, + 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets, + 8 + GFX9_SGPR_TCS_OUT_OFFSETS); + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout, + 8 + GFX9_SGPR_TCS_OUT_LAYOUT); + ret = si_insert_input_ret(ctx, 
ret, ctx->param_tcs_offchip_addr_base64k, + 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K); + ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k, + 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K); + + unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2; + ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param, + 8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS); + ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1, + 8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES); + + unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR; + ret = si_insert_input_ret_float(ctx, ret, + ctx->param_tcs_patch_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, + ctx->param_tcs_rel_ids, vgpr++); + ctx->return_value = ret; +} + +/* Pass GS inputs from ES to GS on GFX9. */ +static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) +{ + LLVMValueRef ret = ctx->return_value; + + ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3); + ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5); + + ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, + 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr_as_2xi32(ctx, ret, + ctx->param_bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + + unsigned desc_param = ctx->param_vs_state_bits + 1; + ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param, + 8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS); + ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1, + 8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES); + + unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR; + for (unsigned i = 0; i < 5; i++) { + unsigned param = ctx->param_gs_vtx01_offset + i; + ret = si_insert_input_ret_float(ctx, ret, param, vgpr++); + } + ctx->return_value = ret; +} + +static void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = 
si_shader_context_from_abi(abi); struct si_shader *shader = ctx->shader; struct tgsi_shader_info *info = &shader->selector->info; - struct gallivm_state *gallivm = bld_base->base.gallivm; unsigned i, chan; LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn, ctx->param_rel_auto_id); - LLVMValueRef vertex_dw_stride = - unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8); - LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id, + LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); + LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, ""); /* Write outputs to LDS. The next shader (TCS aka HS) will read * its inputs from it. */ for (i = 0; i < info->num_outputs; i++) { - LLVMValueRef *out_ptr = ctx->outputs[i]; unsigned name = info->output_semantic_name[i]; unsigned index = info->output_semantic_index[i]; + + /* The ARB_shader_viewport_layer_array spec contains the + * following issue: + * + * 2) What happens if gl_ViewportIndex or gl_Layer is + * written in the vertex shader and a geometry shader is + * present? + * + * RESOLVED: The value written by the last vertex processing + * stage is used. If the last vertex processing stage + * (vertex, tessellation evaluation or geometry) does not + * statically assign to gl_ViewportIndex or gl_Layer, index + * or layer zero is assumed. + * + * So writes to those outputs in VS-as-LS are simply ignored. 
+ */ + if (name == TGSI_SEMANTIC_LAYER || + name == TGSI_SEMANTIC_VIEWPORT_INDEX) + continue; + int param = si_shader_io_get_unique_index(name, index); - LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr, + LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, LLVMConstInt(ctx->i32, param * 4, 0), ""); for (chan = 0; chan < 4; chan++) { - lds_store(bld_base, chan, dw_addr, - LLVMBuildLoad(gallivm->builder, out_ptr[chan], "")); + if (!(info->output_usagemask[i] & (1 << chan))) + continue; + + lds_store(ctx, chan, dw_addr, + LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "")); } } + + if (ctx->screen->info.chip_class >= GFX9) + si_set_ls_return_value_for_tcs(ctx); } -static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base) +static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader *es = ctx->shader; struct tgsi_shader_info *info = &es->selector->info; LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, ctx->param_es2gs_offset); + LLVMValueRef lds_base = NULL; unsigned chan; int i; + if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { + unsigned itemsize_dw = es->selector->esgs_itemsize / 4; + LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); + LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4); + vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, + LLVMBuildMul(ctx->ac.builder, wave_idx, + LLVMConstInt(ctx->i32, 64, false), ""), ""); + lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, + LLVMConstInt(ctx->i32, itemsize_dw, 0), ""); + } + for (i = 0; i < info->num_outputs; i++) { - LLVMValueRef *out_ptr = ctx->outputs[i]; - int param_index; + int param; if (info->output_semantic_name[i] == 
TGSI_SEMANTIC_VIEWPORT_INDEX || info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) continue; - param_index = si_shader_io_get_unique_index(info->output_semantic_name[i], - info->output_semantic_index[i]); + param = si_shader_io_get_unique_index(info->output_semantic_name[i], + info->output_semantic_index[i]); for (chan = 0; chan < 4; chan++) { - LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""); - out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, ""); + LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + out_val = ac_to_integer(&ctx->ac, out_val); + + /* GFX9 has the ESGS ring in LDS. */ + if (ctx->screen->info.chip_class >= GFX9) { + lds_store(ctx, param * 4 + chan, lds_base, out_val); + continue; + } ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL, soffset, - (4 * param_index + chan) * 4, + (4 * param + chan) * 4, 1, 1, true, true); } } + + if (ctx->screen->info.chip_class >= GFX9) + si_set_es_return_value_for_gs(ctx); } -static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) +static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) { - struct si_shader_context *ctx = si_shader_context(bld_base); + if (ctx->screen->info.chip_class >= GFX9) + return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8); + else + return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id); +} +static void emit_gs_epilogue(struct si_shader_context *ctx) +{ ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, - LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID)); + si_get_gs_wave_id(ctx)); + + if (ctx->screen->info.chip_class >= GFX9) + lp_build_endif(&ctx->merged_wrap_if_state); +} + +static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info; + + 
assert(info->num_outputs <= max_outputs); + + emit_gs_epilogue(ctx); } -static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) +static void si_tgsi_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; + emit_gs_epilogue(ctx); +} + +static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct tgsi_shader_info *info = &ctx->shader->selector->info; struct si_shader_output_values *outputs = NULL; int i,j; assert(!ctx->shader->is_gs_copy_shader); + assert(info->num_outputs <= max_outputs); outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); @@ -2719,17 +3311,17 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) if (!cond) { /* The state is in the first bit of the user SGPR. */ cond = LLVMGetParam(ctx->main_fn, - SI_PARAM_VS_STATE_BITS); - cond = LLVMBuildTrunc(gallivm->builder, cond, + ctx->param_vs_state_bits); + cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->i1, ""); - lp_build_if(&if_ctx, gallivm, cond); + lp_build_if(&if_ctx, &ctx->gallivm, cond); } for (j = 0; j < 4; j++) { - addr = ctx->outputs[i][j]; - val = LLVMBuildLoad(gallivm->builder, addr, ""); + addr = addrs[4 * i + j]; + val = LLVMBuildLoad(ctx->ac.builder, addr, ""); val = ac_build_clamp(&ctx->ac, val); - LLVMBuildStore(gallivm->builder, val, addr); + LLVMBuildStore(ctx->ac.builder, val, addr); } } @@ -2743,29 +3335,42 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) for (j = 0; j < 4; j++) { outputs[i].values[j] = - LLVMBuildLoad(gallivm->builder, - ctx->outputs[i][j], + LLVMBuildLoad(ctx->ac.builder, + addrs[4 * i + j], ""); outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3; } - } - /* Return the primitive ID from the LLVM function. 
*/ - ctx->return_value = - LLVMBuildInsertValue(gallivm->builder, - ctx->return_value, - bitcast(bld_base, TGSI_TYPE_FLOAT, - get_primitive_id(bld_base, 0)), - VS_EPILOG_PRIMID_LOC, ""); - if (ctx->shader->selector->so.num_outputs) si_llvm_emit_streamout(ctx, outputs, i, 0); - si_llvm_export_vs(bld_base, outputs, i); + + /* Export PrimitiveID. */ + if (ctx->shader->key.mono.u.vs_export_prim_id) { + outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; + outputs[i].semantic_index = 0; + outputs[i].values[0] = ac_to_float(&ctx->ac, get_primitive_id(ctx, 0)); + for (j = 1; j < 4; j++) + outputs[i].values[j] = LLVMConstReal(ctx->f32, 0); + + memset(outputs[i].vertex_stream, 0, + sizeof(outputs[i].vertex_stream)); + i++; + } + + si_llvm_export_vs(ctx, outputs, i); FREE(outputs); } +static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + + ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS, + &ctx->outputs[0][0]); +} + struct si_ps_exports { unsigned num; struct ac_export_args args[10]; @@ -2822,10 +3427,10 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, if (stencil) { /* Stencil should be in X[23:16]. */ - stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil); - stencil = LLVMBuildShl(base->gallivm->builder, stencil, + stencil = ac_to_integer(&ctx->ac, stencil); + stencil = LLVMBuildShl(ctx->ac.builder, stencil, LLVMConstInt(ctx->i32, 16, 0), ""); - args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil); + args.out[0] = ac_to_float(&ctx->ac, stencil); mask |= 0x3; } if (samplemask) { @@ -2850,9 +3455,9 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, /* SI (except OLAND and HAINAN) has a bug that it only looks * at the X writemask component. 
*/ - if (ctx->screen->b.chip_class == SI && - ctx->screen->b.family != CHIP_OLAND && - ctx->screen->b.family != CHIP_HAINAN) + if (ctx->screen->info.chip_class == SI && + ctx->screen->info.family != CHIP_OLAND && + ctx->screen->info.family != CHIP_HAINAN) mask |= 0x1; /* Specify which components to enable */ @@ -2867,7 +3472,6 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, bool is_last, struct si_ps_exports *exp) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct lp_build_context *base = &bld_base->base; int i; /* Clamp color */ @@ -2877,7 +3481,7 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, /* Alpha to one */ if (ctx->shader->key.part.ps.epilog.alpha_to_one) - color[3] = base->one; + color[3] = ctx->ac.f32_1; /* Alpha test */ if (index == 0 && @@ -2896,7 +3500,7 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, /* Get the export arguments, also find out what the last one is. */ for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { - si_llvm_init_export_args(bld_base, color, + si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + c, &args[c]); if (args[c].enabled_channels) last = c; @@ -2916,7 +3520,7 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, struct ac_export_args args; /* Export */ - si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index, + si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, &args); if (is_last) { args.valid_mask = 1; /* whether the EXEC mask is valid */ @@ -2967,19 +3571,23 @@ static void si_export_null(struct lp_build_tgsi_context *bld_base) * * The alpha-ref SGPR is returned via its original location. 
*/ -static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) +static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader *shader = ctx->shader; - struct lp_build_context *base = &bld_base->base; struct tgsi_shader_info *info = &shader->selector->info; - LLVMBuilderRef builder = base->gallivm->builder; + LLVMBuilderRef builder = ctx->ac.builder; unsigned i, j, first_vgpr, vgpr; LLVMValueRef color[8][4] = {}; LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; LLVMValueRef ret; + if (ctx->postponed_kill) + ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, "")); + /* Read the output values. */ for (i = 0; i < info->num_outputs; i++) { unsigned semantic_name = info->output_semantic_name[i]; @@ -2989,22 +3597,22 @@ static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) case TGSI_SEMANTIC_COLOR: assert(semantic_index < 8); for (j = 0; j < 4; j++) { - LLVMValueRef ptr = ctx->outputs[i][j]; + LLVMValueRef ptr = addrs[4 * i + j]; LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); color[semantic_index][j] = result; } break; case TGSI_SEMANTIC_POSITION: depth = LLVMBuildLoad(builder, - ctx->outputs[i][2], ""); + addrs[4 * i + 2], ""); break; case TGSI_SEMANTIC_STENCIL: stencil = LLVMBuildLoad(builder, - ctx->outputs[i][1], ""); + addrs[4 * i + 1], ""); break; case TGSI_SEMANTIC_SAMPLEMASK: samplemask = LLVMBuildLoad(builder, - ctx->outputs[i][0], ""); + addrs[4 * i + 0], ""); break; default: fprintf(stderr, "Warning: SI unhandled fs output type:%d\n", @@ -3017,9 +3625,9 @@ static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) /* Set SGPRs. 
*/ ret = LLVMBuildInsertValue(builder, ret, - bitcast(bld_base, TGSI_TYPE_SIGNED, - LLVMGetParam(ctx->main_fn, - SI_PARAM_ALPHA_REF)), + ac_to_integer(&ctx->ac, + LLVMGetParam(ctx->main_fn, + SI_PARAM_ALPHA_REF)), SI_SGPR_ALPHA_REF, ""); /* Set VGPRs */ @@ -3048,71 +3656,12 @@ static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) ctx->return_value = ret; } -/** - * Given a v8i32 resource descriptor for a buffer, extract the size of the - * buffer in number of elements and return it as an i32. - */ -static LLVMValueRef get_buffer_size( - struct lp_build_tgsi_context *bld_base, - LLVMValueRef descriptor) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef size = - LLVMBuildExtractElement(builder, descriptor, - LLVMConstInt(ctx->i32, 2, 0), ""); - - if (ctx->screen->b.chip_class == VI) { - /* On VI, the descriptor contains the size in bytes, - * but TXQ must return the size in elements. - * The stride is always non-zero for resources using TXQ. - */ - LLVMValueRef stride = - LLVMBuildExtractElement(builder, descriptor, - LLVMConstInt(ctx->i32, 1, 0), ""); - stride = LLVMBuildLShr(builder, stride, - LLVMConstInt(ctx->i32, 16, 0), ""); - stride = LLVMBuildAnd(builder, stride, - LLVMConstInt(ctx->i32, 0x3FFF, 0), ""); - - size = LLVMBuildUDiv(builder, size, stride, ""); - } - - return size; -} - -static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data); - -/* Prevent optimizations (at least of memory accesses) across the current - * point in the program by emitting empty inline assembly that is marked as - * having side effects. 
- */ -#if 0 /* unused currently */ -static void emit_optimization_barrier(struct si_shader_context *ctx) -{ - LLVMBuilderRef builder = ctx->gallivm.builder; - LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false); - LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false); - LLVMBuildCall(builder, inlineasm, NULL, 0, ""); -} -#endif - -/* Combine these with & instead of |. */ -#define NOOP_WAITCNT 0xf7f -#define LGKM_CNT 0x07f -#define VM_CNT 0xf70 - -static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16) +void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16) { - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMBuilderRef builder = gallivm->builder; LLVMValueRef args[1] = { LLVMConstInt(ctx->i32, simm16, 0) }; - lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt", + lp_build_intrinsic(ctx->ac.builder, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0); } @@ -3138,1775 +3687,31 @@ static void membar_emit( waitcnt &= LGKM_CNT; if (waitcnt != NOOP_WAITCNT) - emit_waitcnt(ctx, waitcnt); + si_emit_waitcnt(ctx, waitcnt); } static void clock_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMValueRef tmp; - - tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter", - ctx->i64, NULL, 0, 0); - tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, ""); - - emit_data->output[0] = - LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, ""); - emit_data->output[1] = - LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, ""); -} - -static LLVMValueRef -shader_buffer_fetch_rsrc(struct si_shader_context *ctx, - const struct tgsi_full_src_register *reg) -{ - LLVMValueRef index; - LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn, - SI_PARAM_SHADER_BUFFERS); - - if (!reg->Register.Indirect) - index 
= LLVMConstInt(ctx->i32, reg->Register.Index, 0); - else - index = get_bounded_indirect_index(ctx, &reg->Indirect, - reg->Register.Index, - SI_NUM_SHADER_BUFFERS); - - return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index); -} - -static bool tgsi_is_array_sampler(unsigned target) -{ - return target == TGSI_TEXTURE_1D_ARRAY || - target == TGSI_TEXTURE_SHADOW1D_ARRAY || - target == TGSI_TEXTURE_2D_ARRAY || - target == TGSI_TEXTURE_SHADOW2D_ARRAY || - target == TGSI_TEXTURE_CUBE_ARRAY || - target == TGSI_TEXTURE_SHADOWCUBE_ARRAY || - target == TGSI_TEXTURE_2D_ARRAY_MSAA; -} - -static bool tgsi_is_array_image(unsigned target) -{ - return target == TGSI_TEXTURE_3D || - target == TGSI_TEXTURE_CUBE || - target == TGSI_TEXTURE_1D_ARRAY || - target == TGSI_TEXTURE_2D_ARRAY || - target == TGSI_TEXTURE_CUBE_ARRAY || - target == TGSI_TEXTURE_2D_ARRAY_MSAA; -} - -/** - * Given a 256-bit resource descriptor, force the DCC enable bit to off. - * - * At least on Tonga, executing image stores on images with DCC enabled and - * non-trivial can eventually lead to lockups. This can occur when an - * application binds an image as read-only but then uses a shader that writes - * to it. The OpenGL spec allows almost arbitrarily bad behavior (including - * program termination) in this case, but it doesn't cost much to be a bit - * nicer: disabling DCC in the shader still leads to undefined results but - * avoids the lockup. 
- */ -static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, - LLVMValueRef rsrc) -{ - if (ctx->screen->b.chip_class <= CIK) { - return rsrc; - } else { - LLVMBuilderRef builder = ctx->gallivm.builder; - LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0); - LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0); - LLVMValueRef tmp; - - tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, ""); - tmp = LLVMBuildAnd(builder, tmp, i32_C, ""); - return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, ""); - } -} - -static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) -{ - return LLVMPointerType(LLVMArrayType(elem_type, num_elements), - CONST_ADDR_SPACE); -} - -static LLVMValueRef load_image_desc(struct si_shader_context *ctx, - LLVMValueRef list, LLVMValueRef index, - unsigned target) -{ - LLVMBuilderRef builder = ctx->gallivm.builder; - - if (target == TGSI_TEXTURE_BUFFER) { - index = LLVMBuildMul(builder, index, - LLVMConstInt(ctx->i32, 2, 0), ""); - index = LLVMBuildAdd(builder, index, - LLVMConstInt(ctx->i32, 1, 0), ""); - list = LLVMBuildPointerCast(builder, list, - const_array(ctx->v4i32, 0), ""); - } - - return ac_build_indexed_load_const(&ctx->ac, list, index); -} - -/** - * Load the resource descriptor for \p image. 
- */ -static void -image_fetch_rsrc( - struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_src_register *image, - bool is_store, unsigned target, - LLVMValueRef *rsrc) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn, - SI_PARAM_IMAGES); - LLVMValueRef index; - bool dcc_off = is_store; - - assert(image->Register.File == TGSI_FILE_IMAGE); - - if (!image->Register.Indirect) { - const struct tgsi_shader_info *info = bld_base->info; - unsigned images_writemask = info->images_store | - info->images_atomic; - - index = LLVMConstInt(ctx->i32, image->Register.Index, 0); - - if (images_writemask & (1 << image->Register.Index)) - dcc_off = true; - } else { - /* From the GL_ARB_shader_image_load_store extension spec: - * - * If a shader performs an image load, store, or atomic - * operation using an image variable declared as an array, - * and if the index used to select an individual element is - * negative or greater than or equal to the size of the - * array, the results of the operation are undefined but may - * not lead to termination. 
- */ - index = get_bounded_indirect_index(ctx, &image->Indirect, - image->Register.Index, - SI_NUM_IMAGES); - } - - *rsrc = load_image_desc(ctx, rsrc_ptr, index, target); - if (dcc_off && target != TGSI_TEXTURE_BUFFER) - *rsrc = force_dcc_off(ctx, *rsrc); -} - -static LLVMValueRef image_fetch_coords( - struct lp_build_tgsi_context *bld_base, - const struct tgsi_full_instruction *inst, - unsigned src) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMBuilderRef builder = gallivm->builder; - unsigned target = inst->Memory.Texture; - unsigned num_coords = tgsi_util_get_texture_coord_dim(target); - LLVMValueRef coords[4]; - LLVMValueRef tmp; - int chan; - - for (chan = 0; chan < num_coords; ++chan) { - tmp = lp_build_emit_fetch(bld_base, inst, src, chan); - tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); - coords[chan] = tmp; - } - - /* 1D textures are allocated and used as 2D on GFX9. */ - if (ctx->screen->b.chip_class >= GFX9) { - if (target == TGSI_TEXTURE_1D) { - coords[1] = bld_base->uint_bld.zero; - num_coords++; - } else if (target == TGSI_TEXTURE_1D_ARRAY) { - coords[2] = coords[1]; - coords[1] = bld_base->uint_bld.zero; - } - } - - if (num_coords == 1) - return coords[0]; - - if (num_coords == 3) { - /* LLVM has difficulties lowering 3-element vectors. */ - coords[3] = bld_base->uint_bld.undef; - num_coords = 4; - } - - return lp_build_gather_values(gallivm, coords, num_coords); -} - -/** - * Append the extra mode bits that are used by image load and store. 
- */ -static void image_append_args( - struct si_shader_context *ctx, - struct lp_build_emit_data * emit_data, - unsigned target, - bool atomic, - bool force_glc) -{ - const struct tgsi_full_instruction *inst = emit_data->inst; - LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); - LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0); - LLVMValueRef r128 = i1false; - LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false; - LLVMValueRef glc = - force_glc || - inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ? - i1true : i1false; - LLVMValueRef slc = i1false; - LLVMValueRef lwe = i1false; - - if (atomic || (HAVE_LLVM <= 0x0309)) { - emit_data->args[emit_data->arg_count++] = r128; - emit_data->args[emit_data->arg_count++] = da; - if (!atomic) { - emit_data->args[emit_data->arg_count++] = glc; - } - emit_data->args[emit_data->arg_count++] = slc; - return; - } - - /* HAVE_LLVM >= 0x0400 */ - emit_data->args[emit_data->arg_count++] = glc; - emit_data->args[emit_data->arg_count++] = slc; - emit_data->args[emit_data->arg_count++] = lwe; - emit_data->args[emit_data->arg_count++] = da; -} - -/** - * Append the resource and indexing arguments for buffer intrinsics. 
- * - * \param rsrc the v4i32 buffer resource - * \param index index into the buffer (stride-based) - * \param offset byte offset into the buffer - */ -static void buffer_append_args( - struct si_shader_context *ctx, - struct lp_build_emit_data *emit_data, - LLVMValueRef rsrc, - LLVMValueRef index, - LLVMValueRef offset, - bool atomic, - bool force_glc) -{ - const struct tgsi_full_instruction *inst = emit_data->inst; - LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); - LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0); - - emit_data->args[emit_data->arg_count++] = rsrc; - emit_data->args[emit_data->arg_count++] = index; /* vindex */ - emit_data->args[emit_data->arg_count++] = offset; /* voffset */ - if (!atomic) { - emit_data->args[emit_data->arg_count++] = - force_glc || - inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ? - i1true : i1false; /* glc */ - } - emit_data->args[emit_data->arg_count++] = i1false; /* slc */ -} - -static void load_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - const struct tgsi_full_instruction * inst = emit_data->inst; - unsigned target = inst->Memory.Texture; - LLVMValueRef rsrc; - - emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); - - if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef offset; - LLVMValueRef tmp; - - rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]); - - tmp = lp_build_emit_fetch(bld_base, inst, 1, 0); - offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); - - buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, - offset, false, false); - } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) { - LLVMValueRef coords; - - image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc); - coords = 
image_fetch_coords(bld_base, inst, 1); - - if (target == TGSI_TEXTURE_BUFFER) { - buffer_append_args(ctx, emit_data, rsrc, coords, - bld_base->uint_bld.zero, false, false); - } else { - emit_data->args[0] = coords; - emit_data->args[1] = rsrc; - emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */ - emit_data->arg_count = 3; - - image_append_args(ctx, emit_data, target, false, false); - } - } -} - -static unsigned get_load_intr_attribs(bool readonly_memory) -{ - /* READNONE means writes can't affect it, while READONLY means that - * writes can affect it. */ - return readonly_memory && HAVE_LLVM >= 0x0400 ? - LP_FUNC_ATTR_READNONE : - LP_FUNC_ATTR_READONLY; -} - -static unsigned get_store_intr_attribs(bool writeonly_memory) -{ - return writeonly_memory && HAVE_LLVM >= 0x0400 ? - LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY : - LP_FUNC_ATTR_WRITEONLY; -} - -static void load_emit_buffer(struct si_shader_context *ctx, - struct lp_build_emit_data *emit_data, - bool readonly_memory) -{ - const struct tgsi_full_instruction *inst = emit_data->inst; - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMBuilderRef builder = gallivm->builder; - uint writemask = inst->Dst[0].Register.WriteMask; - uint count = util_last_bit(writemask); - const char *intrinsic_name; - LLVMTypeRef dst_type; - - switch (count) { - case 1: - intrinsic_name = "llvm.amdgcn.buffer.load.f32"; - dst_type = ctx->f32; - break; - case 2: - intrinsic_name = "llvm.amdgcn.buffer.load.v2f32"; - dst_type = LLVMVectorType(ctx->f32, 2); - break; - default: // 3 & 4 - intrinsic_name = "llvm.amdgcn.buffer.load.v4f32"; - dst_type = ctx->v4f32; - count = 4; - } - - emit_data->output[emit_data->chan] = lp_build_intrinsic( - builder, intrinsic_name, dst_type, - emit_data->args, emit_data->arg_count, - get_load_intr_attribs(readonly_memory)); -} - -static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx, - const struct tgsi_full_instruction *inst, - LLVMTypeRef type, int arg) -{ - struct gallivm_state 
*gallivm = &ctx->gallivm; - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef offset, ptr; - int addr_space; - - offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0); - offset = LLVMBuildBitCast(builder, offset, ctx->i32, ""); - - ptr = ctx->shared_memory; - ptr = LLVMBuildGEP(builder, ptr, &offset, 1, ""); - addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); - ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), ""); - - return ptr; -} - -static void load_emit_memory( - struct si_shader_context *ctx, - struct lp_build_emit_data *emit_data) -{ - const struct tgsi_full_instruction *inst = emit_data->inst; - struct lp_build_context *base = &ctx->bld_base.base; - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMBuilderRef builder = gallivm->builder; - unsigned writemask = inst->Dst[0].Register.WriteMask; - LLVMValueRef channels[4], ptr, derived_ptr, index; - int chan; - - ptr = get_memory_ptr(ctx, inst, base->elem_type, 1); - - for (chan = 0; chan < 4; ++chan) { - if (!(writemask & (1 << chan))) { - channels[chan] = LLVMGetUndef(base->elem_type); - continue; - } - - index = LLVMConstInt(ctx->i32, chan, 0); - derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); - channels[chan] = LLVMBuildLoad(builder, derived_ptr, ""); - } - emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4); -} - -/** - * Return true if the memory accessed by a LOAD or STORE instruction is - * read-only or write-only, respectively. - * - * \param shader_buffers_reverse_access_mask - * For LOAD, set this to (store | atomic) slot usage in the shader. - * For STORE, set this to (load | atomic) slot usage in the shader. - * \param images_reverse_access_mask Same as above, but for images. - */ -static bool is_oneway_access_only(const struct tgsi_full_instruction *inst, - const struct tgsi_shader_info *info, - unsigned shader_buffers_reverse_access_mask, - unsigned images_reverse_access_mask) -{ - /* RESTRICT means NOALIAS. 
- * If there are no writes, we can assume the accessed memory is read-only. - * If there are no reads, we can assume the accessed memory is write-only. - */ - if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) { - unsigned reverse_access_mask; - - if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { - reverse_access_mask = shader_buffers_reverse_access_mask; - } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { - reverse_access_mask = info->images_buffers & - images_reverse_access_mask; - } else { - reverse_access_mask = ~info->images_buffers & - images_reverse_access_mask; - } - - if (inst->Src[0].Register.Indirect) { - if (!reverse_access_mask) - return true; - } else { - if (!(reverse_access_mask & - (1u << inst->Src[0].Register.Index))) - return true; - } - } - - /* If there are no buffer writes (for both shader buffers & image - * buffers), it implies that buffer memory is read-only. - * If there are no buffer reads (for both shader buffers & image - * buffers), it implies that buffer memory is write-only. - * - * Same for the case when there are no writes/reads for non-buffer - * images. 
- */ - if (inst->Src[0].Register.File == TGSI_FILE_BUFFER || - (inst->Src[0].Register.File == TGSI_FILE_IMAGE && - inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) { - if (!shader_buffers_reverse_access_mask && - !(info->images_buffers & images_reverse_access_mask)) - return true; - } else { - if (!(~info->images_buffers & images_reverse_access_mask)) - return true; - } - return false; -} - -static void load_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMBuilderRef builder = gallivm->builder; - const struct tgsi_full_instruction * inst = emit_data->inst; - const struct tgsi_shader_info *info = &ctx->shader->selector->info; - char intrinsic_name[64]; - bool readonly_memory = false; - - if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) { - load_emit_memory(ctx, emit_data); - return; - } - - if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) - emit_waitcnt(ctx, VM_CNT); - - readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) && - is_oneway_access_only(inst, info, - info->shader_buffers_store | - info->shader_buffers_atomic, - info->images_store | - info->images_atomic); - - if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { - load_emit_buffer(ctx, emit_data, readonly_memory); - return; - } - - if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { - emit_data->output[emit_data->chan] = - lp_build_intrinsic( - builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type, - emit_data->args, emit_data->arg_count, - get_load_intr_attribs(readonly_memory)); - } else { - ac_get_image_intr_name("llvm.amdgcn.image.load", - emit_data->dst_type, /* vdata */ - LLVMTypeOf(emit_data->args[0]), /* coords */ - LLVMTypeOf(emit_data->args[1]), /* rsrc */ - intrinsic_name, sizeof(intrinsic_name)); - - emit_data->output[emit_data->chan] = - 
lp_build_intrinsic( - builder, intrinsic_name, emit_data->dst_type, - emit_data->args, emit_data->arg_count, - get_load_intr_attribs(readonly_memory)); - } -} - -static void store_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMBuilderRef builder = gallivm->builder; - const struct tgsi_full_instruction * inst = emit_data->inst; - struct tgsi_full_src_register memory; - LLVMValueRef chans[4]; - LLVMValueRef data; - LLVMValueRef rsrc; - unsigned chan; - - emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context); - - for (chan = 0; chan < 4; ++chan) { - chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan); - } - data = lp_build_gather_values(gallivm, chans, 4); - - emit_data->args[emit_data->arg_count++] = data; - - memory = tgsi_full_src_register_from_dst(&inst->Dst[0]); - - if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) { - LLVMValueRef offset; - LLVMValueRef tmp; - - rsrc = shader_buffer_fetch_rsrc(ctx, &memory); - - tmp = lp_build_emit_fetch(bld_base, inst, 0, 0); - offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); - - buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, - offset, false, false); - } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) { - unsigned target = inst->Memory.Texture; - LLVMValueRef coords; - - /* 8bit/16bit TC L1 write corruption bug on SI. - * All store opcodes not aligned to a dword are affected. - * - * The only way to get unaligned stores in radeonsi is through - * shader images. 
- */ - bool force_glc = ctx->screen->b.chip_class == SI; - - coords = image_fetch_coords(bld_base, inst, 0); - - if (target == TGSI_TEXTURE_BUFFER) { - image_fetch_rsrc(bld_base, &memory, true, target, &rsrc); - buffer_append_args(ctx, emit_data, rsrc, coords, - bld_base->uint_bld.zero, false, force_glc); - } else { - emit_data->args[1] = coords; - image_fetch_rsrc(bld_base, &memory, true, target, - &emit_data->args[2]); - emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */ - emit_data->arg_count = 4; - - image_append_args(ctx, emit_data, target, false, force_glc); - } - } -} - -static void store_emit_buffer( - struct si_shader_context *ctx, - struct lp_build_emit_data *emit_data, - bool writeonly_memory) -{ - const struct tgsi_full_instruction *inst = emit_data->inst; - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMBuilderRef builder = gallivm->builder; - struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld; - LLVMValueRef base_data = emit_data->args[0]; - LLVMValueRef base_offset = emit_data->args[3]; - unsigned writemask = inst->Dst[0].Register.WriteMask; - - while (writemask) { - int start, count; - const char *intrinsic_name; - LLVMValueRef data; - LLVMValueRef offset; - LLVMValueRef tmp; - - u_bit_scan_consecutive_range(&writemask, &start, &count); - - /* Due to an LLVM limitation, split 3-element writes - * into a 2-element and a 1-element write. 
*/ - if (count == 3) { - writemask |= 1 << (start + 2); - count = 2; - } - - if (count == 4) { - data = base_data; - intrinsic_name = "llvm.amdgcn.buffer.store.v4f32"; - } else if (count == 2) { - LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2); - - tmp = LLVMBuildExtractElement( - builder, base_data, - LLVMConstInt(ctx->i32, start, 0), ""); - data = LLVMBuildInsertElement( - builder, LLVMGetUndef(v2f32), tmp, - uint_bld->zero, ""); - - tmp = LLVMBuildExtractElement( - builder, base_data, - LLVMConstInt(ctx->i32, start + 1, 0), ""); - data = LLVMBuildInsertElement( - builder, data, tmp, uint_bld->one, ""); - - intrinsic_name = "llvm.amdgcn.buffer.store.v2f32"; - } else { - assert(count == 1); - data = LLVMBuildExtractElement( - builder, base_data, - LLVMConstInt(ctx->i32, start, 0), ""); - intrinsic_name = "llvm.amdgcn.buffer.store.f32"; - } - - offset = base_offset; - if (start != 0) { - offset = LLVMBuildAdd( - builder, offset, - LLVMConstInt(ctx->i32, start * 4, 0), ""); - } - - emit_data->args[0] = data; - emit_data->args[3] = offset; - - lp_build_intrinsic( - builder, intrinsic_name, emit_data->dst_type, - emit_data->args, emit_data->arg_count, - get_store_intr_attribs(writeonly_memory)); - } -} - -static void store_emit_memory( - struct si_shader_context *ctx, - struct lp_build_emit_data *emit_data) -{ - const struct tgsi_full_instruction *inst = emit_data->inst; - struct gallivm_state *gallivm = &ctx->gallivm; - struct lp_build_context *base = &ctx->bld_base.base; - LLVMBuilderRef builder = gallivm->builder; - unsigned writemask = inst->Dst[0].Register.WriteMask; - LLVMValueRef ptr, derived_ptr, data, index; - int chan; - - ptr = get_memory_ptr(ctx, inst, base->elem_type, 0); - - for (chan = 0; chan < 4; ++chan) { - if (!(writemask & (1 << chan))) { - continue; - } - data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan); - index = LLVMConstInt(ctx->i32, chan, 0); - derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); - LLVMBuildStore(builder, data, 
derived_ptr); - } -} - -static void store_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMBuilderRef builder = gallivm->builder; - const struct tgsi_full_instruction * inst = emit_data->inst; - const struct tgsi_shader_info *info = &ctx->shader->selector->info; - unsigned target = inst->Memory.Texture; - char intrinsic_name[64]; - bool writeonly_memory = false; - - if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) { - store_emit_memory(ctx, emit_data); - return; - } - - if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) - emit_waitcnt(ctx, VM_CNT); - - writeonly_memory = is_oneway_access_only(inst, info, - info->shader_buffers_load | - info->shader_buffers_atomic, - info->images_load | - info->images_atomic); - - if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) { - store_emit_buffer(ctx, emit_data, writeonly_memory); - return; - } - - if (target == TGSI_TEXTURE_BUFFER) { - emit_data->output[emit_data->chan] = lp_build_intrinsic( - builder, "llvm.amdgcn.buffer.store.format.v4f32", - emit_data->dst_type, emit_data->args, - emit_data->arg_count, - get_store_intr_attribs(writeonly_memory)); - } else { - ac_get_image_intr_name("llvm.amdgcn.image.store", - LLVMTypeOf(emit_data->args[0]), /* vdata */ - LLVMTypeOf(emit_data->args[1]), /* coords */ - LLVMTypeOf(emit_data->args[2]), /* rsrc */ - intrinsic_name, sizeof(intrinsic_name)); - - emit_data->output[emit_data->chan] = - lp_build_intrinsic( - builder, intrinsic_name, emit_data->dst_type, - emit_data->args, emit_data->arg_count, - get_store_intr_attribs(writeonly_memory)); - } -} - -static void atomic_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = 
bld_base->base.gallivm; - LLVMBuilderRef builder = gallivm->builder; - const struct tgsi_full_instruction * inst = emit_data->inst; - LLVMValueRef data1, data2; - LLVMValueRef rsrc; - LLVMValueRef tmp; - - emit_data->dst_type = bld_base->base.elem_type; - - tmp = lp_build_emit_fetch(bld_base, inst, 2, 0); - data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); - - if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { - tmp = lp_build_emit_fetch(bld_base, inst, 3, 0); - data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); - } - - /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order - * of arguments, which is reversed relative to TGSI (and GLSL) - */ - if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) - emit_data->args[emit_data->arg_count++] = data2; - emit_data->args[emit_data->arg_count++] = data1; - - if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { - LLVMValueRef offset; - - rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]); - - tmp = lp_build_emit_fetch(bld_base, inst, 1, 0); - offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); - - buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, - offset, true, false); - } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) { - unsigned target = inst->Memory.Texture; - LLVMValueRef coords; - - image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc); - coords = image_fetch_coords(bld_base, inst, 1); - - if (target == TGSI_TEXTURE_BUFFER) { - buffer_append_args(ctx, emit_data, rsrc, coords, - bld_base->uint_bld.zero, true, false); - } else { - emit_data->args[emit_data->arg_count++] = coords; - emit_data->args[emit_data->arg_count++] = rsrc; - - image_append_args(ctx, emit_data, target, true, false); - } - } -} - -static void atomic_emit_memory(struct si_shader_context *ctx, - struct lp_build_emit_data *emit_data) { - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMBuilderRef builder = gallivm->builder; - 
const struct tgsi_full_instruction * inst = emit_data->inst; - LLVMValueRef ptr, result, arg; - - ptr = get_memory_ptr(ctx, inst, ctx->i32, 1); - - arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0); - arg = LLVMBuildBitCast(builder, arg, ctx->i32, ""); - - if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { - LLVMValueRef new_data; - new_data = lp_build_emit_fetch(&ctx->bld_base, - inst, 3, 0); - - new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, ""); - -#if HAVE_LLVM >= 0x309 - result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data, - LLVMAtomicOrderingSequentiallyConsistent, - LLVMAtomicOrderingSequentiallyConsistent, - false); -#endif - - result = LLVMBuildExtractValue(builder, result, 0, ""); - } else { - LLVMAtomicRMWBinOp op; - - switch(inst->Instruction.Opcode) { - case TGSI_OPCODE_ATOMUADD: - op = LLVMAtomicRMWBinOpAdd; - break; - case TGSI_OPCODE_ATOMXCHG: - op = LLVMAtomicRMWBinOpXchg; - break; - case TGSI_OPCODE_ATOMAND: - op = LLVMAtomicRMWBinOpAnd; - break; - case TGSI_OPCODE_ATOMOR: - op = LLVMAtomicRMWBinOpOr; - break; - case TGSI_OPCODE_ATOMXOR: - op = LLVMAtomicRMWBinOpXor; - break; - case TGSI_OPCODE_ATOMUMIN: - op = LLVMAtomicRMWBinOpUMin; - break; - case TGSI_OPCODE_ATOMUMAX: - op = LLVMAtomicRMWBinOpUMax; - break; - case TGSI_OPCODE_ATOMIMIN: - op = LLVMAtomicRMWBinOpMin; - break; - case TGSI_OPCODE_ATOMIMAX: - op = LLVMAtomicRMWBinOpMax; - break; - default: - unreachable("unknown atomic opcode"); - } - - result = LLVMBuildAtomicRMW(builder, op, ptr, arg, - LLVMAtomicOrderingSequentiallyConsistent, - false); - } - emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, ""); -} - -static void atomic_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMBuilderRef builder = 
gallivm->builder; - const struct tgsi_full_instruction * inst = emit_data->inst; - char intrinsic_name[40]; - LLVMValueRef tmp; - - if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) { - atomic_emit_memory(ctx, emit_data); - return; - } - - if (inst->Src[0].Register.File == TGSI_FILE_BUFFER || - inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { - snprintf(intrinsic_name, sizeof(intrinsic_name), - "llvm.amdgcn.buffer.atomic.%s", action->intr_name); - } else { - LLVMValueRef coords; - char coords_type[8]; - - if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) - coords = emit_data->args[2]; - else - coords = emit_data->args[1]; - - ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type)); - snprintf(intrinsic_name, sizeof(intrinsic_name), - "llvm.amdgcn.image.atomic.%s.%s", - action->intr_name, coords_type); - } - - tmp = lp_build_intrinsic( - builder, intrinsic_name, bld_base->uint_bld.elem_type, - emit_data->args, emit_data->arg_count, 0); - emit_data->output[emit_data->chan] = - LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, ""); -} - -static void set_tex_fetch_args(struct si_shader_context *ctx, - struct lp_build_emit_data *emit_data, - unsigned target, - LLVMValueRef res_ptr, LLVMValueRef samp_ptr, - LLVMValueRef *param, unsigned count, - unsigned dmask) -{ - struct gallivm_state *gallivm = &ctx->gallivm; - struct ac_image_args args = {}; - - /* Pad to power of two vector */ - while (count < util_next_power_of_two(count)) - param[count++] = LLVMGetUndef(ctx->i32); - - if (count > 1) - args.addr = lp_build_gather_values(gallivm, param, count); - else - args.addr = param[0]; - - args.resource = res_ptr; - args.sampler = samp_ptr; - args.dmask = dmask; - args.unorm = target == TGSI_TEXTURE_RECT || - target == TGSI_TEXTURE_SHADOWRECT; - args.da = tgsi_is_array_sampler(target); - - /* Ugly, but we seem to have no other choice right now. 
*/ - STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args)); - memcpy(emit_data->args, &args, sizeof(args)); -} - -static LLVMValueRef fix_resinfo(struct si_shader_context *ctx, - unsigned target, LLVMValueRef out) -{ - LLVMBuilderRef builder = ctx->gallivm.builder; - - /* 1D textures are allocated and used as 2D on GFX9. */ - if (ctx->screen->b.chip_class >= GFX9 && - (target == TGSI_TEXTURE_1D_ARRAY || - target == TGSI_TEXTURE_SHADOW1D_ARRAY)) { - LLVMValueRef layers = - LLVMBuildExtractElement(builder, out, - LLVMConstInt(ctx->i32, 2, 0), ""); - out = LLVMBuildInsertElement(builder, out, layers, - LLVMConstInt(ctx->i32, 1, 0), ""); - } - - /* Divide the number of layers by 6 to get the number of cubes. */ - if (target == TGSI_TEXTURE_CUBE_ARRAY || - target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { - LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0); - - LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, ""); - z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), ""); - - out = LLVMBuildInsertElement(builder, out, z, imm2, ""); - } - return out; -} - -static void resq_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - const struct tgsi_full_instruction *inst = emit_data->inst; - const struct tgsi_full_src_register *reg = &inst->Src[0]; - - emit_data->dst_type = ctx->v4i32; - - if (reg->Register.File == TGSI_FILE_BUFFER) { - emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg); - emit_data->arg_count = 1; - } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { - image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, - &emit_data->args[0]); - emit_data->arg_count = 1; - } else { - LLVMValueRef res_ptr; - unsigned image_target; - - if (inst->Memory.Texture == TGSI_TEXTURE_3D) - image_target = TGSI_TEXTURE_2D_ARRAY; - else - image_target = inst->Memory.Texture; - - image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, - 
&res_ptr); - set_tex_fetch_args(ctx, emit_data, image_target, - res_ptr, NULL, &bld_base->uint_bld.zero, 1, - 0xf); - } -} - -static void resq_emit( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMBuilderRef builder = gallivm->builder; - const struct tgsi_full_instruction *inst = emit_data->inst; - LLVMValueRef out; - - if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { - out = LLVMBuildExtractElement(builder, emit_data->args[0], - LLVMConstInt(ctx->i32, 2, 0), ""); - } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { - out = get_buffer_size(bld_base, emit_data->args[0]); - } else { - struct ac_image_args args; - - memcpy(&args, emit_data->args, sizeof(args)); /* ugly */ - args.opcode = ac_image_get_resinfo; - out = ac_build_image_opcode(&ctx->ac, &args); - - out = fix_resinfo(ctx, inst->Memory.Texture, out); - } - - emit_data->output[emit_data->chan] = out; -} - -static const struct lp_build_tgsi_action tex_action; - -enum desc_type { - DESC_IMAGE, - DESC_BUFFER, - DESC_FMASK, - DESC_SAMPLER, -}; - -/** - * Load an image view, fmask view. or sampler state descriptor. - */ -static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx, - LLVMValueRef list, LLVMValueRef index, - enum desc_type type) -{ - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMBuilderRef builder = gallivm->builder; - - switch (type) { - case DESC_IMAGE: - /* The image is at [0:7]. */ - index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); - break; - case DESC_BUFFER: - /* The buffer is in [4:7]. 
*/ - index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); - index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), ""); - list = LLVMBuildPointerCast(builder, list, - const_array(ctx->v4i32, 0), ""); - break; - case DESC_FMASK: - /* The FMASK is at [8:15]. */ - index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); - index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), ""); - break; - case DESC_SAMPLER: - /* The sampler state is at [12:15]. */ - index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); - index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), ""); - list = LLVMBuildPointerCast(builder, list, - const_array(ctx->v4i32, 0), ""); - break; - } - - return ac_build_indexed_load_const(&ctx->ac, list, index); -} - -/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL. - * - * SI-CI: - * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic - * filtering manually. The driver sets img7 to a mask clearing - * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do: - * s_and_b32 samp0, samp0, img7 - * - * VI: - * The ANISO_OVERRIDE sampler field enables this fix in TA. 
- */ -static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx, - LLVMValueRef res, LLVMValueRef samp) -{ - LLVMBuilderRef builder = ctx->gallivm.builder; - LLVMValueRef img7, samp0; - - if (ctx->screen->b.chip_class >= VI) - return samp; - - img7 = LLVMBuildExtractElement(builder, res, - LLVMConstInt(ctx->i32, 7, 0), ""); - samp0 = LLVMBuildExtractElement(builder, samp, - LLVMConstInt(ctx->i32, 0, 0), ""); - samp0 = LLVMBuildAnd(builder, samp0, img7, ""); - return LLVMBuildInsertElement(builder, samp, samp0, - LLVMConstInt(ctx->i32, 0, 0), ""); -} - -static void tex_fetch_ptrs( - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data, - LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef list = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLERS); - const struct tgsi_full_instruction *inst = emit_data->inst; - const struct tgsi_full_src_register *reg; - unsigned target = inst->Texture.Texture; - unsigned sampler_src; - LLVMValueRef index; - - sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1; - reg = &emit_data->inst->Src[sampler_src]; - - if (reg->Register.Indirect) { - index = get_bounded_indirect_index(ctx, - ®->Indirect, - reg->Register.Index, - SI_NUM_SAMPLERS); - } else { - index = LLVMConstInt(ctx->i32, reg->Register.Index, 0); - } - - if (target == TGSI_TEXTURE_BUFFER) - *res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER); - else - *res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE); - - if (samp_ptr) - *samp_ptr = NULL; - if (fmask_ptr) - *fmask_ptr = NULL; - - if (target == TGSI_TEXTURE_2D_MSAA || - target == TGSI_TEXTURE_2D_ARRAY_MSAA) { - if (fmask_ptr) - *fmask_ptr = load_sampler_desc(ctx, list, index, - DESC_FMASK); - } else if (target != TGSI_TEXTURE_BUFFER) { - if (samp_ptr) { - *samp_ptr = load_sampler_desc(ctx, list, index, - DESC_SAMPLER); - *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, 
*samp_ptr); - } - } -} - -static void txq_fetch_args( - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - const struct tgsi_full_instruction *inst = emit_data->inst; - unsigned target = inst->Texture.Texture; - LLVMValueRef res_ptr; - LLVMValueRef address; - - tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL); - - if (target == TGSI_TEXTURE_BUFFER) { - /* Read the size from the buffer descriptor directly. */ - emit_data->args[0] = get_buffer_size(bld_base, res_ptr); - return; - } - - /* Textures - set the mip level. */ - address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X); - - set_tex_fetch_args(ctx, emit_data, target, res_ptr, - NULL, &address, 1, 0xf); -} - -static void txq_emit(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct ac_image_args args; - unsigned target = emit_data->inst->Texture.Texture; - - if (target == TGSI_TEXTURE_BUFFER) { - /* Just return the buffer size. 
*/ - emit_data->output[emit_data->chan] = emit_data->args[0]; - return; - } - - memcpy(&args, emit_data->args, sizeof(args)); /* ugly */ - - args.opcode = ac_image_get_resinfo; - LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args); - - emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result); -} - -static void tex_fetch_args( - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - const struct tgsi_full_instruction *inst = emit_data->inst; - unsigned opcode = inst->Instruction.Opcode; - unsigned target = inst->Texture.Texture; - LLVMValueRef coords[5], derivs[6]; - LLVMValueRef address[16]; - unsigned num_coords = tgsi_util_get_texture_coord_dim(target); - int ref_pos = tgsi_util_get_shadow_ref_src_index(target); - unsigned count = 0; - unsigned chan; - unsigned num_deriv_channels = 0; - bool has_offset = inst->Texture.NumOffsets > 0; - LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL; - unsigned dmask = 0xf; - - tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr); - - if (target == TGSI_TEXTURE_BUFFER) { - emit_data->dst_type = ctx->v4f32; - emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr, - ctx->v16i8, ""); - emit_data->args[1] = bld_base->uint_bld.zero; - emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); - emit_data->arg_count = 3; - return; - } - - /* Fetch and project texture coordinates */ - coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W); - for (chan = 0; chan < 3; chan++ ) { - coords[chan] = lp_build_emit_fetch(bld_base, - emit_data->inst, 0, - chan); - if (opcode == TGSI_OPCODE_TXP) - coords[chan] = lp_build_emit_llvm_binary(bld_base, - TGSI_OPCODE_DIV, - coords[chan], - coords[3]); - } - - if (opcode == TGSI_OPCODE_TXP) - coords[3] = bld_base->base.one; - - /* Pack offsets. 
*/ - if (has_offset && - opcode != TGSI_OPCODE_TXF && - opcode != TGSI_OPCODE_TXF_LZ) { - /* The offsets are six-bit signed integers packed like this: - * X=[5:0], Y=[13:8], and Z=[21:16]. - */ - LLVMValueRef offset[3], pack; - - assert(inst->Texture.NumOffsets == 1); - - for (chan = 0; chan < 3; chan++) { - offset[chan] = lp_build_emit_fetch_texoffset(bld_base, - emit_data->inst, 0, chan); - offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan], - LLVMConstInt(ctx->i32, 0x3f, 0), ""); - if (chan) - offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan], - LLVMConstInt(ctx->i32, chan*8, 0), ""); - } - - pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], ""); - pack = LLVMBuildOr(gallivm->builder, pack, offset[2], ""); - address[count++] = pack; - } - - /* Pack LOD bias value */ - if (opcode == TGSI_OPCODE_TXB) - address[count++] = coords[3]; - if (opcode == TGSI_OPCODE_TXB2) - address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); - - /* Pack depth comparison value */ - if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) { - LLVMValueRef z; - - if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { - z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); - } else { - assert(ref_pos >= 0); - z = coords[ref_pos]; - } - - /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT, - * so the depth comparison value isn't clamped for Z16 and - * Z24 anymore. Do it manually here. - * - * It's unnecessary if the original texture format was - * Z32_FLOAT, but we don't know that here. 
- */ - if (ctx->screen->b.chip_class == VI) - z = ac_build_clamp(&ctx->ac, z); - - address[count++] = z; - } - - /* Pack user derivatives */ - if (opcode == TGSI_OPCODE_TXD) { - int param, num_src_deriv_channels, num_dst_deriv_channels; - - switch (target) { - case TGSI_TEXTURE_3D: - num_src_deriv_channels = 3; - num_dst_deriv_channels = 3; - num_deriv_channels = 3; - break; - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_RECT: - case TGSI_TEXTURE_SHADOWRECT: - case TGSI_TEXTURE_2D_ARRAY: - case TGSI_TEXTURE_SHADOW2D_ARRAY: - num_src_deriv_channels = 2; - num_dst_deriv_channels = 2; - num_deriv_channels = 2; - break; - case TGSI_TEXTURE_CUBE: - case TGSI_TEXTURE_SHADOWCUBE: - case TGSI_TEXTURE_CUBE_ARRAY: - case TGSI_TEXTURE_SHADOWCUBE_ARRAY: - /* Cube derivatives will be converted to 2D. */ - num_src_deriv_channels = 3; - num_dst_deriv_channels = 3; - num_deriv_channels = 2; - break; - case TGSI_TEXTURE_1D: - case TGSI_TEXTURE_SHADOW1D: - case TGSI_TEXTURE_1D_ARRAY: - case TGSI_TEXTURE_SHADOW1D_ARRAY: - num_src_deriv_channels = 1; - - /* 1D textures are allocated and used as 2D on GFX9. */ - if (ctx->screen->b.chip_class >= GFX9) { - num_dst_deriv_channels = 2; - num_deriv_channels = 2; - } else { - num_dst_deriv_channels = 1; - num_deriv_channels = 1; - } - break; - default: - unreachable("invalid target"); - } - - for (param = 0; param < 2; param++) { - for (chan = 0; chan < num_src_deriv_channels; chan++) - derivs[param * num_dst_deriv_channels + chan] = - lp_build_emit_fetch(bld_base, inst, param+1, chan); - - /* Fill in the rest with zeros. 
*/ - for (chan = num_src_deriv_channels; - chan < num_dst_deriv_channels; chan++) - derivs[param * num_dst_deriv_channels + chan] = - bld_base->base.zero; - } - } - - if (target == TGSI_TEXTURE_CUBE || - target == TGSI_TEXTURE_CUBE_ARRAY || - target == TGSI_TEXTURE_SHADOWCUBE || - target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) - ac_prepare_cube_coords(&ctx->ac, - opcode == TGSI_OPCODE_TXD, - target == TGSI_TEXTURE_CUBE_ARRAY || - target == TGSI_TEXTURE_SHADOWCUBE_ARRAY, - coords, derivs); - - if (opcode == TGSI_OPCODE_TXD) - for (int i = 0; i < num_deriv_channels * 2; i++) - address[count++] = derivs[i]; - - /* Pack texture coordinates */ - address[count++] = coords[0]; - if (num_coords > 1) - address[count++] = coords[1]; - if (num_coords > 2) - address[count++] = coords[2]; - - /* 1D textures are allocated and used as 2D on GFX9. */ - if (ctx->screen->b.chip_class >= GFX9) { - LLVMValueRef filler; - - /* Use 0.5, so that we don't sample the border color. */ - if (opcode == TGSI_OPCODE_TXF) - filler = bld_base->uint_bld.zero; - else - filler = LLVMConstReal(ctx->f32, 0.5); - - if (target == TGSI_TEXTURE_1D || - target == TGSI_TEXTURE_SHADOW1D) { - address[count++] = filler; - } else if (target == TGSI_TEXTURE_1D_ARRAY || - target == TGSI_TEXTURE_SHADOW1D_ARRAY) { - address[count] = address[count - 1]; - address[count - 1] = filler; - count++; - } - } - - /* Pack LOD or sample index */ - if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF) - address[count++] = coords[3]; - else if (opcode == TGSI_OPCODE_TXL2) - address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); - - if (count > 16) { - assert(!"Cannot handle more than 16 texture address parameters"); - count = 16; - } - - for (chan = 0; chan < count; chan++ ) { - address[chan] = LLVMBuildBitCast(gallivm->builder, - address[chan], ctx->i32, ""); - } - - /* Adjust the sample index according to FMASK. 
- * - * For uncompressed MSAA surfaces, FMASK should return 0x76543210, - * which is the identity mapping. Each nibble says which physical sample - * should be fetched to get that sample. - * - * For example, 0x11111100 means there are only 2 samples stored and - * the second sample covers 3/4 of the pixel. When reading samples 0 - * and 1, return physical sample 0 (determined by the first two 0s - * in FMASK), otherwise return physical sample 1. - * - * The sample index should be adjusted as follows: - * sample_index = (fmask >> (sample_index * 4)) & 0xF; - */ - if (target == TGSI_TEXTURE_2D_MSAA || - target == TGSI_TEXTURE_2D_ARRAY_MSAA) { - struct lp_build_context *uint_bld = &bld_base->uint_bld; - struct lp_build_emit_data txf_emit_data = *emit_data; - LLVMValueRef txf_address[4]; - /* We only need .xy for non-arrays, and .xyz for arrays. */ - unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3; - struct tgsi_full_instruction inst = {}; - - memcpy(txf_address, address, sizeof(txf_address)); - - /* Read FMASK using TXF_LZ. */ - inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ; - inst.Texture.Texture = target; - txf_emit_data.inst = &inst; - txf_emit_data.chan = 0; - set_tex_fetch_args(ctx, &txf_emit_data, - target, fmask_ptr, NULL, - txf_address, txf_count, 0xf); - build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data); - - /* Initialize some constants. */ - LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0); - LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0); - - /* Apply the formula. 
*/ - LLVMValueRef fmask = - LLVMBuildExtractElement(gallivm->builder, - txf_emit_data.output[0], - uint_bld->zero, ""); - - unsigned sample_chan = txf_count; /* the sample index is last */ - - LLVMValueRef sample_index4 = - LLVMBuildMul(gallivm->builder, address[sample_chan], four, ""); - - LLVMValueRef shifted_fmask = - LLVMBuildLShr(gallivm->builder, fmask, sample_index4, ""); - - LLVMValueRef final_sample = - LLVMBuildAnd(gallivm->builder, shifted_fmask, F, ""); - - /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK - * resource descriptor is 0 (invalid), - */ - LLVMValueRef fmask_desc = - LLVMBuildBitCast(gallivm->builder, fmask_ptr, - ctx->v8i32, ""); - - LLVMValueRef fmask_word1 = - LLVMBuildExtractElement(gallivm->builder, fmask_desc, - uint_bld->one, ""); - - LLVMValueRef word1_is_nonzero = - LLVMBuildICmp(gallivm->builder, LLVMIntNE, - fmask_word1, uint_bld->zero, ""); - - /* Replace the MSAA sample index. */ - address[sample_chan] = - LLVMBuildSelect(gallivm->builder, word1_is_nonzero, - final_sample, address[sample_chan], ""); - } - - if (opcode == TGSI_OPCODE_TXF || - opcode == TGSI_OPCODE_TXF_LZ) { - /* add tex offsets */ - if (inst->Texture.NumOffsets) { - struct lp_build_context *uint_bld = &bld_base->uint_bld; - const struct tgsi_texture_offset *off = inst->TexOffsets; - - assert(inst->Texture.NumOffsets == 1); - - switch (target) { - case TGSI_TEXTURE_3D: - address[2] = lp_build_add(uint_bld, address[2], - ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]); - /* fall through */ - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_RECT: - case TGSI_TEXTURE_SHADOWRECT: - case TGSI_TEXTURE_2D_ARRAY: - case TGSI_TEXTURE_SHADOW2D_ARRAY: - address[1] = - lp_build_add(uint_bld, address[1], - ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]); - /* fall through */ - case TGSI_TEXTURE_1D: - case TGSI_TEXTURE_SHADOW1D: - case TGSI_TEXTURE_1D_ARRAY: - case TGSI_TEXTURE_SHADOW1D_ARRAY: - address[0] = - 
lp_build_add(uint_bld, address[0], - ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]); - break; - /* texture offsets do not apply to other texture targets */ - } - } - } - - if (opcode == TGSI_OPCODE_TG4) { - unsigned gather_comp = 0; - - /* DMASK was repurposed for GATHER4. 4 components are always - * returned and DMASK works like a swizzle - it selects - * the component to fetch. The only valid DMASK values are - * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns - * (red,red,red,red) etc.) The ISA document doesn't mention - * this. - */ - - /* Get the component index from src1.x for Gather4. */ - if (!tgsi_is_shadow_target(target)) { - LLVMValueRef comp_imm; - struct tgsi_src_register src1 = inst->Src[1].Register; - - assert(src1.File == TGSI_FILE_IMMEDIATE); - - comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX]; - gather_comp = LLVMConstIntGetZExtValue(comp_imm); - gather_comp = CLAMP(gather_comp, 0, 3); - } - - dmask = 1 << gather_comp; - } - - set_tex_fetch_args(ctx, emit_data, target, res_ptr, - samp_ptr, address, count, dmask); -} - -/* Gather4 should follow the same rules as bilinear filtering, but the hardware - * incorrectly forces nearest filtering if the texture format is integer. - * The only effect it has on Gather4, which always returns 4 texels for - * bilinear filtering, is that the final coordinates are off by 0.5 of - * the texel size. - * - * The workaround is to subtract 0.5 from the unnormalized coordinates, - * or (0.5 / size) from the normalized coordinates. - */ -static void si_lower_gather4_integer(struct si_shader_context *ctx, - struct ac_image_args *args, - unsigned target) -{ - LLVMBuilderRef builder = ctx->gallivm.builder; - LLVMValueRef coord = args->addr; - LLVMValueRef half_texel[2]; - /* Texture coordinates start after: - * {offset, bias, z-compare, derivatives} - * Only the offset and z-compare can occur here. 
- */ - unsigned coord_vgpr_index = (int)args->offset + (int)args->compare; - int c; - - if (target == TGSI_TEXTURE_RECT || - target == TGSI_TEXTURE_SHADOWRECT) { - half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5); - } else { - struct tgsi_full_instruction txq_inst = {}; - struct lp_build_emit_data txq_emit_data = {}; - - /* Query the texture size. */ - txq_inst.Texture.Texture = target; - txq_emit_data.inst = &txq_inst; - txq_emit_data.dst_type = ctx->v4i32; - set_tex_fetch_args(ctx, &txq_emit_data, target, - args->resource, NULL, - &ctx->bld_base.uint_bld.zero, - 1, 0xf); - txq_emit(NULL, &ctx->bld_base, &txq_emit_data); - - /* Compute -0.5 / size. */ - for (c = 0; c < 2; c++) { - half_texel[c] = - LLVMBuildExtractElement(builder, txq_emit_data.output[0], - LLVMConstInt(ctx->i32, c, 0), ""); - half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, ""); - half_texel[c] = - lp_build_emit_llvm_unary(&ctx->bld_base, - TGSI_OPCODE_RCP, half_texel[c]); - half_texel[c] = LLVMBuildFMul(builder, half_texel[c], - LLVMConstReal(ctx->f32, -0.5), ""); - } - } - - for (c = 0; c < 2; c++) { - LLVMValueRef tmp; - LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0); - - tmp = LLVMBuildExtractElement(builder, coord, index, ""); - tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, ""); - tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], ""); - tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, ""); - coord = LLVMBuildInsertElement(builder, coord, tmp, index, ""); - } - - args->addr = coord; -} - -static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) -{ - struct si_shader_context *ctx = si_shader_context(bld_base); - const struct tgsi_full_instruction *inst = emit_data->inst; - struct ac_image_args args; - unsigned opcode = inst->Instruction.Opcode; - unsigned target = inst->Texture.Texture; - - if (target == TGSI_TEXTURE_BUFFER) { - 
emit_data->output[emit_data->chan] = - ac_build_buffer_load_format(&ctx->ac, - emit_data->args[0], - emit_data->args[2], - emit_data->args[1], - true); - return; - } - - memcpy(&args, emit_data->args, sizeof(args)); /* ugly */ - - args.opcode = ac_image_sample; - args.compare = tgsi_is_shadow_target(target); - args.offset = inst->Texture.NumOffsets > 0; - - switch (opcode) { - case TGSI_OPCODE_TXF: - case TGSI_OPCODE_TXF_LZ: - args.opcode = opcode == TGSI_OPCODE_TXF_LZ || - target == TGSI_TEXTURE_2D_MSAA || - target == TGSI_TEXTURE_2D_ARRAY_MSAA ? - ac_image_load : ac_image_load_mip; - args.compare = false; - args.offset = false; - break; - case TGSI_OPCODE_LODQ: - args.opcode = ac_image_get_lod; - args.compare = false; - args.offset = false; - break; - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TEX2: - case TGSI_OPCODE_TXP: - if (ctx->type != PIPE_SHADER_FRAGMENT) - args.level_zero = true; - break; - case TGSI_OPCODE_TEX_LZ: - args.level_zero = true; - break; - case TGSI_OPCODE_TXB: - case TGSI_OPCODE_TXB2: - assert(ctx->type == PIPE_SHADER_FRAGMENT); - args.bias = true; - break; - case TGSI_OPCODE_TXL: - case TGSI_OPCODE_TXL2: - args.lod = true; - break; - case TGSI_OPCODE_TXD: - args.deriv = true; - break; - case TGSI_OPCODE_TG4: - args.opcode = ac_image_gather4; - args.level_zero = true; - break; - default: - assert(0); - return; - } - - /* The hardware needs special lowering for Gather4 with integer formats. */ - if (ctx->screen->b.chip_class <= VI && - opcode == TGSI_OPCODE_TG4) { - struct tgsi_shader_info *info = &ctx->shader->selector->info; - /* This will also work with non-constant indexing because of how - * glsl_to_tgsi works and we intent to preserve that behavior. 
- */ - const unsigned src_idx = 2; - unsigned sampler = inst->Src[src_idx].Register.Index; - - assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER); - - if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT || - info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT) - si_lower_gather4_integer(ctx, &args, target); - } - - emit_data->output[emit_data->chan] = - ac_build_image_opcode(&ctx->ac, &args); -} - -static void si_llvm_emit_txqs( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef res, samples; - LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL; - - tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr); + LLVMValueRef tmp; + tmp = lp_build_intrinsic(ctx->ac.builder, "llvm.readcyclecounter", + ctx->i64, NULL, 0, 0); + tmp = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->v2i32, ""); - /* Read the samples from the descriptor directly. 
*/ - res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, ""); - samples = LLVMBuildExtractElement( - builder, res, - LLVMConstInt(ctx->i32, 3, 0), ""); - samples = LLVMBuildLShr(builder, samples, - LLVMConstInt(ctx->i32, 16, 0), ""); - samples = LLVMBuildAnd(builder, samples, - LLVMConstInt(ctx->i32, 0xf, 0), ""); - samples = LLVMBuildShl(builder, LLVMConstInt(ctx->i32, 1, 0), - samples, ""); + emit_data->output[0] = + LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_0, ""); + emit_data->output[1] = + LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_1, ""); +} - emit_data->output[emit_data->chan] = samples; +LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements) +{ + return LLVMPointerType(LLVMArrayType(elem_type, num_elements), + CONST_ADDR_SPACE); } static void si_llvm_emit_ddxy( @@ -4915,7 +3720,6 @@ static void si_llvm_emit_ddxy( struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; unsigned opcode = emit_data->info->opcode; LLVMValueRef val; int idx; @@ -4931,9 +3735,8 @@ static void si_llvm_emit_ddxy( /* for DDX we want to next X pixel, DDY next Y pixel. */ idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 
1 : 2; - val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, ""); - val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute, - mask, idx, ctx->lds, val); + val = ac_to_integer(&ctx->ac, emit_data->args[0]); + val = ac_build_ddxy(&ctx->ac, mask, idx, val); emit_data->output[emit_data->chan] = val; } @@ -4947,18 +3750,17 @@ static LLVMValueRef si_llvm_emit_ddxy_interp( LLVMValueRef interp_ij) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef result[4], a; unsigned i; for (i = 0; i < 2; i++) { - a = LLVMBuildExtractElement(gallivm->builder, interp_ij, + a = LLVMBuildExtractElement(ctx->ac.builder, interp_ij, LLVMConstInt(ctx->i32, i, 0), ""); result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a); result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a); } - return lp_build_gather_values(gallivm, result, 4); + return lp_build_gather_values(&ctx->gallivm, result, 4); } static void interp_fetch_args( @@ -4966,7 +3768,6 @@ static void interp_fetch_args( struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; const struct tgsi_full_instruction *inst = emit_data->inst; if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { @@ -4988,19 +3789,44 @@ static void interp_fetch_args( */ sample_id = lp_build_emit_fetch(bld_base, emit_data->inst, 1, TGSI_CHAN_X); - sample_id = LLVMBuildBitCast(gallivm->builder, sample_id, - ctx->i32, ""); - sample_position = load_sample_position(ctx, sample_id); + sample_id = ac_to_integer(&ctx->ac, sample_id); + + /* Section 8.13.2 (Interpolation Functions) of the OpenGL Shading + * Language 4.50 spec says about interpolateAtSample: + * + * "Returns the value of the input interpolant variable at + * the location of sample number sample. 
If multisample + * buffers are not available, the input variable will be + * evaluated at the center of the pixel. If sample sample + * does not exist, the position used to interpolate the + * input variable is undefined." + * + * This means that sample_id values outside of the valid are + * in fact valid input, and the usual mechanism for loading the + * sample position doesn't work. + */ + if (ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center) { + LLVMValueRef center[4] = { + LLVMConstReal(ctx->f32, 0.5), + LLVMConstReal(ctx->f32, 0.5), + ctx->ac.f32_0, + ctx->ac.f32_0, + }; + + sample_position = lp_build_gather_values(&ctx->gallivm, center, 4); + } else { + sample_position = load_sample_position(ctx, sample_id); + } - emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder, + emit_data->args[0] = LLVMBuildExtractElement(ctx->ac.builder, sample_position, - LLVMConstInt(ctx->i32, 0, 0), ""); + ctx->i32_0, ""); - emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, ""); - emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder, + emit_data->args[0] = LLVMBuildFSub(ctx->ac.builder, emit_data->args[0], halfval, ""); + emit_data->args[1] = LLVMBuildExtractElement(ctx->ac.builder, sample_position, - LLVMConstInt(ctx->i32, 1, 0), ""); - emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, ""); + ctx->i32_1, ""); + emit_data->args[1] = LLVMBuildFSub(ctx->ac.builder, emit_data->args[1], halfval, ""); emit_data->arg_count = 2; } } @@ -5011,20 +3837,41 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, { struct si_shader_context *ctx = si_shader_context(bld_base); struct si_shader *shader = ctx->shader; - struct gallivm_state *gallivm = bld_base->base.gallivm; - struct lp_build_context *uint = &bld_base->uint_bld; + const struct tgsi_shader_info *info = &shader->selector->info; LLVMValueRef interp_param; const struct tgsi_full_instruction *inst = emit_data->inst; 
- int input_index = inst->Src[0].Register.Index; + const struct tgsi_full_src_register *input = &inst->Src[0]; + int input_base, input_array_size; int chan; int i; - LLVMValueRef attr_number; - LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK); + LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK); + LLVMValueRef array_idx; int interp_param_idx; - unsigned interp = shader->selector->info.input_interpolate[input_index]; + unsigned interp; unsigned location; - assert(inst->Src[0].Register.File == TGSI_FILE_INPUT); + assert(input->Register.File == TGSI_FILE_INPUT); + + if (input->Register.Indirect) { + unsigned array_id = input->Indirect.ArrayID; + + if (array_id) { + input_base = info->input_array_first[array_id]; + input_array_size = info->input_array_last[array_id] - input_base + 1; + } else { + input_base = inst->Src[0].Register.Index; + input_array_size = info->num_inputs - input_base; + } + + array_idx = si_get_indirect_index(ctx, &input->Indirect, + 1, input->Register.Index - input_base); + } else { + input_base = inst->Src[0].Register.Index; + input_array_size = 1; + array_idx = ctx->i32_0; + } + + interp = shader->selector->info.input_interpolate[input_base]; if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) @@ -5040,8 +3887,6 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, else interp_param = NULL; - attr_number = LLVMConstInt(ctx->i32, input_index, 0); - if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { LLVMValueRef ij_out[2]; @@ -5058,129 +3903,141 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, for (i = 0; i < 2; i++) { LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0); LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0); - LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder, + LLVMValueRef ddx_el = 
LLVMBuildExtractElement(ctx->ac.builder, ddxy_out, ix_ll, ""); - LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder, + LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, ddxy_out, iy_ll, ""); - LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder, + LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ix_ll, ""); LLVMValueRef temp1, temp2; - interp_el = LLVMBuildBitCast(gallivm->builder, interp_el, - ctx->f32, ""); + interp_el = ac_to_float(&ctx->ac, interp_el); - temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], ""); + temp1 = LLVMBuildFMul(ctx->ac.builder, ddx_el, emit_data->args[0], ""); - temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, ""); + temp1 = LLVMBuildFAdd(ctx->ac.builder, temp1, interp_el, ""); - temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], ""); + temp2 = LLVMBuildFMul(ctx->ac.builder, ddy_el, emit_data->args[1], ""); - ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, ""); + ij_out[i] = LLVMBuildFAdd(ctx->ac.builder, temp2, temp1, ""); } - interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2); + interp_param = lp_build_gather_values(&ctx->gallivm, ij_out, 2); } + if (interp_param) + interp_param = ac_to_float(&ctx->ac, interp_param); + for (chan = 0; chan < 4; chan++) { - LLVMValueRef llvm_chan; - unsigned schan; - - schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan); - llvm_chan = LLVMConstInt(ctx->i32, schan, 0); - - if (interp_param) { - interp_param = LLVMBuildBitCast(gallivm->builder, - interp_param, LLVMVectorType(ctx->f32, 2), ""); - LLVMValueRef i = LLVMBuildExtractElement( - gallivm->builder, interp_param, uint->zero, ""); - LLVMValueRef j = LLVMBuildExtractElement( - gallivm->builder, interp_param, uint->one, ""); - emit_data->output[chan] = ac_build_fs_interp(&ctx->ac, - llvm_chan, attr_number, params, - i, j); - } else { - emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac, - 
LLVMConstInt(ctx->i32, 2, 0), /* P0 */ - llvm_chan, attr_number, params); + LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size)); + unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan); + + for (unsigned idx = 0; idx < input_array_size; ++idx) { + LLVMValueRef v, i = NULL, j = NULL; + + if (interp_param) { + i = LLVMBuildExtractElement( + ctx->ac.builder, interp_param, ctx->i32_0, ""); + j = LLVMBuildExtractElement( + ctx->ac.builder, interp_param, ctx->i32_1, ""); + } + v = si_build_fs_interp(ctx, input_base + idx, schan, + prim_mask, i, j); + + gather = LLVMBuildInsertElement(ctx->ac.builder, + gather, v, LLVMConstInt(ctx->i32, idx, false), ""); } + + emit_data->output[chan] = LLVMBuildExtractElement( + ctx->ac.builder, gather, array_idx, ""); } } -static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx, - LLVMValueRef value) +static void vote_all_emit( + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMValueRef args[3] = { - value, - ctx->i32_0, - LLVMConstInt(ctx->i32, LLVMIntNE, 0) - }; - - if (LLVMTypeOf(value) != ctx->i32) - args[0] = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, ""); + struct si_shader_context *ctx = si_shader_context(bld_base); - return lp_build_intrinsic(gallivm->builder, - "llvm.amdgcn.icmp.i32", - ctx->i64, args, 3, - LP_FUNC_ATTR_NOUNWIND | - LP_FUNC_ATTR_READNONE | - LP_FUNC_ATTR_CONVERGENT); + LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, emit_data->args[0]); + emit_data->output[emit_data->chan] = + LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); } -static void vote_all_emit( +static void vote_any_emit( const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = &ctx->gallivm; - 
LLVMValueRef active_set, vote_set; - LLVMValueRef tmp; - active_set = si_emit_ballot(ctx, ctx->i32_1); - vote_set = si_emit_ballot(ctx, emit_data->args[0]); + LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, emit_data->args[0]); + emit_data->output[emit_data->chan] = + LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); +} + +static void vote_eq_emit( + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); - tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, ""); + LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, emit_data->args[0]); emit_data->output[emit_data->chan] = - LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, ""); + LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); } -static void vote_any_emit( +static void ballot_emit( const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMValueRef vote_set; + LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp; - vote_set = si_emit_ballot(ctx, emit_data->args[0]); + tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); + tmp = ac_build_ballot(&ctx->ac, tmp); + tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, ""); - tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE, - vote_set, LLVMConstInt(ctx->i64, 0, 0), ""); - emit_data->output[emit_data->chan] = - LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, ""); + emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, ""); + emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, ""); } -static void vote_eq_emit( +static void read_invoc_fetch_args( + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + emit_data->args[0] = lp_build_emit_fetch(bld_base, 
emit_data->inst, + 0, emit_data->src_chan); + + /* Always read the source invocation (= lane) from the X channel. */ + emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst, + 1, TGSI_CHAN_X); + emit_data->arg_count = 2; +} + +static void read_lane_emit( const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMValueRef active_set, vote_set; - LLVMValueRef all, none, tmp; - active_set = si_emit_ballot(ctx, ctx->i32_1); - vote_set = si_emit_ballot(ctx, emit_data->args[0]); + /* We currently have no other way to prevent LLVM from lifting the icmp + * calls to a dominating basic block. + */ + ac_build_optimization_barrier(&ctx->ac, &emit_data->args[0]); + + for (unsigned i = 0; i < emit_data->arg_count; ++i) + emit_data->args[i] = ac_to_integer(&ctx->ac, emit_data->args[i]); - all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, ""); - none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, - vote_set, LLVMConstInt(ctx->i64, 0, 0), ""); - tmp = LLVMBuildOr(gallivm->builder, all, none, ""); emit_data->output[emit_data->chan] = - LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, ""); + ac_build_intrinsic(&ctx->ac, action->intr_name, + ctx->i32, emit_data->args, emit_data->arg_count, + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); } static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, @@ -5199,29 +4056,24 @@ static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, } /* Emit one vertex from the geometry shader */ -static void si_llvm_emit_vertex( - const struct lp_build_tgsi_action *action, - struct lp_build_tgsi_context *bld_base, - struct lp_build_emit_data *emit_data) +static void si_llvm_emit_vertex(struct ac_shader_abi *abi, + unsigned stream, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = 
si_shader_context(bld_base); - struct lp_build_context *uint = &bld_base->uint_bld; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct tgsi_shader_info *info = &ctx->shader->selector->info; + struct lp_build_context *uint = &ctx->bld_base.uint_bld; struct si_shader *shader = ctx->shader; - struct tgsi_shader_info *info = &shader->selector->info; - struct gallivm_state *gallivm = bld_base->base.gallivm; struct lp_build_if_state if_state; LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, - SI_PARAM_GS2VS_OFFSET); + ctx->param_gs2vs_offset); LLVMValueRef gs_next_vertex; - LLVMValueRef can_emit, kill; + LLVMValueRef can_emit; unsigned chan, offset; int i; - unsigned stream; - - stream = si_llvm_get_stream(bld_base, emit_data); /* Write vertex attribute values to GSVS ring */ - gs_next_vertex = LLVMBuildLoad(gallivm->builder, + gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], ""); @@ -5233,31 +4085,25 @@ static void si_llvm_emit_vertex( * further memory loads and may allow LLVM to skip to the end * altogether. 
*/ - can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex, + can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, LLVMConstInt(ctx->i32, shader->selector->gs_max_out_vertices, 0), ""); bool use_kill = !info->writes_memory; if (use_kill) { - kill = lp_build_select(&bld_base->base, can_emit, - LLVMConstReal(ctx->f32, 1.0f), - LLVMConstReal(ctx->f32, -1.0f)); - - ac_build_kill(&ctx->ac, kill); + ac_build_kill_if_false(&ctx->ac, can_emit); } else { - lp_build_if(&if_state, gallivm, can_emit); + lp_build_if(&if_state, &ctx->gallivm, can_emit); } offset = 0; for (i = 0; i < info->num_outputs; i++) { - LLVMValueRef *out_ptr = ctx->outputs[i]; - for (chan = 0; chan < 4; chan++) { if (!(info->output_usagemask[i] & (1 << chan)) || ((info->output_streams[i] >> (2 * chan)) & 3) != stream) continue; - LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""); + LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); LLVMValueRef voffset = LLVMConstInt(ctx->i32, offset * shader->selector->gs_max_out_vertices, 0); @@ -5266,7 +4112,7 @@ static void si_llvm_emit_vertex( voffset = lp_build_add(uint, voffset, gs_next_vertex); voffset = lp_build_mul_imm(uint, voffset, 4); - out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, ""); + out_val = ac_to_integer(&ctx->ac, out_val); ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], @@ -5277,17 +4123,29 @@ static void si_llvm_emit_vertex( } gs_next_vertex = lp_build_add(uint, gs_next_vertex, - LLVMConstInt(ctx->i32, 1, 0)); + ctx->i32_1); - LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]); + LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); /* Signal vertex emission */ ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), - LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID)); + si_get_gs_wave_id(ctx)); if (!use_kill) lp_build_endif(&if_state); } +/* Emit one vertex 
from the geometry shader */ +static void si_tgsi_emit_vertex( + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + unsigned stream = si_llvm_get_stream(bld_base, emit_data); + + si_llvm_emit_vertex(&ctx->abi, stream, ctx->outputs[0]); +} + /* Cut one primitive from the geometry shader */ static void si_llvm_emit_primitive( const struct lp_build_tgsi_action *action, @@ -5300,7 +4158,7 @@ static void si_llvm_emit_primitive( /* Signal primitive cut */ stream = si_llvm_get_stream(bld_base, emit_data); ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), - LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID)); + si_get_gs_wave_id(ctx)); } static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, @@ -5308,30 +4166,22 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; /* SI only (thanks to a hw bug workaround): * The real barrier instruction isn’t needed, because an entire patch * always fits into a single wave. */ - if (HAVE_LLVM >= 0x0309 && - ctx->screen->b.chip_class == SI && + if (ctx->screen->info.chip_class == SI && ctx->type == PIPE_SHADER_TESS_CTRL) { - emit_waitcnt(ctx, LGKM_CNT & VM_CNT); + si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT); return; } - lp_build_intrinsic(gallivm->builder, - HAVE_LLVM >= 0x0309 ? 
"llvm.amdgcn.s.barrier" - : "llvm.AMDGPU.barrier.local", + lp_build_intrinsic(ctx->ac.builder, + "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT); } -static const struct lp_build_tgsi_action tex_action = { - .fetch_args = tex_fetch_args, - .emit = build_tex_intrinsic, -}; - static const struct lp_build_tgsi_action interp_action = { .fetch_args = interp_fetch_args, .emit = build_interp_intrinsic, @@ -5340,17 +4190,16 @@ static const struct lp_build_tgsi_action interp_action = { static void si_create_function(struct si_shader_context *ctx, const char *name, LLVMTypeRef *returns, unsigned num_returns, - LLVMTypeRef *params, unsigned num_params, - int last_sgpr) + struct si_function_info *fninfo, + unsigned max_workgroup_size) { int i; si_llvm_create_func(ctx, name, returns, num_returns, - params, num_params); - si_llvm_shader_type(ctx->main_fn, ctx->type); + fninfo->types, fninfo->num_params); ctx->return_value = LLVMGetUndef(ctx->return_type); - for (i = 0; i <= last_sgpr; ++i) { + for (i = 0; i < fninfo->num_sgpr_params; ++i) { LLVMValueRef P = LLVMGetParam(ctx->main_fn, i); /* The combination of: @@ -5368,11 +4217,20 @@ static void si_create_function(struct si_shader_context *ctx, lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG); } + for (i = 0; i < fninfo->num_params; ++i) { + if (fninfo->assign[i]) + *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i); + } + + if (max_workgroup_size) { + si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size", + max_workgroup_size); + } LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "no-signed-zeros-fp-math", "true"); - if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) { + if (ctx->screen->debug_flags & DBG(UNSAFE_MATH)) { /* These were copied from some LLVM test. 
*/ LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "less-precise-fpmad", @@ -5391,66 +4249,46 @@ static void si_create_function(struct si_shader_context *ctx, static void declare_streamout_params(struct si_shader_context *ctx, struct pipe_stream_output_info *so, - LLVMTypeRef *params, LLVMTypeRef i32, - unsigned *num_params) + struct si_function_info *fninfo) { int i; /* Streamout SGPRs. */ if (so->num_outputs) { if (ctx->type != PIPE_SHADER_TESS_EVAL) - params[ctx->param_streamout_config = (*num_params)++] = i32; + ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); else - ctx->param_streamout_config = *num_params - 1; + ctx->param_streamout_config = fninfo->num_params - 1; - params[ctx->param_streamout_write_index = (*num_params)++] = i32; + ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); } /* A streamout buffer offset is loaded if the stride is non-zero. */ for (i = 0; i < 4; i++) { if (!so->stride[i]) continue; - params[ctx->param_streamout_offset[i] = (*num_params)++] = i32; + ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); } } -static unsigned llvm_get_type_size(LLVMTypeRef type) +static unsigned si_get_max_workgroup_size(const struct si_shader *shader) { - LLVMTypeKind kind = LLVMGetTypeKind(type); + switch (shader->selector->type) { + case PIPE_SHADER_TESS_CTRL: + /* Return this so that LLVM doesn't remove s_barrier + * instructions on chips where we use s_barrier. */ + return shader->selector->screen->info.chip_class >= CIK ? 128 : 64; + + case PIPE_SHADER_GEOMETRY: + return shader->selector->screen->info.chip_class >= GFX9 ? 
128 : 64; + + case PIPE_SHADER_COMPUTE: + break; /* see below */ - switch (kind) { - case LLVMIntegerTypeKind: - return LLVMGetIntTypeWidth(type) / 8; - case LLVMFloatTypeKind: - return 4; - case LLVMPointerTypeKind: - return 8; - case LLVMVectorTypeKind: - return LLVMGetVectorSize(type) * - llvm_get_type_size(LLVMGetElementType(type)); - case LLVMArrayTypeKind: - return LLVMGetArrayLength(type) * - llvm_get_type_size(LLVMGetElementType(type)); default: - assert(0); return 0; } -} - -static void declare_tess_lds(struct si_shader_context *ctx) -{ - struct gallivm_state *gallivm = &ctx->gallivm; - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - struct lp_build_context *uint = &bld_base->uint_bld; - - unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768; - ctx->lds = LLVMBuildIntToPtr(gallivm->builder, uint->zero, - LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE), - "tess_lds"); -} -static unsigned si_get_max_workgroup_size(struct si_shader *shader) -{ const unsigned *properties = shader->selector->info.properties; unsigned max_work_group_size = properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * @@ -5466,175 +4304,379 @@ static unsigned si_get_max_workgroup_size(struct si_shader *shader) return max_work_group_size; } +static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, + struct si_function_info *fninfo, + bool assign_params) +{ + LLVMTypeRef const_shader_buf_type; + + if (ctx->shader->selector->info.const_buffers_declared == 1 && + ctx->shader->selector->info.shader_buffers_declared == 0) + const_shader_buf_type = ctx->f32; + else + const_shader_buf_type = ctx->v4i32; + + unsigned const_and_shader_buffers = + add_arg(fninfo, ARG_SGPR, + si_const_array(const_shader_buf_type, 0)); + + unsigned samplers_and_images = + add_arg(fninfo, ARG_SGPR, + si_const_array(ctx->v8i32, + SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2)); + + if (assign_params) { + ctx->param_const_and_shader_buffers = 
const_and_shader_buffers; + ctx->param_samplers_and_images = samplers_and_images; + } +} + +static void declare_global_desc_pointers(struct si_shader_context *ctx, + struct si_function_info *fninfo) +{ + ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR, + si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS)); + ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR, + si_const_array(ctx->v8i32, 0)); +} + +static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx, + struct si_function_info *fninfo) +{ + ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR, + si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS)); + add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex); + add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance); + add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id); + ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32); +} + +static void declare_vs_input_vgprs(struct si_shader_context *ctx, + struct si_function_info *fninfo, + unsigned *num_prolog_vgprs) +{ + struct si_shader *shader = ctx->shader; + + add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id); + if (shader->key.as_ls) { + ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32); + add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); + } else { + add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); + ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32); + } + add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */ + + if (!shader->is_gs_copy_shader) { + /* Vertex load indices. 
*/ + ctx->param_vertex_index0 = fninfo->num_params; + for (unsigned i = 0; i < shader->selector->info.num_inputs; i++) + add_arg(fninfo, ARG_VGPR, ctx->i32); + *num_prolog_vgprs += shader->selector->info.num_inputs; + } +} + +static void declare_tes_input_vgprs(struct si_shader_context *ctx, + struct si_function_info *fninfo) +{ + ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32); + ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32); + ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32); + ctx->param_tes_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32); +} + +enum { + /* Convenient merged shader definitions. */ + SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES, + SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, +}; + static void create_function(struct si_shader_context *ctx) { - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - struct gallivm_state *gallivm = bld_base->base.gallivm; struct si_shader *shader = ctx->shader; - LLVMTypeRef params[SI_NUM_PARAMS + SI_MAX_ATTRIBS], v3i32; + struct si_function_info fninfo; LLVMTypeRef returns[16+32*4]; - unsigned i, last_sgpr, num_params, num_return_sgprs; + unsigned i, num_return_sgprs; unsigned num_returns = 0; unsigned num_prolog_vgprs = 0; + unsigned type = ctx->type; + unsigned vs_blit_property = + shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; + + si_init_function_info(&fninfo); - v3i32 = LLVMVectorType(ctx->i32, 3); + /* Set MERGED shaders. 
*/ + if (ctx->screen->info.chip_class >= GFX9) { + if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL) + type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ + else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY) + type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; + } - params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS); - params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS); - params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS); - params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES); - params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS); + LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3); - switch (ctx->type) { + switch (type) { case PIPE_SHADER_VERTEX: - params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_MAX_ATTRIBS); - params[SI_PARAM_BASE_VERTEX] = ctx->i32; - params[SI_PARAM_START_INSTANCE] = ctx->i32; - params[SI_PARAM_DRAWID] = ctx->i32; - num_params = SI_PARAM_DRAWID+1; + declare_global_desc_pointers(ctx, &fninfo); + + if (vs_blit_property) { + ctx->param_vs_blit_inputs = fninfo.num_params; + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */ + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */ + add_arg(&fninfo, ARG_SGPR, ctx->f32); /* depth */ + + if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { + add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color0 */ + add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color1 */ + add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color2 */ + add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color3 */ + } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { + add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */ + add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */ + add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */ + add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */ + add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */ + add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */ + } + 
+ /* VGPRs */ + declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); + break; + } + + declare_per_stage_desc_pointers(ctx, &fninfo, true); + declare_vs_specific_input_sgprs(ctx, &fninfo); if (shader->key.as_es) { - params[ctx->param_es2gs_offset = num_params++] = ctx->i32; + ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); } else if (shader->key.as_ls) { - params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32; - num_params = SI_PARAM_LS_OUT_LAYOUT+1; + /* no extra parameters */ } else { if (shader->is_gs_copy_shader) { - num_params = SI_PARAM_RW_BUFFERS+1; - } else { - params[SI_PARAM_VS_STATE_BITS] = ctx->i32; - num_params = SI_PARAM_VS_STATE_BITS+1; + fninfo.num_params = ctx->param_rw_buffers + 1; + fninfo.num_sgpr_params = fninfo.num_params; } /* The locations of the other parameters are assigned dynamically. */ declare_streamout_params(ctx, &shader->selector->so, - params, ctx->i32, &num_params); + &fninfo); } - last_sgpr = num_params-1; - /* VGPRs */ - params[ctx->param_vertex_id = num_params++] = ctx->i32; - params[ctx->param_rel_auto_id = num_params++] = ctx->i32; - params[ctx->param_vs_prim_id = num_params++] = ctx->i32; - params[ctx->param_instance_id = num_params++] = ctx->i32; - - if (!shader->is_gs_copy_shader) { - /* Vertex load indices. */ - ctx->param_vertex_index0 = num_params; - - for (i = 0; i < shader->selector->info.num_inputs; i++) - params[num_params++] = ctx->i32; - - num_prolog_vgprs += shader->selector->info.num_inputs; - - /* PrimitiveID output. 
*/ - if (!shader->key.as_es && !shader->key.as_ls) - for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) - returns[num_returns++] = ctx->f32; - } + declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); break; - case PIPE_SHADER_TESS_CTRL: - params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32; - params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32; - params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32; - params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32; - params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32; - params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32; - last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; + case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */ + declare_global_desc_pointers(ctx, &fninfo); + declare_per_stage_desc_pointers(ctx, &fninfo, true); + ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* VGPRs */ - params[SI_PARAM_PATCH_ID] = ctx->i32; - params[SI_PARAM_REL_IDS] = ctx->i32; - num_params = SI_PARAM_REL_IDS+1; + ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32); + ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32); - /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are + /* param_tcs_offchip_offset and param_tcs_factor_offset are * placed after the user SGPRs. 
*/ - for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++) + for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) returns[num_returns++] = ctx->i32; /* SGPRs */ - - for (i = 0; i < 3; i++) + for (i = 0; i < 11; i++) returns[num_returns++] = ctx->f32; /* VGPRs */ break; + case SI_SHADER_MERGED_VERTEX_TESSCTRL: + /* Merged stages have 8 system SGPRs at the beginning. */ + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* SPI_SHADER_USER_DATA_ADDR_LO_HS */ + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* SPI_SHADER_USER_DATA_ADDR_HI_HS */ + ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ + + declare_global_desc_pointers(ctx, &fninfo); + declare_per_stage_desc_pointers(ctx, &fninfo, + ctx->type == PIPE_SHADER_VERTEX); + declare_vs_specific_input_sgprs(ctx, &fninfo); + + ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ + + declare_per_stage_desc_pointers(ctx, &fninfo, + ctx->type == PIPE_SHADER_TESS_CTRL); + + /* VGPRs (first TCS, then VS) */ + ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32); + ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32); + + if (ctx->type == PIPE_SHADER_VERTEX) { + declare_vs_input_vgprs(ctx, &fninfo, + &num_prolog_vgprs); + + /* LS return values are inputs to the TCS main shader part. 
*/ + for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) + returns[num_returns++] = ctx->i32; /* SGPRs */ + for (i = 0; i < 2; i++) + returns[num_returns++] = ctx->f32; /* VGPRs */ + } else { + /* TCS return values are inputs to the TCS epilog. + * + * param_tcs_offchip_offset, param_tcs_factor_offset, + * param_tcs_offchip_layout, and param_rw_buffers + * should be passed to the epilog. + */ + for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++) + returns[num_returns++] = ctx->i32; /* SGPRs */ + for (i = 0; i < 11; i++) + returns[num_returns++] = ctx->f32; /* VGPRs */ + } + break; + + case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: + /* Merged stages have 8 system SGPRs at the beginning. */ + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_USER_DATA_ADDR_LO_GS) */ + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_USER_DATA_ADDR_HI_GS) */ + ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */ + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ + + declare_global_desc_pointers(ctx, &fninfo); + declare_per_stage_desc_pointers(ctx, &fninfo, + (ctx->type == PIPE_SHADER_VERTEX || + ctx->type == PIPE_SHADER_TESS_EVAL)); + if (ctx->type == PIPE_SHADER_VERTEX) { + declare_vs_specific_input_sgprs(ctx, &fninfo); + } else { + /* TESS_EVAL (and also GEOMETRY): + * Declare as many input SGPRs as the VS has. 
*/ + ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ + ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ + } + + declare_per_stage_desc_pointers(ctx, &fninfo, + ctx->type == PIPE_SHADER_GEOMETRY); + + /* VGPRs (first GS, then VS/TES) */ + ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); + ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id); + ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); + + if (ctx->type == PIPE_SHADER_VERTEX) { + declare_vs_input_vgprs(ctx, &fninfo, + &num_prolog_vgprs); + } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { + declare_tes_input_vgprs(ctx, &fninfo); + } + + if (ctx->type == PIPE_SHADER_VERTEX || + ctx->type == PIPE_SHADER_TESS_EVAL) { + /* ES return values are inputs to GS. 
*/ + for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++) + returns[num_returns++] = ctx->i32; /* SGPRs */ + for (i = 0; i < 5; i++) + returns[num_returns++] = ctx->f32; /* VGPRs */ + } + break; + case PIPE_SHADER_TESS_EVAL: - params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32; - num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1; + declare_global_desc_pointers(ctx, &fninfo); + declare_per_stage_desc_pointers(ctx, &fninfo, true); + ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); if (shader->key.as_es) { - params[ctx->param_oc_lds = num_params++] = ctx->i32; - params[num_params++] = ctx->i32; - params[ctx->param_es2gs_offset = num_params++] = ctx->i32; + ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); } else { - params[num_params++] = ctx->i32; + add_arg(&fninfo, ARG_SGPR, ctx->i32); declare_streamout_params(ctx, &shader->selector->so, - params, ctx->i32, &num_params); - params[ctx->param_oc_lds = num_params++] = ctx->i32; + &fninfo); + ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); } - last_sgpr = num_params - 1; /* VGPRs */ - params[ctx->param_tes_u = num_params++] = ctx->f32; - params[ctx->param_tes_v = num_params++] = ctx->f32; - params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32; - params[ctx->param_tes_patch_id = num_params++] = ctx->i32; - - /* PrimitiveID output. 
*/ - if (!shader->key.as_es) - for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) - returns[num_returns++] = ctx->f32; + declare_tes_input_vgprs(ctx, &fninfo); break; case PIPE_SHADER_GEOMETRY: - params[SI_PARAM_GS2VS_OFFSET] = ctx->i32; - params[SI_PARAM_GS_WAVE_ID] = ctx->i32; - last_sgpr = SI_PARAM_GS_WAVE_ID; + declare_global_desc_pointers(ctx, &fninfo); + declare_per_stage_desc_pointers(ctx, &fninfo, true); + ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* VGPRs */ - params[SI_PARAM_VTX0_OFFSET] = ctx->i32; - params[SI_PARAM_VTX1_OFFSET] = ctx->i32; - params[SI_PARAM_PRIMITIVE_ID] = ctx->i32; - params[SI_PARAM_VTX2_OFFSET] = ctx->i32; - params[SI_PARAM_VTX3_OFFSET] = ctx->i32; - params[SI_PARAM_VTX4_OFFSET] = ctx->i32; - params[SI_PARAM_VTX5_OFFSET] = ctx->i32; - params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32; - num_params = SI_PARAM_GS_INSTANCE_ID+1; + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[0]); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[1]); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[2]); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[3]); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[4]); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[5]); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id); break; case PIPE_SHADER_FRAGMENT: - params[SI_PARAM_ALPHA_REF] = ctx->f32; - params[SI_PARAM_PRIM_MASK] = ctx->i32; - last_sgpr = SI_PARAM_PRIM_MASK; - params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32; - params[SI_PARAM_PERSP_CENTER] = ctx->v2i32; - params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32; - params[SI_PARAM_PERSP_PULL_MODEL] = v3i32; - params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32; - params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32; - params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32; - 
params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32; - params[SI_PARAM_POS_X_FLOAT] = ctx->f32; - params[SI_PARAM_POS_Y_FLOAT] = ctx->f32; - params[SI_PARAM_POS_Z_FLOAT] = ctx->f32; - params[SI_PARAM_POS_W_FLOAT] = ctx->f32; - params[SI_PARAM_FRONT_FACE] = ctx->i32; + declare_global_desc_pointers(ctx, &fninfo); + declare_per_stage_desc_pointers(ctx, &fninfo, true); + add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF); + add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK); + + add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE); + add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER); + add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID); + add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL); + add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE); + add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER); + add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID); + add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX); + add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, + &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT); + add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, + &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT); + add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, + &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT); + add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, + &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT); + add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32, + &ctx->abi.front_face, SI_PARAM_FRONT_FACE); shader->info.face_vgpr_index = 20; - params[SI_PARAM_ANCILLARY] = ctx->i32; - params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32; - params[SI_PARAM_POS_FIXED_PT] = ctx->i32; - num_params = SI_PARAM_POS_FIXED_PT+1; + add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32, + &ctx->abi.ancillary, SI_PARAM_ANCILLARY); + shader->info.ancillary_vgpr_index = 21; + add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 
+ &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE); + add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT); /* Color inputs from the prolog. */ if (shader->selector->info.colors_read) { unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read); - assert(num_params + num_color_elements <= ARRAY_SIZE(params)); + assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types)); for (i = 0; i < num_color_elements; i++) - params[num_params++] = ctx->f32; + add_arg(&fninfo, ARG_VGPR, ctx->f32); num_prolog_vgprs += num_color_elements; } @@ -5660,23 +4702,28 @@ static void create_function(struct si_shader_context *ctx) break; case PIPE_SHADER_COMPUTE: - params[SI_PARAM_GRID_SIZE] = v3i32; - params[SI_PARAM_BLOCK_SIZE] = v3i32; - params[SI_PARAM_BLOCK_ID] = v3i32; - last_sgpr = SI_PARAM_BLOCK_ID; + declare_global_desc_pointers(ctx, &fninfo); + declare_per_stage_desc_pointers(ctx, &fninfo, true); + if (shader->selector->info.uses_grid_size) + ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32); + if (shader->selector->info.uses_block_size) + ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32); + + for (i = 0; i < 3; i++) { + ctx->param_block_id[i] = -1; + if (shader->selector->info.uses_block_id[i]) + ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32); + } - params[SI_PARAM_THREAD_ID] = v3i32; - num_params = SI_PARAM_THREAD_ID + 1; + ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32); break; default: assert(0 && "unimplemented shader"); return; } - assert(num_params <= ARRAY_SIZE(params)); - - si_create_function(ctx, "main", returns, num_returns, params, - num_params, last_sgpr); + si_create_function(ctx, "main", returns, num_returns, &fninfo, + si_get_max_workgroup_size(shader)); /* Reserve register locations for VGPR inputs the PS prolog may need. 
*/ if (ctx->type == PIPE_SHADER_FRAGMENT && @@ -5690,42 +4737,27 @@ static void create_function(struct si_shader_context *ctx) S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) | S_0286D0_FRONT_FACE_ENA(1) | + S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1)); - } else if (ctx->type == PIPE_SHADER_COMPUTE) { - si_llvm_add_attribute(ctx->main_fn, - "amdgpu-max-work-group-size", - si_get_max_workgroup_size(shader)); } shader->info.num_input_sgprs = 0; shader->info.num_input_vgprs = 0; - for (i = 0; i <= last_sgpr; ++i) - shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4; + for (i = 0; i < fninfo.num_sgpr_params; ++i) + shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4; - for (; i < num_params; ++i) - shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4; + for (; i < fninfo.num_params; ++i) + shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4; assert(shader->info.num_input_vgprs >= num_prolog_vgprs); shader->info.num_input_vgprs -= num_prolog_vgprs; - if (!ctx->screen->has_ds_bpermute && - bld_base->info && - (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 || - bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0)) - ctx->lds = - LLVMAddGlobalInAddressSpace(gallivm->module, - LLVMArrayType(ctx->i32, 64), - "ddxy_lds", - LOCAL_ADDR_SPACE); - - if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.as_ls) || - ctx->type == PIPE_SHADER_TESS_CTRL) - declare_tess_lds(ctx); + if (shader->key.as_ls || + ctx->type == PIPE_SHADER_TESS_CTRL || + /* GFX9 has the ESGS ring buffer in LDS. 
*/ + type == SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY) + ac_declare_lds_as_pointer(&ctx->ac); } /** @@ -5734,38 +4766,33 @@ static void create_function(struct si_shader_context *ctx) */ static void preload_ring_buffers(struct si_shader_context *ctx) { - struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; - LLVMBuilderRef builder = gallivm->builder; + LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, - SI_PARAM_RW_BUFFERS); + ctx->param_rw_buffers); - if ((ctx->type == PIPE_SHADER_VERTEX && - ctx->shader->key.as_es) || - (ctx->type == PIPE_SHADER_TESS_EVAL && - ctx->shader->key.as_es) || - ctx->type == PIPE_SHADER_GEOMETRY) { + if (ctx->screen->info.chip_class <= VI && + (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) { unsigned ring = ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS; LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0); ctx->esgs_ring = - ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset); + ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); } if (ctx->shader->is_gs_copy_shader) { LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); ctx->gsvs_ring[0] = - ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset); + ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); } else if (ctx->type == PIPE_SHADER_GEOMETRY) { const struct si_shader_selector *sel = ctx->shader->selector; - struct lp_build_context *uint = &ctx->bld_base.uint_bld; LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); LLVMValueRef base_ring; - base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset); + base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); /* The conceptual layout of the GSVS ring is * v0c0 .. vLv0 v0c1 .. vLc1 .. 
@@ -5796,20 +4823,20 @@ static void preload_ring_buffers(struct si_shader_context *ctx) num_records = 64; ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); - tmp = LLVMBuildExtractElement(builder, ring, uint->zero, ""); + tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, ""); tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->i64, stream_offset, 0), ""); stream_offset += stride * 64; - ring = LLVMBuildInsertElement(builder, ring, tmp, uint->zero, ""); + ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, ""); ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); - tmp = LLVMBuildExtractElement(builder, ring, uint->one, ""); + tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, ""); tmp = LLVMBuildOr(builder, tmp, LLVMConstInt(ctx->i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), ""); - ring = LLVMBuildInsertElement(builder, ring, tmp, uint->one, ""); + ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, ""); ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->i32, num_records, 0), LLVMConstInt(ctx->i32, 2, 0), ""); @@ -5826,7 +4853,6 @@ static void preload_ring_buffers(struct si_shader_context *ctx) S_008F0C_ADD_TID_ENABLE(1), 0), LLVMConstInt(ctx->i32, 3, 0), ""); - ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, ""); ctx->gsvs_ring[stream] = ring; } @@ -5837,9 +4863,7 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, LLVMValueRef param_rw_buffers, unsigned param_pos_fixed_pt) { - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMBuilderRef builder = gallivm->builder; + LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef slot, desc, offset, row, bit, address[2]; /* Use the fixed-point gl_FragCoord input. @@ -5851,20 +4875,16 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, /* Load the buffer descriptor. 
*/ slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0); - desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot); + desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot); /* The stipple pattern is 32x32, each row has 32 bits. */ offset = LLVMBuildMul(builder, address[1], LLVMConstInt(ctx->i32, 4, 0), ""); row = buffer_load_const(ctx, desc, offset); - row = LLVMBuildBitCast(builder, row, ctx->i32, ""); + row = ac_to_integer(&ctx->ac, row); bit = LLVMBuildLShr(builder, row, address[0], ""); bit = LLVMBuildTrunc(builder, bit, ctx->i1, ""); - - /* The intrinsic kills the thread if arg < 0. */ - bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0), - LLVMConstReal(ctx->f32, -1), ""); - ac_build_kill(&ctx->ac, bit); + ac_build_kill_if_false(&ctx->ac, bit); } void si_shader_binary_read_config(struct ac_shader_binary *binary, @@ -5900,6 +4920,7 @@ void si_shader_binary_read_config(struct ac_shader_binary *binary, case R_00B028_SPI_SHADER_PGM_RSRC1_PS: case R_00B128_SPI_SHADER_PGM_RSRC1_VS: case R_00B228_SPI_SHADER_PGM_RSRC1_GS: + case R_00B428_SPI_SHADER_PGM_RSRC1_HS: case R_00B848_COMPUTE_PGM_RSRC1: conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); @@ -5950,24 +4971,16 @@ void si_shader_binary_read_config(struct ac_shader_binary *binary, conf->spi_ps_input_addr = conf->spi_ps_input_ena; } -void si_shader_apply_scratch_relocs(struct si_context *sctx, - struct si_shader *shader, - struct si_shader_config *config, - uint64_t scratch_va) +void si_shader_apply_scratch_relocs(struct si_shader *shader, + uint64_t scratch_va) { unsigned i; uint32_t scratch_rsrc_dword0 = scratch_va; uint32_t scratch_rsrc_dword1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32); - /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE - * correctly. 
- */ - if (HAVE_LLVM >= 0x0309) - scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1); - else - scratch_rsrc_dword1 |= - S_008F04_STRIDE(config->scratch_bytes_per_wave / 64); + /* Enable scratch coalescing. */ + scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1); for (i = 0 ; i < shader->binary.reloc_count; i++) { const struct ac_shader_reloc *reloc = @@ -5982,12 +4995,16 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx, } } -static unsigned si_get_shader_binary_size(struct si_shader *shader) +static unsigned si_get_shader_binary_size(const struct si_shader *shader) { unsigned size = shader->binary.code_size; if (shader->prolog) size += shader->prolog->binary.code_size; + if (shader->previous_stage) + size += shader->previous_stage->binary.code_size; + if (shader->prolog2) + size += shader->prolog2->binary.code_size; if (shader->epilog) size += shader->epilog->binary.code_size; return size; @@ -5997,6 +5014,10 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) { const struct ac_shader_binary *prolog = shader->prolog ? &shader->prolog->binary : NULL; + const struct ac_shader_binary *previous_stage = + shader->previous_stage ? &shader->previous_stage->binary : NULL; + const struct ac_shader_binary *prolog2 = + shader->prolog2 ? &shader->prolog2->binary : NULL; const struct ac_shader_binary *epilog = shader->epilog ? &shader->epilog->binary : NULL; const struct ac_shader_binary *mainb = &shader->binary; @@ -6005,41 +5026,49 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) unsigned char *ptr; assert(!prolog || !prolog->rodata_size); - assert((!prolog && !epilog) || !mainb->rodata_size); + assert(!previous_stage || !previous_stage->rodata_size); + assert(!prolog2 || !prolog2->rodata_size); + assert((!prolog && !previous_stage && !prolog2 && !epilog) || + !mainb->rodata_size); assert(!epilog || !epilog->rodata_size); - /* GFX9 can fetch at most 128 bytes past the end of the shader. 
- * Prevent VM faults. - */ - if (sscreen->b.chip_class >= GFX9) - bo_size += 128; - r600_resource_reference(&shader->bo, NULL); shader->bo = (struct r600_resource*) - pipe_buffer_create(&sscreen->b.b, 0, + pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_IMMUTABLE, align(bo_size, SI_CPDMA_ALIGNMENT)); if (!shader->bo) return -ENOMEM; /* Upload. */ - ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL, - PIPE_TRANSFER_READ_WRITE); + ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, + PIPE_TRANSFER_READ_WRITE | + PIPE_TRANSFER_UNSYNCHRONIZED); + /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are + * endian-independent. */ if (prolog) { - util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size); + memcpy(ptr, prolog->code, prolog->code_size); ptr += prolog->code_size; } + if (previous_stage) { + memcpy(ptr, previous_stage->code, previous_stage->code_size); + ptr += previous_stage->code_size; + } + if (prolog2) { + memcpy(ptr, prolog2->code, prolog2->code_size); + ptr += prolog2->code_size; + } - util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size); + memcpy(ptr, mainb->code, mainb->code_size); ptr += mainb->code_size; if (epilog) - util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size); + memcpy(ptr, epilog->code, epilog->code_size); else if (mainb->rodata_size > 0) - util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size); + memcpy(ptr, mainb->rodata, mainb->rodata_size); - sscreen->b.ws->buffer_unmap(shader->bo->buf); + sscreen->ws->buffer_unmap(shader->bo->buf); return 0; } @@ -6092,18 +5121,29 @@ static void si_shader_dump_disassembly(const struct ac_shader_binary *binary, } static void si_shader_dump_stats(struct si_screen *sscreen, - struct si_shader *shader, + const struct si_shader *shader, struct pipe_debug_callback *debug, unsigned processor, FILE *file, bool check_debug_option) { - struct si_shader_config *conf = &shader->config; + const struct si_shader_config *conf = &shader->config; unsigned num_inputs = 
shader->selector ? shader->selector->info.num_inputs : 0; unsigned code_size = si_get_shader_binary_size(shader); - unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256; + unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : 256; unsigned lds_per_wave = 0; - unsigned max_simd_waves = 10; + unsigned max_simd_waves; + + switch (sscreen->info.family) { + /* These always have 8 waves: */ + case CHIP_POLARIS10: + case CHIP_POLARIS11: + case CHIP_POLARIS12: + max_simd_waves = 8; + break; + default: + max_simd_waves = 10; + } /* Compute LDS usage for PS. */ switch (processor) { @@ -6133,7 +5173,7 @@ static void si_shader_dump_stats(struct si_screen *sscreen, /* Compute the per-SIMD wave counts. */ if (conf->num_sgprs) { - if (sscreen->b.chip_class >= VI) + if (sscreen->info.chip_class >= VI) max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs); else max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs); @@ -6148,7 +5188,7 @@ static void si_shader_dump_stats(struct si_screen *sscreen, max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); if (!check_debug_option || - r600_can_dump_shader(&sscreen->b, processor)) { + si_can_dump_shader(sscreen, processor)) { if (processor == PIPE_SHADER_FRAGMENT) { fprintf(file, "*** SHADER CONFIG ***\n" "SPI_PS_INPUT_ADDR = 0x%04x\n" @@ -6184,7 +5224,7 @@ static void si_shader_dump_stats(struct si_screen *sscreen, conf->spilled_vgprs, conf->private_mem_vgprs); } -const char *si_get_shader_name(struct si_shader *shader, unsigned processor) +const char *si_get_shader_name(const struct si_shader *shader, unsigned processor) { switch (processor) { case PIPE_SHADER_VERTEX: @@ -6215,28 +5255,41 @@ const char *si_get_shader_name(struct si_shader *shader, unsigned processor) } } -void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, +void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader, struct pipe_debug_callback *debug, unsigned processor, FILE *file, bool 
check_debug_option) { if (!check_debug_option || - r600_can_dump_shader(&sscreen->b, processor)) - si_dump_shader_key(processor, &shader->key, file); + si_can_dump_shader(sscreen, processor)) + si_dump_shader_key(processor, shader, file); if (!check_debug_option && shader->binary.llvm_ir_string) { + if (shader->previous_stage && + shader->previous_stage->binary.llvm_ir_string) { + fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", + si_get_shader_name(shader, processor)); + fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); + } + fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader, processor)); fprintf(file, "%s\n", shader->binary.llvm_ir_string); } if (!check_debug_option || - (r600_can_dump_shader(&sscreen->b, processor) && - !(sscreen->b.debug_flags & DBG_NO_ASM))) { + (si_can_dump_shader(sscreen, processor) && + !(sscreen->debug_flags & DBG(NO_ASM)))) { fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor)); if (shader->prolog) si_shader_dump_disassembly(&shader->prolog->binary, debug, "prolog", file); + if (shader->previous_stage) + si_shader_dump_disassembly(&shader->previous_stage->binary, + debug, "previous stage", file); + if (shader->prolog2) + si_shader_dump_disassembly(&shader->prolog2->binary, + debug, "prolog2", file); si_shader_dump_disassembly(&shader->binary, debug, "main", file); @@ -6250,22 +5303,22 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, check_debug_option); } -int si_compile_llvm(struct si_screen *sscreen, - struct ac_shader_binary *binary, - struct si_shader_config *conf, - LLVMTargetMachineRef tm, - LLVMModuleRef mod, - struct pipe_debug_callback *debug, - unsigned processor, - const char *name) +static int si_compile_llvm(struct si_screen *sscreen, + struct ac_shader_binary *binary, + struct si_shader_config *conf, + LLVMTargetMachineRef tm, + LLVMModuleRef mod, + struct pipe_debug_callback *debug, + unsigned processor, + const char *name) { int 
r = 0; - unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations); + unsigned count = p_atomic_inc_return(&sscreen->num_compilations); - if (r600_can_dump_shader(&sscreen->b, processor)) { + if (si_can_dump_shader(sscreen, processor)) { fprintf(stderr, "radeonsi: Compiling shader %d\n", count); - if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) { + if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { fprintf(stderr, "%s LLVM IR:\n\n", name); ac_dump_module(mod); fprintf(stderr, "\n"); @@ -6323,9 +5376,9 @@ int si_compile_llvm(struct si_screen *sscreen, static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) { if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) - LLVMBuildRetVoid(ctx->gallivm.builder); + LLVMBuildRetVoid(ctx->ac.builder); else - LLVMBuildRet(ctx->gallivm.builder, ret); + LLVMBuildRet(ctx->ac.builder, ret); } /* Generate code for the hardware VS shader stage to go with a geometry shader */ @@ -6337,7 +5390,6 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, { struct si_shader_context ctx; struct si_shader *shader; - struct gallivm_state *gallivm = &ctx.gallivm; LLVMBuilderRef builder; struct lp_build_tgsi_context *bld_base = &ctx.bld_base; struct lp_build_context *uint = &bld_base->uint_bld; @@ -6356,21 +5408,24 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, return NULL; } + /* We can leave the fence as permanently signaled because the GS copy + * shader only becomes visible globally after it has been compiled. 
*/ + util_queue_fence_init(&shader->ready); shader->selector = gs_selector; shader->is_gs_copy_shader = true; - si_init_shader_ctx(&ctx, sscreen, shader, tm); + si_init_shader_ctx(&ctx, sscreen, tm); + ctx.shader = shader; ctx.type = PIPE_SHADER_VERTEX; - builder = gallivm->builder; + builder = ctx.ac.builder; create_function(&ctx); preload_ring_buffers(&ctx); LLVMValueRef voffset = - lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn, - ctx.param_vertex_id), 4); + lp_build_mul_imm(uint, ctx.abi.vertex_id, 4); /* Fetch the vertex stream ID.*/ LLVMValueRef stream_id; @@ -6378,7 +5433,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, if (gs_selector->so.num_outputs) stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2); else - stream_id = uint->zero; + stream_id = ctx.i32_0; /* Fill in output information. */ for (i = 0; i < gsinfo->num_outputs; ++i) { @@ -6394,7 +5449,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, LLVMBasicBlockRef end_bb; LLVMValueRef switch_inst; - end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end"); + end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); for (int stream = 0; stream < 4; stream++) { @@ -6407,7 +5462,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, if (stream > 0 && !gs_selector->so.num_outputs) continue; - bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out"); + bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb); LLVMPositionBuilderAtEnd(builder, bb); @@ -6428,8 +5483,9 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, outputs[i].values[chan] = ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, - uint->zero, voffset, - soffset, 0, 1, 1, true); + ctx.i32_0, voffset, + soffset, 0, 1, 1, + true, false); } } @@ -6441,30 +5497,25 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, } if 
(stream == 0) - si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs); + si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs); LLVMBuildBr(builder, end_bb); } LLVMPositionBuilderAtEnd(builder, end_bb); - LLVMBuildRetVoid(gallivm->builder); - - /* Dump LLVM IR before any optimization passes */ - if (sscreen->b.debug_flags & DBG_PREOPT_IR && - r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY)) - ac_dump_module(bld_base->base.gallivm->module); + LLVMBuildRetVoid(ctx.ac.builder); - si_llvm_finalize_module(&ctx, - r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY)); + ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ + si_llvm_optimize_module(&ctx); r = si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.tm, - bld_base->base.gallivm->module, + ctx.gallivm.module, debug, PIPE_SHADER_GEOMETRY, "GS Copy Shader"); if (!r) { - if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY)) + if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) fprintf(stderr, "GS Copy Shader:\n"); si_shader_dump(sscreen, ctx.shader, debug, PIPE_SHADER_GEOMETRY, stderr, true); @@ -6482,41 +5533,64 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, return shader; } -static void si_dump_shader_key(unsigned shader, struct si_shader_key *key, +static void si_dump_shader_key_vs(const struct si_shader_key *key, + const struct si_vs_prolog_bits *prolog, + const char *prefix, FILE *f) +{ + fprintf(f, " %s.instance_divisor_is_one = %u\n", + prefix, prolog->instance_divisor_is_one); + fprintf(f, " %s.instance_divisor_is_fetched = %u\n", + prefix, prolog->instance_divisor_is_fetched); + fprintf(f, " %s.ls_vgpr_fix = %u\n", + prefix, prolog->ls_vgpr_fix); + + fprintf(f, " mono.vs.fix_fetch = {"); + for (int i = 0; i < SI_MAX_ATTRIBS; i++) + fprintf(f, !i ? 
"%u" : ", %u", key->mono.vs_fix_fetch[i]); + fprintf(f, "}\n"); +} + +static void si_dump_shader_key(unsigned processor, const struct si_shader *shader, FILE *f) { - int i; + const struct si_shader_key *key = &shader->key; fprintf(f, "SHADER KEY\n"); - switch (shader) { + switch (processor) { case PIPE_SHADER_VERTEX: - fprintf(f, " part.vs.prolog.instance_divisors = {"); - for (i = 0; i < ARRAY_SIZE(key->part.vs.prolog.instance_divisors); i++) - fprintf(f, !i ? "%u" : ", %u", - key->part.vs.prolog.instance_divisors[i]); - fprintf(f, "}\n"); - fprintf(f, " part.vs.epilog.export_prim_id = %u\n", key->part.vs.epilog.export_prim_id); + si_dump_shader_key_vs(key, &key->part.vs.prolog, + "part.vs.prolog", f); fprintf(f, " as_es = %u\n", key->as_es); fprintf(f, " as_ls = %u\n", key->as_ls); - - fprintf(f, " mono.vs.fix_fetch = {"); - for (i = 0; i < SI_MAX_ATTRIBS; i++) - fprintf(f, !i ? "%u" : ", %u", key->mono.vs.fix_fetch[i]); - fprintf(f, "}\n"); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", + key->mono.u.vs_export_prim_id); break; case PIPE_SHADER_TESS_CTRL: + if (shader->selector->screen->info.chip_class >= GFX9) { + si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, + "part.tcs.ls_prolog", f); + } fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); - fprintf(f, " mono.tcs.inputs_to_copy = 0x%"PRIx64"\n", key->mono.tcs.inputs_to_copy); + fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy); break; case PIPE_SHADER_TESS_EVAL: - fprintf(f, " part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id); fprintf(f, " as_es = %u\n", key->as_es); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", + key->mono.u.vs_export_prim_id); break; case PIPE_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + break; + + if (shader->selector->screen->info.chip_class >= GFX9 && + key->part.gs.es->type == PIPE_SHADER_VERTEX) { + si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, + 
"part.gs.vs_prolog", f); + } fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix); break; @@ -6547,27 +5621,22 @@ static void si_dump_shader_key(unsigned shader, struct si_shader_key *key, assert(0); } - if ((shader == PIPE_SHADER_GEOMETRY || - shader == PIPE_SHADER_TESS_EVAL || - shader == PIPE_SHADER_VERTEX) && + if ((processor == PIPE_SHADER_GEOMETRY || + processor == PIPE_SHADER_TESS_EVAL || + processor == PIPE_SHADER_VERTEX) && !key->as_es && !key->as_ls) { - fprintf(f, " opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs); - fprintf(f, " opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2); - fprintf(f, " opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable); + fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs); + fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable); } } static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, - struct si_shader *shader, LLVMTargetMachineRef tm) { struct lp_build_tgsi_context *bld_base; - struct lp_build_tgsi_action tmpl = {}; - si_llvm_context_init(ctx, sscreen, shader, tm, - (shader && shader->selector) ? &shader->selector->info : NULL, - (shader && shader->selector) ? 
shader->selector->tokens : NULL); + si_llvm_context_init(ctx, sscreen, tm); bld_base = &ctx->bld_base; bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; @@ -6576,53 +5645,6 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action; bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action; - bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit; - bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action; - bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action; - bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs; - - bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args; - bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit; - bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args; - bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit; - bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args; - bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit; - - tmpl.fetch_args = atomic_fetch_args; - tmpl.emit = atomic_emit; - bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl; - bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add"; - bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl; - bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap"; - 
bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl; - bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap"; - bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl; - bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and"; - bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl; - bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or"; - bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl; - bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor"; - bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl; - bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin"; - bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl; - bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax"; - bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl; - bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin"; - bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl; - bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax"; - bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit; bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit; @@ -6630,172 +5652,39 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy; bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy; bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy; - bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy; - - bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit; - bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit; - bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit; - - bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex; - bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive; - bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier; -} - -#define EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3) -#define EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5) - -/* Return true if the PARAM export has been eliminated. 
*/ -static bool si_eliminate_const_output(struct si_shader_context *ctx, - LLVMValueRef inst, unsigned offset) -{ - struct si_shader *shader = ctx->shader; - unsigned num_outputs = shader->selector->info.num_outputs; - unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */ - bool is_zero[4] = {}, is_one[4] = {}; - - for (i = 0; i < 4; i++) { - LLVMBool loses_info; - LLVMValueRef p = LLVMGetOperand(inst, EXP_OUT0 + i); - - /* It's a constant expression. Undef outputs are eliminated too. */ - if (LLVMIsUndef(p)) { - is_zero[i] = true; - is_one[i] = true; - } else if (LLVMIsAConstantFP(p)) { - double a = LLVMConstRealGetDouble(p, &loses_info); - - if (a == 0) - is_zero[i] = true; - else if (a == 1) - is_one[i] = true; - else - return false; /* other constant */ - } else - return false; - } - - /* Only certain combinations of 0 and 1 can be eliminated. */ - if (is_zero[0] && is_zero[1] && is_zero[2]) - default_val = is_zero[3] ? 0 : 1; - else if (is_one[0] && is_one[1] && is_one[2]) - default_val = is_zero[3] ? 2 : 3; - else - return false; - - /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */ - LLVMInstructionEraseFromParent(inst); - - /* Change OFFSET to DEFAULT_VAL. 
*/ - for (i = 0; i < num_outputs; i++) { - if (shader->info.vs_output_param_offset[i] == offset) { - shader->info.vs_output_param_offset[i] = - EXP_PARAM_DEFAULT_VAL_0000 + default_val; - break; - } - } - return true; -} - -struct si_vs_exports { - unsigned num; - unsigned offset[SI_MAX_VS_OUTPUTS]; - LLVMValueRef inst[SI_MAX_VS_OUTPUTS]; -}; - -static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx) -{ - struct si_shader *shader = ctx->shader; - struct tgsi_shader_info *info = &shader->selector->info; - LLVMBasicBlockRef bb; - struct si_vs_exports exports; - bool removed_any = false; - - exports.num = 0; - - if (ctx->type == PIPE_SHADER_FRAGMENT || - ctx->type == PIPE_SHADER_COMPUTE || - shader->key.as_es || - shader->key.as_ls) - return; - - /* Process all LLVM instructions. */ - bb = LLVMGetFirstBasicBlock(ctx->main_fn); - while (bb) { - LLVMValueRef inst = LLVMGetFirstInstruction(bb); - - while (inst) { - LLVMValueRef cur = inst; - inst = LLVMGetNextInstruction(inst); - - if (LLVMGetInstructionOpcode(cur) != LLVMCall) - continue; - - LLVMValueRef callee = lp_get_called_value(cur); - - if (!lp_is_function(callee)) - continue; - - const char *name = LLVMGetValueName(callee); - unsigned num_args = LLVMCountParams(callee); - - /* Check if this is an export instruction. 
*/ - if ((num_args != 9 && num_args != 8) || - (strcmp(name, "llvm.SI.export") && - strcmp(name, "llvm.amdgcn.exp.f32"))) - continue; - - LLVMValueRef arg = LLVMGetOperand(cur, EXP_TARGET); - unsigned target = LLVMConstIntGetZExtValue(arg); - - if (target < V_008DFC_SQ_EXP_PARAM) - continue; + bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy; - target -= V_008DFC_SQ_EXP_PARAM; + bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit; + bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit; + bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit; + bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit; + bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane"; + bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit; + bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane"; + bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args; + bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit; + + bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_tgsi_emit_vertex; + bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive; + bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier; +} - /* Eliminate constant value PARAM exports. */ - if (si_eliminate_const_output(ctx, cur, target)) { - removed_any = true; - } else { - exports.offset[exports.num] = target; - exports.inst[exports.num] = cur; - exports.num++; - } - } - bb = LLVMGetNextBasicBlock(bb); - } +static void si_optimize_vs_outputs(struct si_shader_context *ctx) +{ + struct si_shader *shader = ctx->shader; + struct tgsi_shader_info *info = &shader->selector->info; - /* Remove holes in export memory due to removed PARAM exports. - * This is done by renumbering all PARAM exports. - */ - if (removed_any) { - ubyte current_offset[SI_MAX_VS_OUTPUTS]; - unsigned new_count = 0; - unsigned out, i; - - /* Make a copy of the offsets. 
We need the old version while - * we are modifying some of them. */ - assert(sizeof(current_offset) == - sizeof(shader->info.vs_output_param_offset)); - memcpy(current_offset, shader->info.vs_output_param_offset, - sizeof(current_offset)); - - for (i = 0; i < exports.num; i++) { - unsigned offset = exports.offset[i]; - - for (out = 0; out < info->num_outputs; out++) { - if (current_offset[out] != offset) - continue; + if ((ctx->type != PIPE_SHADER_VERTEX && + ctx->type != PIPE_SHADER_TESS_EVAL) || + shader->key.as_ls || + shader->key.as_es) + return; - LLVMSetOperand(exports.inst[i], EXP_TARGET, - LLVMConstInt(ctx->i32, - V_008DFC_SQ_EXP_PARAM + new_count, 0)); - shader->info.vs_output_param_offset[out] = new_count; - new_count++; - break; - } - } - shader->info.nr_param_exports = new_count; - } + ac_optimize_vs_outputs(&ctx->ac, + ctx->main_fn, + shader->info.vs_output_param_offset, + info->num_outputs, + &shader->info.nr_param_exports); } static void si_count_scratch_private_memory(struct si_shader_context *ctx) @@ -6817,28 +5706,59 @@ static void si_count_scratch_private_memory(struct si_shader_context *ctx) LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); /* No idea why LLVM aligns allocas to 4 elements. 
*/ unsigned alignment = LLVMGetAlignment(inst); - unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment); + unsigned dw_size = align(ac_get_type_size(type) / 4, alignment); ctx->shader->config.private_mem_vgprs += dw_size; } bb = LLVMGetNextBasicBlock(bb); } } +static void si_init_exec_full_mask(struct si_shader_context *ctx) +{ + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); + lp_build_intrinsic(ctx->ac.builder, + "llvm.amdgcn.init.exec", ctx->voidt, + &full_mask, 1, LP_FUNC_ATTR_CONVERGENT); +} + +static void si_init_exec_from_input(struct si_shader_context *ctx, + unsigned param, unsigned bitoffset) +{ + LLVMValueRef args[] = { + LLVMGetParam(ctx->main_fn, param), + LLVMConstInt(ctx->i32, bitoffset, 0), + }; + lp_build_intrinsic(ctx->ac.builder, + "llvm.amdgcn.init.exec.from.input", + ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT); +} + +static bool si_vs_needs_prolog(const struct si_shader_selector *sel, + const struct si_vs_prolog_bits *key) +{ + /* VGPR initialization fixup for Vega10 and Raven is always done in the + * VS prolog. */ + return sel->vs_needs_prolog || key->ls_vgpr_fix; +} + static bool si_compile_tgsi_main(struct si_shader_context *ctx, - struct si_shader *shader) + bool is_monolithic) { + struct si_shader *shader = ctx->shader; struct si_shader_selector *sel = shader->selector; struct lp_build_tgsi_context *bld_base = &ctx->bld_base; + // TODO clean all this up! 
switch (ctx->type) { case PIPE_SHADER_VERTEX: ctx->load_input = declare_input_vs; if (shader->key.as_ls) - bld_base->emit_epilogue = si_llvm_emit_ls_epilogue; + ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; else if (shader->key.as_es) - bld_base->emit_epilogue = si_llvm_emit_es_epilogue; + ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; else - bld_base->emit_epilogue = si_llvm_emit_vs_epilogue; + ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; + bld_base->emit_epilogue = si_tgsi_emit_epilogue; break; case PIPE_SHADER_TESS_CTRL: bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs; @@ -6849,41 +5769,108 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx, case PIPE_SHADER_TESS_EVAL: bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes; if (shader->key.as_es) - bld_base->emit_epilogue = si_llvm_emit_es_epilogue; + ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; else - bld_base->emit_epilogue = si_llvm_emit_vs_epilogue; + ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; + bld_base->emit_epilogue = si_tgsi_emit_epilogue; break; case PIPE_SHADER_GEOMETRY: bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs; - bld_base->emit_epilogue = si_llvm_emit_gs_epilogue; + ctx->abi.emit_vertex = si_llvm_emit_vertex; + ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue; + bld_base->emit_epilogue = si_tgsi_emit_gs_epilogue; break; case PIPE_SHADER_FRAGMENT: ctx->load_input = declare_input_fs; - bld_base->emit_epilogue = si_llvm_return_fs_outputs; + ctx->abi.emit_outputs = si_llvm_return_fs_outputs; + bld_base->emit_epilogue = si_tgsi_emit_epilogue; break; case PIPE_SHADER_COMPUTE: - ctx->declare_memory_region = declare_compute_memory; break; default: assert(!"Unsupported shader type"); return false; } + ctx->abi.load_ubo = load_ubo; + ctx->abi.load_ssbo = load_ssbo; + create_function(ctx); preload_ring_buffers(ctx); + /* For GFX9 merged shaders: + * - Set EXEC for the first shader. 
If the prolog is present, set + * EXEC there instead. + * - Add a barrier before the second shader. + * - In the second shader, reset EXEC to ~0 and wrap the main part in + * an if-statement. This is required for correctness in geometry + * shaders, to ensure that empty GS waves do not send GS_EMIT and + * GS_CUT messages. + * + * For monolithic merged shaders, the first shader is wrapped in an + * if-block together with its prolog in si_build_wrapper_function. + */ + if (ctx->screen->info.chip_class >= GFX9) { + if (!is_monolithic && + sel->info.num_instructions > 1 && /* not empty shader */ + (shader->key.as_es || shader->key.as_ls) && + (ctx->type == PIPE_SHADER_TESS_EVAL || + (ctx->type == PIPE_SHADER_VERTEX && + !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) { + si_init_exec_from_input(ctx, + ctx->param_merged_wave_info, 0); + } else if (ctx->type == PIPE_SHADER_TESS_CTRL || + ctx->type == PIPE_SHADER_GEOMETRY) { + if (!is_monolithic) + si_init_exec_full_mask(ctx); + + /* The barrier must execute for all shaders in a + * threadgroup. 
+ */ + si_llvm_emit_barrier(NULL, bld_base, NULL); + + LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8); + LLVMValueRef ena = + LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), num_threads, ""); + lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena); + } + } + + if (ctx->type == PIPE_SHADER_TESS_CTRL && + sel->tcs_info.tessfactors_are_def_in_all_invocs) { + for (unsigned i = 0; i < 6; i++) { + ctx->invoc0_tess_factors[i] = + lp_build_alloca_undef(&ctx->gallivm, ctx->i32, ""); + } + } + if (ctx->type == PIPE_SHADER_GEOMETRY) { int i; for (i = 0; i < 4; i++) { ctx->gs_next_vertex[i] = - lp_build_alloca(bld_base->base.gallivm, + lp_build_alloca(&ctx->gallivm, ctx->i32, ""); } } - if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) { - fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n"); - return false; + if (sel->force_correct_derivs_after_kill) { + ctx->postponed_kill = lp_build_alloca_undef(&ctx->gallivm, ctx->i1, ""); + /* true = don't kill. */ + LLVMBuildStore(ctx->ac.builder, LLVMConstInt(ctx->i1, 1, 0), + ctx->postponed_kill); + } + + if (sel->tokens) { + if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) { + fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n"); + return false; + } + } else { + if (!si_nir_build_llvm(ctx, sel->nir)) { + fprintf(stderr, "Failed to translate shader from NIR to LLVM\n"); + return false; + } } si_llvm_build_ret(ctx, ctx->return_value); @@ -6893,43 +5880,40 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx, /** * Compute the VS prolog key, which contains all the information needed to * build the VS prolog function, and set shader->info bits where needed. + * + * \param info Shader info of the vertex shader. + * \param num_input_sgprs Number of input SGPRs for the vertex shader. + * \param prolog_key Key of the VS prolog + * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS. 
+ * \param key Output shader part key. */ -static void si_get_vs_prolog_key(struct si_shader *shader, +static void si_get_vs_prolog_key(const struct tgsi_shader_info *info, + unsigned num_input_sgprs, + const struct si_vs_prolog_bits *prolog_key, + struct si_shader *shader_out, union si_shader_part_key *key) { - struct tgsi_shader_info *info = &shader->selector->info; - memset(key, 0, sizeof(*key)); - key->vs_prolog.states = shader->key.part.vs.prolog; - key->vs_prolog.num_input_sgprs = shader->info.num_input_sgprs; + key->vs_prolog.states = *prolog_key; + key->vs_prolog.num_input_sgprs = num_input_sgprs; key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; + key->vs_prolog.as_ls = shader_out->key.as_ls; + key->vs_prolog.as_es = shader_out->key.as_es; - /* Set the instanceID flag. */ - for (unsigned i = 0; i < info->num_inputs; i++) - if (key->vs_prolog.states.instance_divisors[i]) - shader->info.uses_instanceid = true; -} - -/** - * Compute the VS epilog key, which contains all the information needed to - * build the VS epilog function, and set the PrimitiveID output offset. - */ -static void si_get_vs_epilog_key(struct si_shader *shader, - struct si_vs_epilog_bits *states, - union si_shader_part_key *key) -{ - memset(key, 0, sizeof(*key)); - key->vs_epilog.states = *states; + if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { + key->vs_prolog.as_ls = 1; + key->vs_prolog.num_merged_next_stage_vgprs = 2; + } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { + key->vs_prolog.as_es = 1; + key->vs_prolog.num_merged_next_stage_vgprs = 5; + } - /* Set up the PrimitiveID output. */ - if (shader->key.part.vs.epilog.export_prim_id) { - unsigned index = shader->selector->info.num_outputs; - unsigned offset = shader->info.nr_param_exports++; + /* Enable loading the InstanceID VGPR. 
*/ + uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); - key->vs_epilog.prim_id_param_offset = offset; - assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset)); - shader->info.vs_output_param_offset[index] = offset; - } + if ((key->vs_prolog.states.instance_divisor_is_one | + key->vs_prolog.states.instance_divisor_is_fetched) & input_mask) + shader_out->info.uses_instanceid = true; } /** @@ -6955,6 +5939,7 @@ static void si_get_ps_prolog_key(struct si_shader *shader, key->ps_prolog.states.force_linear_center_interp || key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear); + key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; if (info->colors_read) { unsigned *color = shader->selector->color_attr_index; @@ -7064,7 +6049,8 @@ static bool si_need_ps_prolog(const union si_shader_part_key *key) key->ps_prolog.states.force_linear_center_interp || key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear || - key->ps_prolog.states.poly_stipple; + key->ps_prolog.states.poly_stipple || + key->ps_prolog.states.samplemask_log_ps_iter; } /** @@ -7090,29 +6076,44 @@ static void si_get_ps_epilog_key(struct si_shader *shader, static void si_build_gs_prolog_function(struct si_shader_context *ctx, union si_shader_part_key *key) { - const unsigned num_sgprs = SI_GS_NUM_USER_SGPR + 2; - const unsigned num_vgprs = 8; - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef params[32]; - LLVMTypeRef returns[32]; + unsigned num_sgprs, num_vgprs; + struct si_function_info fninfo; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMTypeRef returns[48]; LLVMValueRef func, ret; + si_init_function_info(&fninfo); + + if (ctx->screen->info.chip_class >= GFX9) { + num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR; + num_vgprs = 5; /* ES inputs are not needed by GS */ + } else { + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; + num_vgprs = 8; + } + for 
(unsigned i = 0; i < num_sgprs; ++i) { - params[i] = ctx->i32; + add_arg(&fninfo, ARG_SGPR, ctx->i32); returns[i] = ctx->i32; } for (unsigned i = 0; i < num_vgprs; ++i) { - params[num_sgprs + i] = ctx->i32; + add_arg(&fninfo, ARG_VGPR, ctx->i32); returns[num_sgprs + i] = ctx->f32; } /* Create the function. */ si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, - params, num_sgprs + num_vgprs, num_sgprs - 1); + &fninfo, 0); func = ctx->main_fn; + /* Set the full EXEC mask for the prolog, because we are only fiddling + * with registers here. The main shader part will set the correct EXEC + * mask. + */ + if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) + si_init_exec_full_mask(ctx); + /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. */ @@ -7123,13 +6124,13 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx, } for (unsigned i = 0; i < num_vgprs; i++) { LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); - p = LLVMBuildBitCast(builder, p, ctx->f32, ""); + p = ac_to_float(&ctx->ac, p); ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); } if (key->gs_prolog.states.tri_strip_adj_fix) { /* Remap the input vertices for every other primitive. 
*/ - const unsigned vtx_params[6] = { + const unsigned gfx6_vtx_params[6] = { num_sgprs, num_sgprs + 1, num_sgprs + 3, @@ -7137,18 +6138,53 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx, num_sgprs + 5, num_sgprs + 6 }; + const unsigned gfx9_vtx_params[3] = { + num_sgprs, + num_sgprs + 1, + num_sgprs + 4, + }; + LLVMValueRef vtx_in[6], vtx_out[6]; LLVMValueRef prim_id, rotate; + if (ctx->screen->info.chip_class >= GFX9) { + for (unsigned i = 0; i < 3; i++) { + vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16); + vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16); + } + } else { + for (unsigned i = 0; i < 6; i++) + vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]); + } + prim_id = LLVMGetParam(func, num_sgprs + 2); rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, ""); for (unsigned i = 0; i < 6; ++i) { - LLVMValueRef base, rotated, actual; - base = LLVMGetParam(func, vtx_params[i]); - rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]); - actual = LLVMBuildSelect(builder, rotate, rotated, base, ""); - actual = LLVMBuildBitCast(builder, actual, ctx->f32, ""); - ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], ""); + LLVMValueRef base, rotated; + base = vtx_in[i]; + rotated = vtx_in[(i + 4) % 6]; + vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); + } + + if (ctx->screen->info.chip_class >= GFX9) { + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef hi, out; + + hi = LLVMBuildShl(builder, vtx_out[i*2+1], + LLVMConstInt(ctx->i32, 16, 0), ""); + out = LLVMBuildOr(builder, vtx_out[i*2], hi, ""); + out = ac_to_float(&ctx->ac, out); + ret = LLVMBuildInsertValue(builder, ret, out, + gfx9_vtx_params[i], ""); + } + } else { + for (unsigned i = 0; i < 6; i++) { + LLVMValueRef out; + + out = ac_to_float(&ctx->ac, vtx_out[i]); + ret = LLVMBuildInsertValue(builder, ret, out, + gfx6_vtx_params[i], ""); + } } } @@ -7162,20 +6198,25 @@ static void si_build_gs_prolog_function(struct si_shader_context 
*ctx, static void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, unsigned num_parts, - unsigned main_part) + unsigned main_part, + unsigned next_shader_first_part) { - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMBuilderRef builder = ctx->gallivm.builder; - /* PS epilog has one arg per color component */ - LLVMTypeRef param_types[48]; - LLVMValueRef out[48]; + LLVMBuilderRef builder = ctx->ac.builder; + /* PS epilog has one arg per color component; gfx9 merged shader + * prologs need to forward 32 user SGPRs. + */ + struct si_function_info fninfo; + LLVMValueRef initial[64], out[64]; LLVMTypeRef function_type; - unsigned num_params; - unsigned num_out; + unsigned num_first_params; + unsigned num_out, initial_num_out; MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */ + MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */ unsigned num_sgprs, num_vgprs; - unsigned last_sgpr_param; unsigned gprs; + struct lp_build_if_state if_state; + + si_init_function_info(&fninfo); for (unsigned i = 0; i < num_parts; ++i) { lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE); @@ -7191,32 +6232,26 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, num_vgprs = 0; function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); - num_params = LLVMCountParamTypes(function_type); + num_first_params = LLVMCountParamTypes(function_type); - for (unsigned i = 0; i < num_params; ++i) { + for (unsigned i = 0; i < num_first_params; ++i) { LLVMValueRef param = LLVMGetParam(parts[0], i); if (ac_is_sgpr_param(param)) { assert(num_vgprs == 0); - num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4; + num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; } else { - num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4; + num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; } } - assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types)); - num_params = 0; - last_sgpr_param = 0; gprs = 0; while (gprs < 
num_sgprs + num_vgprs) { - LLVMValueRef param = LLVMGetParam(parts[main_part], num_params); - unsigned size; + LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params); + LLVMTypeRef type = LLVMTypeOf(param); + unsigned size = ac_get_type_size(type) / 4; - param_types[num_params] = LLVMTypeOf(param); - if (gprs < num_sgprs) - last_sgpr_param = num_params; - size = llvm_get_type_size(param_types[num_params]) / 4; - num_params++; + add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type); assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); assert(gprs + size <= num_sgprs + num_vgprs && @@ -7225,7 +6260,11 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, gprs += size; } - si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param); + si_create_function(ctx, "wrapper", NULL, 0, &fninfo, + si_get_max_workgroup_size(ctx->shader)); + + if (is_merged_shader(ctx->shader)) + si_init_exec_full_mask(ctx); /* Record the arguments of the function as if they were an output of * a previous part. @@ -7233,11 +6272,11 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, num_out = 0; num_out_sgpr = 0; - for (unsigned i = 0; i < num_params; ++i) { + for (unsigned i = 0; i < fninfo.num_params; ++i) { LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); LLVMTypeRef param_type = LLVMTypeOf(param); - LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32; - unsigned size = llvm_get_type_size(param_type) / 4; + LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? 
ctx->i32 : ctx->f32; + unsigned size = ac_get_type_size(param_type) / 4; if (size == 1) { if (param_type != out_type) @@ -7259,19 +6298,33 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, builder, param, LLVMConstInt(ctx->i32, j, 0), ""); } - if (i <= last_sgpr_param) + if (i < fninfo.num_sgpr_params) num_out_sgpr = num_out; } + memcpy(initial, out, sizeof(out)); + initial_num_out = num_out; + initial_num_out_sgpr = num_out_sgpr; + /* Now chain the parts. */ for (unsigned part = 0; part < num_parts; ++part) { LLVMValueRef in[48]; LLVMValueRef ret; LLVMTypeRef ret_type; unsigned out_idx = 0; - - num_params = LLVMCountParams(parts[part]); - assert(num_params <= ARRAY_SIZE(param_types)); + unsigned num_params = LLVMCountParams(parts[part]); + + /* Merged shaders are executed conditionally depending + * on the number of enabled threads passed in the input SGPRs. */ + if (is_merged_shader(ctx->shader) && part == 0) { + LLVMValueRef ena, count = initial[3]; + + count = LLVMBuildAnd(builder, count, + LLVMConstInt(ctx->i32, 0x7f, 0), ""); + ena = LLVMBuildICmp(builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), count, ""); + lp_build_if(&if_state, &ctx->gallivm, ena); + } /* Derive arguments for the next part from outputs of the * previous one. 
@@ -7285,7 +6338,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, param = LLVMGetParam(parts[part], param_idx); param_type = LLVMTypeOf(param); - param_size = llvm_get_type_size(param_type) / 4; + param_size = ac_get_type_size(param_type) / 4; is_sgpr = ac_is_sgpr_param(param); if (is_sgpr) { @@ -7304,7 +6357,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, if (param_size == 1) arg = out[out_idx]; else - arg = lp_build_gather_values(gallivm, &out[out_idx], param_size); + arg = lp_build_gather_values(&ctx->gallivm, &out[out_idx], param_size); if (LLVMTypeOf(arg) != param_type) { if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { @@ -7320,9 +6373,27 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, } ret = LLVMBuildCall(builder, parts[part], in, num_params, ""); - ret_type = LLVMTypeOf(ret); + + if (is_merged_shader(ctx->shader) && + part + 1 == next_shader_first_part) { + lp_build_endif(&if_state); + + /* The second half of the merged shader should use + * the inputs from the toplevel (wrapper) function, + * not the return value from the last call. + * + * That's because the last call was executed condi- + * tionally, so we can't consume it in the main + * block. + */ + memcpy(out, initial, sizeof(initial)); + num_out = initial_num_out; + num_out_sgpr = initial_num_out_sgpr; + continue; + } /* Extract the returned GPRs. 
*/ + ret_type = LLVMTypeOf(ret); num_out = 0; num_out_sgpr = 0; @@ -7335,6 +6406,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef val = LLVMBuildExtractValue(builder, ret, i, ""); + assert(num_out < ARRAY_SIZE(out)); out[num_out++] = val; if (LLVMTypeOf(val) == ctx->i32) { @@ -7356,96 +6428,189 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, { struct si_shader_selector *sel = shader->selector; struct si_shader_context ctx; - struct lp_build_tgsi_context *bld_base; - LLVMModuleRef mod; int r = -1; /* Dump TGSI code before doing TGSI->LLVM conversion in case the * conversion fails. */ - if (r600_can_dump_shader(&sscreen->b, sel->info.processor) && - !(sscreen->b.debug_flags & DBG_NO_TGSI)) { - tgsi_dump(sel->tokens, 0); + if (si_can_dump_shader(sscreen, sel->info.processor) && + !(sscreen->debug_flags & DBG(NO_TGSI))) { + if (sel->tokens) + tgsi_dump(sel->tokens, 0); + else + nir_print_shader(sel->nir, stderr); si_dump_streamout(&sel->so); } - si_init_shader_ctx(&ctx, sscreen, shader, tm); + si_init_shader_ctx(&ctx, sscreen, tm); + si_llvm_context_set_tgsi(&ctx, shader); ctx.separate_prolog = !is_monolithic; - memset(shader->info.vs_output_param_offset, EXP_PARAM_UNDEFINED, + memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, sizeof(shader->info.vs_output_param_offset)); shader->info.uses_instanceid = sel->info.uses_instanceid; - bld_base = &ctx.bld_base; - ctx.load_system_value = declare_system_value; - - if (!si_compile_tgsi_main(&ctx, shader)) { + if (!si_compile_tgsi_main(&ctx, is_monolithic)) { si_llvm_dispose(&ctx); return -1; } if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { - LLVMValueRef parts[3]; - bool need_prolog; - bool need_epilog; - - need_prolog = sel->info.num_inputs; - need_epilog = !shader->key.as_es && !shader->key.as_ls; + LLVMValueRef parts[2]; + bool need_prolog = sel->vs_needs_prolog; - parts[need_prolog ? 
1 : 0] = ctx.main_fn; + parts[1] = ctx.main_fn; if (need_prolog) { union si_shader_part_key prolog_key; - si_get_vs_prolog_key(shader, &prolog_key); + si_get_vs_prolog_key(&sel->info, + shader->info.num_input_sgprs, + &shader->key.part.vs.prolog, + shader, &prolog_key); si_build_vs_prolog_function(&ctx, &prolog_key); parts[0] = ctx.main_fn; } - if (need_epilog) { + si_build_wrapper_function(&ctx, parts + !need_prolog, + 1 + need_prolog, need_prolog, 0); + } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { + if (sscreen->info.chip_class >= GFX9) { + struct si_shader_selector *ls = shader->key.part.tcs.ls; + LLVMValueRef parts[4]; + bool vs_needs_prolog = + si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog); + + /* TCS main part */ + parts[2] = ctx.main_fn; + + /* TCS epilog */ + union si_shader_part_key tcs_epilog_key; + memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); + tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + si_build_tcs_epilog_function(&ctx, &tcs_epilog_key); + parts[3] = ctx.main_fn; + + /* VS prolog */ + if (vs_needs_prolog) { + union si_shader_part_key vs_prolog_key; + si_get_vs_prolog_key(&ls->info, + shader->info.num_input_sgprs, + &shader->key.part.tcs.ls_prolog, + shader, &vs_prolog_key); + vs_prolog_key.vs_prolog.is_monolithic = true; + si_build_vs_prolog_function(&ctx, &vs_prolog_key); + parts[0] = ctx.main_fn; + } + + /* VS as LS main part */ + struct si_shader shader_ls = {}; + shader_ls.selector = ls; + shader_ls.key.as_ls = 1; + shader_ls.key.mono = shader->key.mono; + shader_ls.key.opt = shader->key.opt; + si_llvm_context_set_tgsi(&ctx, &shader_ls); + + if (!si_compile_tgsi_main(&ctx, true)) { + si_llvm_dispose(&ctx); + return -1; + } + shader->info.uses_instanceid |= ls->info.uses_instanceid; + parts[1] = ctx.main_fn; + + /* Reset the shader context. 
*/ + ctx.shader = shader; + ctx.type = PIPE_SHADER_TESS_CTRL; + + si_build_wrapper_function(&ctx, + parts + !vs_needs_prolog, + 4 - !vs_needs_prolog, 0, + vs_needs_prolog ? 2 : 1); + } else { + LLVMValueRef parts[2]; union si_shader_part_key epilog_key; - si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key); - si_build_vs_epilog_function(&ctx, &epilog_key); - parts[need_prolog ? 2 : 1] = ctx.main_fn; + + parts[0] = ctx.main_fn; + + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + si_build_tcs_epilog_function(&ctx, &epilog_key); + parts[1] = ctx.main_fn; + + si_build_wrapper_function(&ctx, parts, 2, 0, 0); } + } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) { + if (ctx.screen->info.chip_class >= GFX9) { + struct si_shader_selector *es = shader->key.part.gs.es; + LLVMValueRef es_prolog = NULL; + LLVMValueRef es_main = NULL; + LLVMValueRef gs_prolog = NULL; + LLVMValueRef gs_main = ctx.main_fn; + + /* GS prolog */ + union si_shader_part_key gs_prolog_key; + memset(&gs_prolog_key, 0, sizeof(gs_prolog_key)); + gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; + gs_prolog_key.gs_prolog.is_monolithic = true; + si_build_gs_prolog_function(&ctx, &gs_prolog_key); + gs_prolog = ctx.main_fn; + + /* ES prolog */ + if (es->vs_needs_prolog) { + union si_shader_part_key vs_prolog_key; + si_get_vs_prolog_key(&es->info, + shader->info.num_input_sgprs, + &shader->key.part.gs.vs_prolog, + shader, &vs_prolog_key); + vs_prolog_key.vs_prolog.is_monolithic = true; + si_build_vs_prolog_function(&ctx, &vs_prolog_key); + es_prolog = ctx.main_fn; + } - si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog, - need_prolog ? 
1 : 0); - } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { - LLVMValueRef parts[2]; - union si_shader_part_key epilog_key; + /* ES main part */ + struct si_shader shader_es = {}; + shader_es.selector = es; + shader_es.key.as_es = 1; + shader_es.key.mono = shader->key.mono; + shader_es.key.opt = shader->key.opt; + si_llvm_context_set_tgsi(&ctx, &shader_es); - parts[0] = ctx.main_fn; + if (!si_compile_tgsi_main(&ctx, true)) { + si_llvm_dispose(&ctx); + return -1; + } + shader->info.uses_instanceid |= es->info.uses_instanceid; + es_main = ctx.main_fn; - memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; - si_build_tcs_epilog_function(&ctx, &epilog_key); - parts[1] = ctx.main_fn; + /* Reset the shader context. */ + ctx.shader = shader; + ctx.type = PIPE_SHADER_GEOMETRY; - si_build_wrapper_function(&ctx, parts, 2, 0); - } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL && - !shader->key.as_es) { - LLVMValueRef parts[2]; - union si_shader_part_key epilog_key; + /* Prepare the array of shader parts. 
*/ + LLVMValueRef parts[4]; + unsigned num_parts = 0, main_part, next_first_part; - parts[0] = ctx.main_fn; + if (es_prolog) + parts[num_parts++] = es_prolog; - si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key); - si_build_vs_epilog_function(&ctx, &epilog_key); - parts[1] = ctx.main_fn; + parts[main_part = num_parts++] = es_main; + parts[next_first_part = num_parts++] = gs_prolog; + parts[num_parts++] = gs_main; - si_build_wrapper_function(&ctx, parts, 2, 0); - } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) { - LLVMValueRef parts[2]; - union si_shader_part_key prolog_key; + si_build_wrapper_function(&ctx, parts, num_parts, + main_part, next_first_part); + } else { + LLVMValueRef parts[2]; + union si_shader_part_key prolog_key; - parts[1] = ctx.main_fn; + parts[1] = ctx.main_fn; - memset(&prolog_key, 0, sizeof(prolog_key)); - prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - si_build_gs_prolog_function(&ctx, &prolog_key); - parts[0] = ctx.main_fn; + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.gs_prolog.states = shader->key.part.gs.prolog; + si_build_gs_prolog_function(&ctx, &prolog_key); + parts[0] = ctx.main_fn; - si_build_wrapper_function(&ctx, parts, 2, 1); + si_build_wrapper_function(&ctx, parts, 2, 1, 0); + } } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) { LLVMValueRef parts[3]; union si_shader_part_key prolog_key; @@ -7466,29 +6631,22 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, si_build_ps_epilog_function(&ctx, &epilog_key); parts[need_prolog ? 2 : 1] = ctx.main_fn; - si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0); + si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, + need_prolog ? 
1 : 0, 0); } - mod = bld_base->base.gallivm->module; - - /* Dump LLVM IR before any optimization passes */ - if (sscreen->b.debug_flags & DBG_PREOPT_IR && - r600_can_dump_shader(&sscreen->b, ctx.type)) - ac_dump_module(mod); - - si_llvm_finalize_module(&ctx, - r600_extra_shader_checks(&sscreen->b, ctx.type)); + si_llvm_optimize_module(&ctx); /* Post-optimization transformations and analysis. */ - si_eliminate_const_vs_outputs(&ctx); + si_optimize_vs_outputs(&ctx); if ((debug && debug->debug_message) || - r600_can_dump_shader(&sscreen->b, ctx.type)) + si_can_dump_shader(sscreen, ctx.type)) si_count_scratch_private_memory(&ctx); /* Compile to bytecode. */ r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm, - mod, debug, ctx.type, "TGSI shader"); + ctx.gallivm.module, debug, ctx.type, "TGSI shader"); si_llvm_dispose(&ctx); if (r) { fprintf(stderr, "LLVM failed to compile shader\n"); @@ -7501,7 +6659,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, if (sel->type == PIPE_SHADER_COMPUTE) { unsigned wave_size = 64; unsigned max_vgprs = 256; - unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512; + unsigned max_sgprs = sscreen->info.chip_class >= VI ? 800 : 512; unsigned max_sgprs_per_wave = 128; unsigned max_block_threads = si_get_max_workgroup_size(shader); unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size); @@ -7527,13 +6685,14 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, } /* Add the scratch offset to input SGPRs. */ - if (shader->config.scratch_bytes_per_wave) + if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader)) shader->info.num_input_sgprs += 1; /* scratch byte offset */ /* Calculate the number of fragment input VGPRs. 
*/ if (ctx.type == PIPE_SHADER_FRAGMENT) { shader->info.num_input_vgprs = 0; shader->info.face_vgpr_index = -1; + shader->info.ancillary_vgpr_index = -1; if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr)) shader->info.num_input_vgprs += 2; @@ -7563,8 +6722,10 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, shader->info.face_vgpr_index = shader->info.num_input_vgprs; shader->info.num_input_vgprs += 1; } - if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) + if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) { + shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs; shader->info.num_input_vgprs += 1; + } if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr)) shader->info.num_input_vgprs += 1; if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)) @@ -7617,13 +6778,15 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader shader = {}; struct si_shader_context ctx; - struct gallivm_state *gallivm = &ctx.gallivm; - si_init_shader_ctx(&ctx, sscreen, &shader, tm); + si_init_shader_ctx(&ctx, sscreen, tm); + ctx.shader = &shader; ctx.type = type; switch (type) { case PIPE_SHADER_VERTEX: + shader.key.as_ls = key->vs_prolog.as_ls; + shader.key.as_es = key->vs_prolog.as_es; break; case PIPE_SHADER_TESS_CTRL: assert(!prolog); @@ -7645,11 +6808,10 @@ si_get_shader_part(struct si_screen *sscreen, build(&ctx, key); /* Compile. 
*/ - si_llvm_finalize_module(&ctx, - r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT)); + si_llvm_optimize_module(&ctx); if (si_compile_llvm(sscreen, &result->binary, &result->config, tm, - gallivm->module, debug, ctx.type, name)) { + ctx.ac.module, debug, ctx.type, name)) { FREE(result); result = NULL; goto out; @@ -7664,6 +6826,25 @@ out: return result; } +static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) +{ + LLVMValueRef ptr[2], list; + bool is_merged_shader = + ctx->screen->info.chip_class >= GFX9 && + (ctx->type == PIPE_SHADER_TESS_CTRL || + ctx->type == PIPE_SHADER_GEOMETRY || + ctx->shader->key.as_ls || ctx->shader->key.as_es); + + /* Get the pointer to rw buffers. */ + ptr[0] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); + ptr[1] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS_HI); + list = lp_build_gather_values(&ctx->gallivm, ptr, 2); + list = LLVMBuildBitCast(ctx->ac.builder, list, ctx->i64, ""); + list = LLVMBuildIntToPtr(ctx->ac.builder, list, + si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), ""); + return list; +} + /** * Build the vertex shader prolog function. * @@ -7683,34 +6864,33 @@ out: static void si_build_vs_prolog_function(struct si_shader_context *ctx, union si_shader_part_key *key) { - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMTypeRef *params, *returns; + struct si_function_info fninfo; + LLVMTypeRef *returns; LLVMValueRef ret, func; - int last_sgpr, num_params, num_returns, i; + int num_returns, i; + unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; + unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4; + LLVMValueRef input_vgprs[9]; + unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + + num_input_vgprs; + unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 
8 : 0; - ctx->param_vertex_id = key->vs_prolog.num_input_sgprs; - ctx->param_instance_id = key->vs_prolog.num_input_sgprs + 3; + si_init_function_info(&fninfo); /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ - params = alloca((key->vs_prolog.num_input_sgprs + 4) * - sizeof(LLVMTypeRef)); - returns = alloca((key->vs_prolog.num_input_sgprs + 4 + - key->vs_prolog.last_input + 1) * + returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) * sizeof(LLVMTypeRef)); - num_params = 0; num_returns = 0; /* Declare input and output SGPRs. */ - num_params = 0; for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { - params[num_params++] = ctx->i32; + add_arg(&fninfo, ARG_SGPR, ctx->i32); returns[num_returns++] = ctx->i32; } - last_sgpr = num_params - 1; - /* 4 preloaded VGPRs (outputs must be floats) */ - for (i = 0; i < 4; i++) { - params[num_params++] = ctx->i32; + /* Preloaded VGPRs (outputs must be floats) */ + for (i = 0; i < num_input_vgprs; i++) { + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]); returns[num_returns++] = ctx->f32; } @@ -7719,119 +6899,122 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx, returns[num_returns++] = ctx->f32; /* Create the function. */ - si_create_function(ctx, "vs_prolog", returns, num_returns, params, - num_params, last_sgpr); + si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0); func = ctx->main_fn; + if (key->vs_prolog.num_merged_next_stage_vgprs) { + if (!key->vs_prolog.is_monolithic) + si_init_exec_from_input(ctx, 3, 0); + + if (key->vs_prolog.as_ls && + ctx->screen->has_ls_vgpr_init_bug) { + /* If there are no HS threads, SPI loads the LS VGPRs + * starting at VGPR 0. Shift them back to where they + * belong. 
+ */ + LLVMValueRef has_hs_threads = + LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, + unpack_param(ctx, 3, 8, 8), + ctx->i32_0, ""); + + for (i = 4; i > 0; --i) { + input_vgprs[i + 1] = + LLVMBuildSelect(ctx->ac.builder, has_hs_threads, + input_vgprs[i + 1], + input_vgprs[i - 1], ""); + } + } + } + + ctx->abi.vertex_id = input_vgprs[first_vs_vgpr]; + ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)]; + /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. */ ret = ctx->return_value; for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); } - for (i = num_params - 4; i < num_params; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, ""); - ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + for (i = 0; i < num_input_vgprs; i++) { + LLVMValueRef p = input_vgprs[i]; + p = ac_to_float(&ctx->ac, p); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, + key->vs_prolog.num_input_sgprs + i, ""); } /* Compute vertex load indices from instance divisors. 
*/ + LLVMValueRef instance_divisor_constbuf = NULL; + + if (key->vs_prolog.states.instance_divisor_is_fetched) { + LLVMValueRef list = si_prolog_get_rw_buffers(ctx); + LLVMValueRef buf_index = + LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); + instance_divisor_constbuf = + ac_build_load_to_sgpr(&ctx->ac, list, buf_index); + } + for (i = 0; i <= key->vs_prolog.last_input; i++) { - unsigned divisor = key->vs_prolog.states.instance_divisors[i]; + bool divisor_is_one = + key->vs_prolog.states.instance_divisor_is_one & (1u << i); + bool divisor_is_fetched = + key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); LLVMValueRef index; - if (divisor) { + if (divisor_is_one || divisor_is_fetched) { + LLVMValueRef divisor = ctx->i32_1; + + if (divisor_is_fetched) { + divisor = buffer_load_const(ctx, instance_divisor_constbuf, + LLVMConstInt(ctx->i32, i * 4, 0)); + divisor = ac_to_integer(&ctx->ac, divisor); + } + /* InstanceID / Divisor + StartInstance */ index = get_instance_index_for_fetch(ctx, + user_sgpr_base + SI_SGPR_START_INSTANCE, divisor); } else { /* VertexID + BaseVertex */ - index = LLVMBuildAdd(gallivm->builder, - LLVMGetParam(func, ctx->param_vertex_id), - LLVMGetParam(func, SI_SGPR_BASE_VERTEX), ""); + index = LLVMBuildAdd(ctx->ac.builder, + ctx->abi.vertex_id, + LLVMGetParam(func, user_sgpr_base + + SI_SGPR_BASE_VERTEX), ""); } - index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, ""); - ret = LLVMBuildInsertValue(gallivm->builder, ret, index, - num_params++, ""); + index = ac_to_float(&ctx->ac, index); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, + fninfo.num_params + i, ""); } si_llvm_build_ret(ctx, ret); } -/** - * Build the vertex shader epilog function. This is also used by the tessellation - * evaluation shader compiled as VS. - * - * The input is PrimitiveID. - * - * If PrimitiveID is required by the pixel shader, export it. - * Otherwise, do nothing. 
- */ -static void si_build_vs_epilog_function(struct si_shader_context *ctx, - union si_shader_part_key *key) -{ - struct gallivm_state *gallivm = &ctx->gallivm; - struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - LLVMTypeRef params[5]; - int num_params, i; - - /* Declare input VGPRs. */ - num_params = key->vs_epilog.states.export_prim_id ? - (VS_EPILOG_PRIMID_LOC + 1) : 0; - assert(num_params <= ARRAY_SIZE(params)); - - for (i = 0; i < num_params; i++) - params[i] = ctx->f32; - - /* Create the function. */ - si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1); - - /* Emit exports. */ - if (key->vs_epilog.states.export_prim_id) { - struct lp_build_context *base = &bld_base->base; - struct ac_export_args args; - - args.enabled_channels = 0x1; /* enabled channels */ - args.valid_mask = 0; /* whether the EXEC mask is valid */ - args.done = 0; /* DONE bit */ - args.target = V_008DFC_SQ_EXP_PARAM + - key->vs_epilog.prim_id_param_offset; - args.compr = 0; /* COMPR flag (0 = 32-bit export) */ - args.out[0] = LLVMGetParam(ctx->main_fn, - VS_EPILOG_PRIMID_LOC); /* X */ - args.out[1] = base->undef; /* Y */ - args.out[2] = base->undef; /* Z */ - args.out[3] = base->undef; /* W */ - - ac_build_export(&ctx->ac, &args); - } - - LLVMBuildRetVoid(gallivm->builder); -} - -/** - * Create & compile a vertex shader epilog. This a helper used by VS and TES. 
- */ -static bool si_get_vs_epilog(struct si_screen *sscreen, +static bool si_get_vs_prolog(struct si_screen *sscreen, LLVMTargetMachineRef tm, - struct si_shader *shader, - struct pipe_debug_callback *debug, - struct si_vs_epilog_bits *states) + struct si_shader *shader, + struct pipe_debug_callback *debug, + struct si_shader *main_part, + const struct si_vs_prolog_bits *key) { - union si_shader_part_key epilog_key; + struct si_shader_selector *vs = main_part->selector; - si_get_vs_epilog_key(shader, states, &epilog_key); + if (!si_vs_needs_prolog(vs, key)) + return true; - shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs, - PIPE_SHADER_VERTEX, true, - &epilog_key, tm, debug, - si_build_vs_epilog_function, - "Vertex Shader Epilog"); - return shader->epilog != NULL; + /* Get the prolog. */ + union si_shader_part_key prolog_key; + si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, + key, shader, &prolog_key); + + shader->prolog = + si_get_shader_part(sscreen, &sscreen->vs_prologs, + PIPE_SHADER_VERTEX, true, &prolog_key, tm, + debug, si_build_vs_prolog_function, + "Vertex Shader Prolog"); + return shader->prolog != NULL; } /** @@ -7842,47 +7025,8 @@ static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct si_shader *shader, struct pipe_debug_callback *debug) { - struct tgsi_shader_info *info = &shader->selector->info; - union si_shader_part_key prolog_key; - - /* Get the prolog. */ - si_get_vs_prolog_key(shader, &prolog_key); - - /* The prolog is a no-op if there are no inputs. */ - if (info->num_inputs) { - shader->prolog = - si_get_shader_part(sscreen, &sscreen->vs_prologs, - PIPE_SHADER_VERTEX, true, - &prolog_key, tm, debug, - si_build_vs_prolog_function, - "Vertex Shader Prolog"); - if (!shader->prolog) - return false; - } - - /* Get the epilog. 
*/ - if (!shader->key.as_es && !shader->key.as_ls && - !si_get_vs_epilog(sscreen, tm, shader, debug, - &shader->key.part.vs.epilog)) - return false; - - return true; -} - -/** - * Select and compile (or reuse) TES parts (epilog). - */ -static bool si_shader_select_tes_parts(struct si_screen *sscreen, - LLVMTargetMachineRef tm, - struct si_shader *shader, - struct pipe_debug_callback *debug) -{ - if (shader->key.as_es) - return true; - - /* TES compiled as VS. */ - return si_get_vs_epilog(sscreen, tm, shader, debug, - &shader->key.part.tes.epilog); + return si_get_vs_prolog(sscreen, tm, shader, debug, shader, + &shader->key.part.vs.prolog); } /** @@ -7892,42 +7036,76 @@ static bool si_shader_select_tes_parts(struct si_screen *sscreen, static void si_build_tcs_epilog_function(struct si_shader_context *ctx, union si_shader_part_key *key) { - struct gallivm_state *gallivm = &ctx->gallivm; struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - LLVMTypeRef params[16]; + struct si_function_info fninfo; LLVMValueRef func; - int last_sgpr, num_params; - - /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. 
*/ - params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS); - params[SI_PARAM_CONST_BUFFERS] = ctx->i64; - params[SI_PARAM_SAMPLERS] = ctx->i64; - params[SI_PARAM_IMAGES] = ctx->i64; - params[SI_PARAM_SHADER_BUFFERS] = ctx->i64; - params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32; - params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32; - params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32; - params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32; - params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32; - params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32; - last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; - num_params = last_sgpr + 1; - - params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */ - params[num_params++] = ctx->i32; /* invocation ID within the patch */ - params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */ + + si_init_function_info(&fninfo); + + if (ctx->screen->info.chip_class >= GFX9) { + add_arg(&fninfo, ARG_SGPR, ctx->i64); + ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */ + ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i64); + add_arg(&fninfo, ARG_SGPR, ctx->i64); + add_arg(&fninfo, ARG_SGPR, ctx->i64); + add_arg(&fninfo, ARG_SGPR, ctx->i64); + add_arg(&fninfo, ARG_SGPR, ctx->i64); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); + } else { + 
add_arg(&fninfo, ARG_SGPR, ctx->i64); + add_arg(&fninfo, ARG_SGPR, ctx->i64); + add_arg(&fninfo, ARG_SGPR, ctx->i64); + add_arg(&fninfo, ARG_SGPR, ctx->i64); + ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); + } + + add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */ + add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */ + unsigned tess_factors_idx = + add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */ + add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */ + add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */ + + for (unsigned i = 0; i < 6; i++) + add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */ /* Create the function. */ - si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr); - declare_tess_lds(ctx); + si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo, + ctx->screen->info.chip_class >= CIK ? 
128 : 64); + ac_declare_lds_as_pointer(&ctx->ac); func = ctx->main_fn; + LLVMValueRef invoc0_tess_factors[6]; + for (unsigned i = 0; i < 6; i++) + invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i); + si_write_tess_factors(bld_base, - LLVMGetParam(func, last_sgpr + 1), - LLVMGetParam(func, last_sgpr + 2), - LLVMGetParam(func, last_sgpr + 3)); + LLVMGetParam(func, tess_factors_idx), + LLVMGetParam(func, tess_factors_idx + 1), + LLVMGetParam(func, tess_factors_idx + 2), + invoc0_tess_factors, invoc0_tess_factors + 4); - LLVMBuildRetVoid(gallivm->builder); + LLVMBuildRetVoid(ctx->ac.builder); } /** @@ -7938,9 +7116,19 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct si_shader *shader, struct pipe_debug_callback *debug) { - union si_shader_part_key epilog_key; + if (sscreen->info.chip_class >= GFX9) { + struct si_shader *ls_main_part = + shader->key.part.tcs.ls->main_shader_part_ls; + + if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part, + &shader->key.part.tcs.ls_prolog)) + return false; + + shader->previous_stage = ls_main_part; + } /* Get the epilog. 
*/ + union si_shader_part_key epilog_key; memset(&epilog_key, 0, sizeof(epilog_key)); epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; @@ -7960,20 +7148,31 @@ static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct si_shader *shader, struct pipe_debug_callback *debug) { - union si_shader_part_key prolog_key; + if (sscreen->info.chip_class >= GFX9) { + struct si_shader *es_main_part = + shader->key.part.gs.es->main_shader_part_es; + + if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX && + !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part, + &shader->key.part.gs.vs_prolog)) + return false; + + shader->previous_stage = es_main_part; + } if (!shader->key.part.gs.prolog.tri_strip_adj_fix) return true; + union si_shader_part_key prolog_key; memset(&prolog_key, 0, sizeof(prolog_key)); prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs, + shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs, PIPE_SHADER_GEOMETRY, true, &prolog_key, tm, debug, si_build_gs_prolog_function, "Geometry Shader Prolog"); - return shader->prolog != NULL; + return shader->prolog2 != NULL; } /** @@ -7989,45 +7188,39 @@ static bool si_shader_select_gs_parts(struct si_screen *sscreen, static void si_build_ps_prolog_function(struct si_shader_context *ctx, union si_shader_part_key *key) { - struct gallivm_state *gallivm = &ctx->gallivm; - LLVMTypeRef *params; + struct si_function_info fninfo; LLVMValueRef ret, func; - int last_sgpr, num_params, num_returns, i, num_color_channels; + int num_returns, i, num_color_channels; assert(si_need_ps_prolog(key)); - /* Number of inputs + 8 color elements. */ - params = alloca((key->ps_prolog.num_input_sgprs + - key->ps_prolog.num_input_vgprs + 8) * - sizeof(LLVMTypeRef)); + si_init_function_info(&fninfo); /* Declare inputs. 
*/ - num_params = 0; for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) - params[num_params++] = ctx->i32; - last_sgpr = num_params - 1; + add_arg(&fninfo, ARG_SGPR, ctx->i32); for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) - params[num_params++] = ctx->f32; + add_arg(&fninfo, ARG_VGPR, ctx->f32); /* Declare outputs (same as inputs + add colors if needed) */ - num_returns = num_params; + num_returns = fninfo.num_params; num_color_channels = util_bitcount(key->ps_prolog.colors_read); for (i = 0; i < num_color_channels; i++) - params[num_returns++] = ctx->f32; + fninfo.types[num_returns++] = ctx->f32; /* Create the function. */ - si_create_function(ctx, "ps_prolog", params, num_returns, params, - num_params, last_sgpr); + si_create_function(ctx, "ps_prolog", fninfo.types, num_returns, + &fninfo, 0); func = ctx->main_fn; /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. */ ret = ctx->return_value; - for (i = 0; i < num_params; i++) { + for (i = 0; i < fninfo.num_params; i++) { LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); } /* Polygon stippling. */ @@ -8035,15 +7228,7 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, /* POS_FIXED_PT is always last. */ unsigned pos = key->ps_prolog.num_input_sgprs + key->ps_prolog.num_input_vgprs - 1; - LLVMValueRef ptr[2], list; - - /* Get the pointer to rw buffers. 
*/ - ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS); - ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI); - list = lp_build_gather_values(gallivm, ptr, 2); - list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, ""); - list = LLVMBuildIntToPtr(gallivm->builder, list, - const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), ""); + LLVMValueRef list = si_prolog_get_rw_buffers(ctx); si_llvm_emit_polygon_stipple(ctx, list, pos); } @@ -8060,9 +7245,9 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, * PRIM_MASK is after user SGPRs. */ bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); - bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize, + bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize, LLVMConstInt(ctx->i32, 31, 0), ""); - bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize, + bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, ctx->i1, ""); if (key->ps_prolog.states.bc_optimize_for_persp) { @@ -8074,9 +7259,9 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, centroid[i] = LLVMGetParam(func, base + 4 + i); /* Select PERSP_CENTROID. */ for (i = 0; i < 2; i++) { - tmp = LLVMBuildSelect(gallivm->builder, bc_optimize, + tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], ""); - ret = LLVMBuildInsertValue(gallivm->builder, ret, + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 4 + i, ""); } } @@ -8089,9 +7274,9 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, centroid[i] = LLVMGetParam(func, base + 10 + i); /* Select LINEAR_CENTROID. 
*/ for (i = 0; i < 2; i++) { - tmp = LLVMBuildSelect(gallivm->builder, bc_optimize, + tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], ""); - ret = LLVMBuildInsertValue(gallivm->builder, ret, + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 10 + i, ""); } } @@ -8107,11 +7292,11 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, persp_sample[i] = LLVMGetParam(func, base + i); /* Overwrite PERSP_CENTER. */ for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(gallivm->builder, ret, + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 2 + i, ""); /* Overwrite PERSP_CENTROID. */ for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(gallivm->builder, ret, + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 4 + i, ""); } if (key->ps_prolog.states.force_linear_sample_interp) { @@ -8123,11 +7308,11 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, linear_sample[i] = LLVMGetParam(func, base + 6 + i); /* Overwrite LINEAR_CENTER. */ for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(gallivm->builder, ret, + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 8 + i, ""); /* Overwrite LINEAR_CENTROID. */ for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(gallivm->builder, ret, + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 10 + i, ""); } @@ -8141,11 +7326,11 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, persp_center[i] = LLVMGetParam(func, base + 2 + i); /* Overwrite PERSP_SAMPLE. */ for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(gallivm->builder, ret, + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + i, ""); /* Overwrite PERSP_CENTROID. 
*/ for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(gallivm->builder, ret, + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + 4 + i, ""); } if (key->ps_prolog.states.force_linear_center_interp) { @@ -8157,15 +7342,16 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, linear_center[i] = LLVMGetParam(func, base + 8 + i); /* Overwrite LINEAR_SAMPLE. */ for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(gallivm->builder, ret, + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 6 + i, ""); /* Overwrite LINEAR_CENTROID. */ for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(gallivm->builder, ret, + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 10 + i, ""); } /* Interpolate colors. */ + unsigned color_out_idx = 0; for (i = 0; i < 2; i++) { unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; unsigned face_vgpr = key->ps_prolog.num_input_sgprs + @@ -8182,11 +7368,11 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, key->ps_prolog.color_interp_vgpr_index[i]; /* Get the (i,j) updated by bc_optimize handling. */ - interp[0] = LLVMBuildExtractValue(gallivm->builder, ret, + interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr, ""); - interp[1] = LLVMBuildExtractValue(gallivm->builder, ret, + interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr + 1, ""); - interp_ij = lp_build_gather_values(gallivm, interp, 2); + interp_ij = lp_build_gather_values(&ctx->gallivm, interp, 2); } /* Use the absolute location of the input. 
*/ @@ -8194,7 +7380,7 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, if (key->ps_prolog.states.color_two_side) { face = LLVMGetParam(func, face_vgpr); - face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, ""); + face = ac_to_integer(&ctx->ac, face); } interp_fs_input(ctx, @@ -8206,11 +7392,59 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, while (writemask) { unsigned chan = u_bit_scan(&writemask); - ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan], - num_params++, ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan], + fninfo.num_params + color_out_idx++, ""); } } + /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec + * says: + * + * "When per-sample shading is active due to the use of a fragment + * input qualified by sample or due to the use of the gl_SampleID + * or gl_SamplePosition variables, only the bit for the current + * sample is set in gl_SampleMaskIn. When state specifies multiple + * fragment shader invocations for a given fragment, the sample + * mask for any single fragment shader invocation may specify a + * subset of the covered samples for the fragment. In this case, + * the bit corresponding to each covered sample will be set in + * exactly one fragment shader invocation." + * + * The samplemask loaded by hardware is always the coverage of the + * entire pixel/fragment, so mask bits out based on the sample ID. + */ + if (key->ps_prolog.states.samplemask_log_ps_iter) { + /* The bit pattern matches that used by fixed function fragment + * processing. 
*/ + static const uint16_t ps_iter_masks[] = { + 0xffff, /* not used */ + 0x5555, + 0x1111, + 0x0101, + 0x0001, + }; + assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks)); + + uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter]; + unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs + + key->ps_prolog.ancillary_vgpr_index; + LLVMValueRef sampleid = unpack_param(ctx, ancillary_vgpr, 8, 4); + LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1); + + samplemask = ac_to_integer(&ctx->ac, samplemask); + samplemask = LLVMBuildAnd( + ctx->ac.builder, + samplemask, + LLVMBuildShl(ctx->ac.builder, + LLVMConstInt(ctx->i32, ps_iter_mask, false), + sampleid, ""), + ""); + samplemask = ac_to_float(&ctx->ac, samplemask); + + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, + ancillary_vgpr + 1, ""); + } + /* Tell LLVM to insert WQM instruction sequence when needed. */ if (key->ps_prolog.wqm) { LLVMAddTargetDependentFunctionAttr(func, @@ -8227,45 +7461,43 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, static void si_build_ps_epilog_function(struct si_shader_context *ctx, union si_shader_part_key *key) { - struct gallivm_state *gallivm = &ctx->gallivm; struct lp_build_tgsi_context *bld_base = &ctx->bld_base; - LLVMTypeRef params[16+8*4+3]; + struct si_function_info fninfo; LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; - int last_sgpr, num_params, i; + int i; struct si_ps_exports exp = {}; + si_init_function_info(&fninfo); + /* Declare input SGPRs. 
*/ - params[SI_PARAM_RW_BUFFERS] = ctx->i64; - params[SI_PARAM_CONST_BUFFERS] = ctx->i64; - params[SI_PARAM_SAMPLERS] = ctx->i64; - params[SI_PARAM_IMAGES] = ctx->i64; - params[SI_PARAM_SHADER_BUFFERS] = ctx->i64; - params[SI_PARAM_ALPHA_REF] = ctx->f32; - last_sgpr = SI_PARAM_ALPHA_REF; + ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64); + ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64); + ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64); + ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64); + add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF); /* Declare input VGPRs. */ - num_params = (last_sgpr + 1) + + unsigned required_num_params = + fninfo.num_sgpr_params + util_bitcount(key->ps_epilog.colors_written) * 4 + key->ps_epilog.writes_z + key->ps_epilog.writes_stencil + key->ps_epilog.writes_samplemask; - num_params = MAX2(num_params, - last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + required_num_params = MAX2(required_num_params, + fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); - assert(num_params <= ARRAY_SIZE(params)); - - for (i = last_sgpr + 1; i < num_params; i++) - params[i] = ctx->f32; + while (fninfo.num_params < required_num_params) + add_arg(&fninfo, ARG_VGPR, ctx->f32); /* Create the function. */ - si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr); + si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0); /* Disable elimination of unused inputs. */ si_llvm_add_attribute(ctx->main_fn, "InitialPSInputAddr", 0xffffff); /* Process colors. 
*/ - unsigned vgpr = last_sgpr + 1; + unsigned vgpr = fninfo.num_sgpr_params; unsigned colors_written = key->ps_epilog.colors_written; int last_color_export = -1; @@ -8279,7 +7511,7 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx, if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { /* Just set this if any of the colorbuffers are enabled. */ if (spi_format & - ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) + ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) last_color_export = 0; } else { for (i = 0; i < 8; i++) @@ -8297,7 +7529,7 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx, color[i] = LLVMGetParam(ctx->main_fn, vgpr++); si_export_mrt_color(bld_base, color, mrt, - num_params - 1, + fninfo.num_params - 1, mrt == last_color_export, &exp); } @@ -8318,7 +7550,7 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx, si_emit_ps_exports(ctx, &exp); /* Compile. */ - LLVMBuildRetVoid(gallivm->builder); + LLVMBuildRetVoid(ctx->ac.builder); } /** @@ -8408,6 +7640,12 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); } + /* Samplemask fixup requires the sample ID. */ + if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { + shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); + assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); + } + /* The sample mask input is always enabled, because the API shader always * passes it through to the epilog. Disable it here if it's unused. */ @@ -8425,9 +7663,9 @@ void si_multiwave_lds_size_workaround(struct si_screen *sscreen, * Make sure we have at least 4k of LDS in use to avoid the bug. * It applies to workgroup sizes of more than one wavefront. 
*/ - if (sscreen->b.family == CHIP_BONAIRE || - sscreen->b.family == CHIP_KABINI || - sscreen->b.family == CHIP_MULLINS) + if (sscreen->info.family == CHIP_BONAIRE || + sscreen->info.family == CHIP_KABINI || + sscreen->info.family == CHIP_MULLINS) *lds_size = MAX2(*lds_size, 8); } @@ -8467,7 +7705,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, if (r) return r; } else { - /* The shader consists of 2-3 parts: + /* The shader consists of several parts: * * - the middle part is the user shader, it has 1 variant only * and it was compiled during the creation of the shader @@ -8476,8 +7714,15 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, * - the epilog part is inserted at the end * * The prolog and epilog have many (but simple) variants. + * + * Starting with gfx9, geometry and tessellation control + * shaders also contain the prolog and user shader parts of + * the previous shader stage. */ + if (!mainp) + return -1; + /* Copy the compiled TGSI shader data over. 
*/ shader->is_binary_shared = true; shader->binary = mainp->binary; @@ -8485,6 +7730,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, shader->info.num_input_sgprs = mainp->info.num_input_sgprs; shader->info.num_input_vgprs = mainp->info.num_input_vgprs; shader->info.face_vgpr_index = mainp->info.face_vgpr_index; + shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset, sizeof(mainp->info.vs_output_param_offset)); @@ -8503,8 +7749,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, return -1; break; case PIPE_SHADER_TESS_EVAL: - if (!si_shader_select_tes_parts(sscreen, tm, shader, debug)) - return -1; break; case PIPE_SHADER_GEOMETRY: if (!si_shader_select_gs_parts(sscreen, tm, shader, debug)) @@ -8529,6 +7773,32 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs); } + if (shader->previous_stage) { + shader->config.num_sgprs = MAX2(shader->config.num_sgprs, + shader->previous_stage->config.num_sgprs); + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->previous_stage->config.num_vgprs); + shader->config.spilled_sgprs = + MAX2(shader->config.spilled_sgprs, + shader->previous_stage->config.spilled_sgprs); + shader->config.spilled_vgprs = + MAX2(shader->config.spilled_vgprs, + shader->previous_stage->config.spilled_vgprs); + shader->config.private_mem_vgprs = + MAX2(shader->config.private_mem_vgprs, + shader->previous_stage->config.private_mem_vgprs); + shader->config.scratch_bytes_per_wave = + MAX2(shader->config.scratch_bytes_per_wave, + shader->previous_stage->config.scratch_bytes_per_wave); + shader->info.uses_instanceid |= + shader->previous_stage->info.uses_instanceid; + } + if (shader->prolog2) { + shader->config.num_sgprs = MAX2(shader->config.num_sgprs, + 
shader->prolog2->config.num_sgprs); + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->prolog2->config.num_vgprs); + } if (shader->epilog) { shader->config.num_sgprs = MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs); @@ -8559,7 +7829,7 @@ void si_shader_destroy(struct si_shader *shader) r600_resource_reference(&shader->bo, NULL); if (!shader->is_binary_shared) - radeon_shader_binary_clean(&shader->binary); + ac_shader_binary_clean(&shader->binary); free(shader->shader_log); }