X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_shader.c;h=9183852a8568e65004c6104ed50903fa3a5285cb;hb=754cf171e90cc61d135e7c45f8be319ee2db02a5;hp=1db3e4849157a95c9e58cbf6304ba0495bd74c06;hpb=69f43c2cc903d5973bab2515be51465c9e8f9f9e;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 1db3e484915..9183852a856 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -67,7 +67,15 @@ struct si_shader_context struct radeon_llvm_context radeon_bld; struct si_shader *shader; struct si_screen *screen; + unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */ + bool is_gs_copy_shader; + + /* Whether to generate the optimized shader variant compiled as a whole + * (without a prolog and epilog) + */ + bool is_monolithic; + int param_streamout_config; int param_streamout_write_index; int param_streamout_offset[4]; @@ -75,30 +83,62 @@ struct si_shader_context int param_rel_auto_id; int param_vs_prim_id; int param_instance_id; + int param_vertex_index0; int param_tes_u; int param_tes_v; int param_tes_rel_patch_id; int param_tes_patch_id; int param_es2gs_offset; + LLVMTargetMachineRef tm; + LLVMValueRef const_md; LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS]; LLVMValueRef lds; LLVMValueRef *constants[SI_NUM_CONST_BUFFERS]; - LLVMValueRef sampler_views[SI_NUM_SAMPLER_VIEWS]; - LLVMValueRef sampler_states[SI_NUM_SAMPLER_STATES]; + LLVMValueRef sampler_views[SI_NUM_SAMPLERS]; + LLVMValueRef sampler_states[SI_NUM_SAMPLERS]; + LLVMValueRef fmasks[SI_NUM_USER_SAMPLERS]; LLVMValueRef so_buffers[4]; LLVMValueRef esgs_ring; LLVMValueRef gsvs_ring[4]; LLVMValueRef gs_next_vertex[4]; + LLVMValueRef return_value; + + LLVMTypeRef voidt; + LLVMTypeRef i1; + LLVMTypeRef i8; + LLVMTypeRef i32; + LLVMTypeRef i64; + LLVMTypeRef i128; + LLVMTypeRef f32; + LLVMTypeRef v16i8; + LLVMTypeRef v2i32; + LLVMTypeRef v4i32; + LLVMTypeRef v4f32; + LLVMTypeRef v8i32; }; -static struct si_shader_context * si_shader_context( - struct lp_build_tgsi_context * bld_base) +static struct si_shader_context *si_shader_context( + struct lp_build_tgsi_context *bld_base) { return (struct si_shader_context *)bld_base; } +static void si_init_shader_ctx(struct si_shader_context *ctx, + struct si_screen *sscreen, + struct si_shader *shader, + LLVMTargetMachineRef tm); + +/* Ideally pass the sample mask input to the PS epilog as v13, which + * is its usual location, so that the shader doesn't have to add v_mov. + */ +#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13 + +/* The VS location of the PrimitiveID input is the same in the epilog, + * so that the main shader part doesn't have to move it. + */ +#define VS_EPILOG_PRIMID_LOC 2 #define PERSPECTIVE_BASE 0 #define LINEAR_BASE 9 @@ -166,14 +206,18 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) /** * Get the value of a shader input parameter and extract a bitfield. */ -static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx, +static LLVMValueRef unpack_param(struct si_shader_context *ctx, unsigned param, unsigned rshift, unsigned bitwidth) { - struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; - LLVMValueRef value = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn, param); + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) + value = bitcast(&ctx->radeon_bld.soa.bld_base, + TGSI_TYPE_UNSIGNED, value); + if (rshift) value = LLVMBuildLShr(gallivm->builder, value, lp_build_const_int32(gallivm, rshift), ""); @@ -187,15 +231,15 @@ static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx, return value; } -static LLVMValueRef get_rel_patch_id(struct si_shader_context *si_shader_ctx) +static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) { - switch (si_shader_ctx->type) { + switch (ctx->type) { case TGSI_PROCESSOR_TESS_CTRL: - return unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 0, 8); + return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8); case TGSI_PROCESSOR_TESS_EVAL: - return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - si_shader_ctx->param_tes_rel_patch_id); + return LLVMGetParam(ctx->radeon_bld.main_fn, + ctx->param_tes_rel_patch_id); default: assert(0); @@ -225,12 +269,12 @@ static LLVMValueRef get_rel_patch_id(struct si_shader_context *si_shader_ctx) */ static LLVMValueRef -get_tcs_in_patch_stride(struct si_shader_context *si_shader_ctx) +get_tcs_in_patch_stride(struct si_shader_context *ctx) { - if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX) - return unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13); - else if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL) - return unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13); + if (ctx->type == TGSI_PROCESSOR_VERTEX) + return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13); + else if (ctx->type == TGSI_PROCESSOR_TESS_CTRL) + return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13); else { assert(0); return NULL; @@ -238,48 +282,48 @@ get_tcs_in_patch_stride(struct si_shader_context *si_shader_ctx) } static LLVMValueRef -get_tcs_out_patch_stride(struct si_shader_context *si_shader_ctx) +get_tcs_out_patch_stride(struct si_shader_context *ctx) { - return unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13); + return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13); } static LLVMValueRef -get_tcs_out_patch0_offset(struct si_shader_context *si_shader_ctx) +get_tcs_out_patch0_offset(struct si_shader_context *ctx) { - return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld, - unpack_param(si_shader_ctx, + return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld, + unpack_param(ctx, SI_PARAM_TCS_OUT_OFFSETS, 0, 16), 4); } static LLVMValueRef -get_tcs_out_patch0_patch_data_offset(struct si_shader_context *si_shader_ctx) +get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) { - return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld, - unpack_param(si_shader_ctx, + return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld, + unpack_param(ctx, SI_PARAM_TCS_OUT_OFFSETS, 16, 16), 4); } static LLVMValueRef -get_tcs_in_current_patch_offset(struct si_shader_context *si_shader_ctx) +get_tcs_in_current_patch_offset(struct si_shader_context *ctx) { - struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; - LLVMValueRef patch_stride = get_tcs_in_patch_stride(si_shader_ctx); - LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx); + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, ""); } static LLVMValueRef -get_tcs_out_current_patch_offset(struct si_shader_context *si_shader_ctx) +get_tcs_out_current_patch_offset(struct si_shader_context *ctx) { - struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; - LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(si_shader_ctx); - LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx); - LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx); + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); return LLVMBuildAdd(gallivm->builder, patch0_offset, LLVMBuildMul(gallivm->builder, patch_stride, @@ -288,13 +332,13 @@ get_tcs_out_current_patch_offset(struct si_shader_context *si_shader_ctx) } static LLVMValueRef -get_tcs_out_current_patch_data_offset(struct si_shader_context *si_shader_ctx) +get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) { - struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; LLVMValueRef patch0_patch_data_offset = - get_tcs_out_patch0_patch_data_offset(si_shader_ctx); - LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx); - LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx); + get_tcs_out_patch0_patch_data_offset(ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset, LLVMBuildMul(gallivm->builder, patch_stride, @@ -302,11 +346,11 @@ get_tcs_out_current_patch_data_offset(struct si_shader_context *si_shader_ctx) ""); } -static void build_indexed_store(struct si_shader_context *si_shader_ctx, +static void build_indexed_store(struct si_shader_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index, LLVMValueRef value) { - struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; + struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef indices[2], pointer; @@ -324,10 +368,10 @@ static void build_indexed_store(struct si_shader_context *si_shader_ctx, * \param base_ptr Where the array starts. * \param index The element index into the array. */ -static LLVMValueRef build_indexed_load(struct si_shader_context *si_shader_ctx, +static LLVMValueRef build_indexed_load(struct si_shader_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index) { - struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; + struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef indices[2], pointer; @@ -343,32 +387,32 @@ static LLVMValueRef build_indexed_load(struct si_shader_context *si_shader_ctx, * a constant. */ static LLVMValueRef build_indexed_load_const( - struct si_shader_context * si_shader_ctx, + struct si_shader_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index) { - LLVMValueRef result = build_indexed_load(si_shader_ctx, base_ptr, index); - LLVMSetMetadata(result, 1, si_shader_ctx->const_md); + LLVMValueRef result = build_indexed_load(ctx, base_ptr, index); + LLVMSetMetadata(result, 1, ctx->const_md); return result; } static LLVMValueRef get_instance_index_for_fetch( - struct radeon_llvm_context * radeon_bld, - unsigned divisor) + struct radeon_llvm_context *radeon_bld, + unsigned param_start_instance, unsigned divisor) { - struct si_shader_context *si_shader_ctx = + struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); - struct gallivm_state * gallivm = radeon_bld->soa.bld_base.base.gallivm; + struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm; LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn, - si_shader_ctx->param_instance_id); + ctx->param_instance_id); /* The division must be done before START_INSTANCE is added. */ if (divisor > 1) result = LLVMBuildUDiv(gallivm->builder, result, lp_build_const_int32(gallivm, divisor), ""); - return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam( - radeon_bld->main_fn, SI_PARAM_START_INSTANCE), ""); + return LLVMBuildAdd(gallivm->builder, result, + LLVMGetParam(radeon_bld->main_fn, param_start_instance), ""); } static void declare_input_vs( @@ -378,9 +422,10 @@ static void declare_input_vs( { struct lp_build_context *base = &radeon_bld->soa.bld_base.base; struct gallivm_state *gallivm = base->gallivm; - struct si_shader_context *si_shader_ctx = + struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); - unsigned divisor = si_shader_ctx->shader->key.vs.instance_divisors[input_index]; + unsigned divisor = + ctx->shader->key.vs.prolog.instance_divisors[input_index]; unsigned chan; @@ -390,38 +435,42 @@ static void declare_input_vs( LLVMValueRef attribute_offset; LLVMValueRef buffer_index; LLVMValueRef args[3]; - LLVMTypeRef vec4_type; LLVMValueRef input; /* Load the T list */ - t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS); + t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS); t_offset = lp_build_const_int32(gallivm, input_index); - t_list = build_indexed_load_const(si_shader_ctx, t_list_ptr, t_offset); + t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset); /* Build the attribute offset */ attribute_offset = lp_build_const_int32(gallivm, 0); - if (divisor) { + if (!ctx->is_monolithic) { + buffer_index = LLVMGetParam(radeon_bld->main_fn, + ctx->param_vertex_index0 + + input_index); + } else if (divisor) { /* Build index from instance ID, start instance and divisor */ - si_shader_ctx->shader->uses_instanceid = true; - buffer_index = get_instance_index_for_fetch(&si_shader_ctx->radeon_bld, divisor); + ctx->shader->uses_instanceid = true; + buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld, + SI_PARAM_START_INSTANCE, + divisor); } else { /* Load the buffer index for vertices. */ - LLVMValueRef vertex_id = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - si_shader_ctx->param_vertex_id); + LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn, + ctx->param_vertex_id); LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BASE_VERTEX); buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, ""); } - vec4_type = LLVMVectorType(base->elem_type, 4); args[0] = t_list; args[1] = attribute_offset; args[2] = buffer_index; input = lp_build_intrinsic(gallivm->builder, - "llvm.SI.vs.load.input", vec4_type, args, 3, + "llvm.SI.vs.load.input", ctx->v4f32, args, 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); /* Break up the vec4 into individual components */ @@ -429,7 +478,7 @@ static void declare_input_vs( LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); /* XXX: Use a helper function for this. There is one in * tgsi_llvm.c. */ - si_shader_ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] = + ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] = LLVMBuildExtractElement(gallivm->builder, input, llvm_chan, ""); } @@ -438,23 +487,23 @@ static void declare_input_vs( static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base, unsigned swizzle) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); if (swizzle > 0) return bld_base->uint_bld.zero; - switch (si_shader_ctx->type) { + switch (ctx->type) { case TGSI_PROCESSOR_VERTEX: - return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - si_shader_ctx->param_vs_prim_id); + return LLVMGetParam(ctx->radeon_bld.main_fn, + ctx->param_vs_prim_id); case TGSI_PROCESSOR_TESS_CTRL: - return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + return LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PATCH_ID); case TGSI_PROCESSOR_TESS_EVAL: - return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - si_shader_ctx->param_tes_patch_id); + return LLVMGetParam(ctx->radeon_bld.main_fn, + ctx->param_tes_patch_id); case TGSI_PROCESSOR_GEOMETRY: - return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + return LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIMITIVE_ID); default: assert(0); @@ -466,14 +515,14 @@ static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base, * Return the value of tgsi_ind_register for indexing. * This is the indirect index with the constant offset added to it. */ -static LLVMValueRef get_indirect_index(struct si_shader_context *si_shader_ctx, +static LLVMValueRef get_indirect_index(struct si_shader_context *ctx, const struct tgsi_ind_register *ind, int rel_index) { - struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm; + struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm; LLVMValueRef result; - result = si_shader_ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle]; + result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle]; result = LLVMBuildLoad(gallivm->builder, result, ""); result = LLVMBuildAdd(gallivm->builder, result, lp_build_const_int32(gallivm, rel_index), ""); @@ -483,14 +532,14 @@ static LLVMValueRef get_indirect_index(struct si_shader_context *si_shader_ctx, /** * Calculate a dword address given an input or output register and a stride. */ -static LLVMValueRef get_dw_address(struct si_shader_context *si_shader_ctx, +static LLVMValueRef get_dw_address(struct si_shader_context *ctx, const struct tgsi_full_dst_register *dst, const struct tgsi_full_src_register *src, LLVMValueRef vertex_dw_stride, LLVMValueRef base_addr) { - struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm; - struct tgsi_shader_info *info = &si_shader_ctx->shader->selector->info; + struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm; + struct tgsi_shader_info *info = &ctx->shader->selector->info; ubyte *name, *index, *array_first; int first, param; struct tgsi_full_dst_register reg; @@ -514,7 +563,7 @@ static LLVMValueRef get_dw_address(struct si_shader_context *si_shader_ctx, LLVMValueRef index; if (reg.Dimension.Indirect) - index = get_indirect_index(si_shader_ctx, ®.DimIndirect, + index = get_indirect_index(ctx, ®.DimIndirect, reg.Dimension.Index); else index = lp_build_const_int32(gallivm, reg.Dimension.Index); @@ -547,7 +596,7 @@ static LLVMValueRef get_dw_address(struct si_shader_context *si_shader_ctx, else first = reg.Register.Index; - ind_index = get_indirect_index(si_shader_ctx, ®.Indirect, + ind_index = get_indirect_index(ctx, ®.Indirect, reg.Register.Index - first); base_addr = LLVMBuildAdd(gallivm->builder, base_addr, @@ -576,7 +625,7 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, enum tgsi_opcode_type type, unsigned swizzle, LLVMValueRef dw_addr) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef value; @@ -593,12 +642,12 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, lp_build_const_int32(gallivm, swizzle)); - value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr); + value = build_indexed_load(ctx, ctx->lds, dw_addr); if (type == TGSI_TYPE_DOUBLE) { LLVMValueRef value2; dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, lp_build_const_int32(gallivm, swizzle + 1)); - value2 = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr); + value2 = build_indexed_load(ctx, ctx->lds, dw_addr); return radeon_llvm_emit_fetch_double(bld_base, value, value2); } @@ -613,19 +662,18 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, * \param dw_addr address in dwords * \param value value to store */ -static void lds_store(struct lp_build_tgsi_context * bld_base, +static void lds_store(struct lp_build_tgsi_context *bld_base, unsigned swizzle, LLVMValueRef dw_addr, LLVMValueRef value) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, lp_build_const_int32(gallivm, swizzle)); - value = LLVMBuildBitCast(gallivm->builder, value, - LLVMInt32TypeInContext(gallivm->context), ""); - build_indexed_store(si_shader_ctx, si_shader_ctx->lds, + value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, ""); + build_indexed_store(ctx, ctx->lds, dw_addr, value); } @@ -634,12 +682,12 @@ static LLVMValueRef fetch_input_tcs( const struct tgsi_full_src_register *reg, enum tgsi_opcode_type type, unsigned swizzle) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef dw_addr, stride; - stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8); - dw_addr = get_tcs_in_current_patch_offset(si_shader_ctx); - dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr); + stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8); + dw_addr = get_tcs_in_current_patch_offset(ctx); + dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); return lds_load(bld_base, type, swizzle, dw_addr); } @@ -649,16 +697,16 @@ static LLVMValueRef fetch_output_tcs( const struct tgsi_full_src_register *reg, enum tgsi_opcode_type type, unsigned swizzle) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef dw_addr, stride; if (reg->Register.Dimension) { - stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); - dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx); - dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr); + stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); + dw_addr = get_tcs_out_current_patch_offset(ctx); + dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); } else { - dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); - dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr); + dw_addr = get_tcs_out_current_patch_data_offset(ctx); + dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr); } return lds_load(bld_base, type, swizzle, dw_addr); @@ -669,27 +717,27 @@ static LLVMValueRef fetch_input_tes( const struct tgsi_full_src_register *reg, enum tgsi_opcode_type type, unsigned swizzle) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef dw_addr, stride; if (reg->Register.Dimension) { - stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); - dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx); - dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr); + stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); + dw_addr = get_tcs_out_current_patch_offset(ctx); + dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); } else { - dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); - dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr); + dw_addr = get_tcs_out_current_patch_data_offset(ctx); + dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr); } return lds_load(bld_base, type, swizzle, dw_addr); } -static void store_output_tcs(struct lp_build_tgsi_context * bld_base, - const struct tgsi_full_instruction * inst, - const struct tgsi_opcode_info * info, +static void store_output_tcs(struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_instruction *inst, + const struct tgsi_opcode_info *info, LLVMValueRef dst[4]) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); const struct tgsi_full_dst_register *reg = &inst->Dst[0]; unsigned chan_index; LLVMValueRef dw_addr, stride; @@ -704,12 +752,12 @@ static void store_output_tcs(struct lp_build_tgsi_context * bld_base, } if (reg->Register.Dimension) { - stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); - dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx); - dw_addr = get_dw_address(si_shader_ctx, reg, NULL, stride, dw_addr); + stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); + dw_addr = get_tcs_out_current_patch_offset(ctx); + dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr); } else { - dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); - dw_addr = get_dw_address(si_shader_ctx, reg, NULL, NULL, dw_addr); + dw_addr = get_tcs_out_current_patch_data_offset(ctx); + dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr); } TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) { @@ -729,11 +777,10 @@ static LLVMValueRef fetch_input_gs( unsigned swizzle) { struct lp_build_context *base = &bld_base->base; - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); - struct si_shader *shader = si_shader_ctx->shader; - struct lp_build_context *uint = &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld; + struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader *shader = ctx->shader; + struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld; struct gallivm_state *gallivm = base->gallivm; - LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef vtx_offset; LLVMValueRef args[9]; unsigned vtx_offset_param; @@ -768,12 +815,12 @@ static LLVMValueRef fetch_input_gs( vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2; } vtx_offset = lp_build_mul_imm(uint, - LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + LLVMGetParam(ctx->radeon_bld.main_fn, vtx_offset_param), 4); param = si_shader_io_get_unique_index(semantic_name, semantic_index); - args[0] = si_shader_ctx->esgs_ring; + args[0] = ctx->esgs_ring; args[1] = vtx_offset; args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256); args[3] = uint->zero; @@ -785,14 +832,14 @@ static LLVMValueRef fetch_input_gs( value = lp_build_intrinsic(gallivm->builder, "llvm.SI.buffer.load.dword.i32.i32", - i32, args, 9, + ctx->i32, args, 9, LLVMReadOnlyAttribute | LLVMNoUnwindAttribute); if (type == TGSI_TYPE_DOUBLE) { LLVMValueRef value2; args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256); value2 = lp_build_intrinsic(gallivm->builder, "llvm.SI.buffer.load.dword.i32.i32", - i32, args, 9, + ctx->i32, args, 9, LLVMReadOnlyAttribute | LLVMNoUnwindAttribute); return radeon_llvm_emit_fetch_double(bld_base, value, value2); @@ -832,14 +879,12 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location) } /* This shouldn't be used by explicit INTERP opcodes. */ -static LLVMValueRef get_interp_param(struct si_shader_context *si_shader_ctx, - unsigned param) +static unsigned select_interp_param(struct si_shader_context *ctx, + unsigned param) { - struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; - unsigned sample_param = 0; - LLVMValueRef default_ij, sample_ij, force_sample; - - default_ij = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, param); + if (!ctx->shader->key.ps.prolog.force_persample_interp || + !ctx->is_monolithic) + return param; /* If the shader doesn't use center/centroid, just return the parameter. * @@ -849,109 +894,51 @@ static LLVMValueRef get_interp_param(struct si_shader_context *si_shader_ctx, switch (param) { case SI_PARAM_PERSP_CENTROID: case SI_PARAM_PERSP_CENTER: - if (!si_shader_ctx->shader->selector->forces_persample_interp_for_persp) - return default_ij; - - sample_param = SI_PARAM_PERSP_SAMPLE; - break; + return SI_PARAM_PERSP_SAMPLE; case SI_PARAM_LINEAR_CENTROID: case SI_PARAM_LINEAR_CENTER: - if (!si_shader_ctx->shader->selector->forces_persample_interp_for_linear) - return default_ij; - - sample_param = SI_PARAM_LINEAR_SAMPLE; - break; + return SI_PARAM_LINEAR_SAMPLE; default: - return default_ij; + return param; } - - /* Otherwise, we have to select (i,j) based on a user data SGPR. */ - sample_ij = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, sample_param); - - /* TODO: this can be done more efficiently by switching between - * 2 prologs. - */ - force_sample = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - SI_PARAM_PS_STATE_BITS); - force_sample = LLVMBuildTrunc(gallivm->builder, force_sample, - LLVMInt1TypeInContext(gallivm->context), ""); - return LLVMBuildSelect(gallivm->builder, force_sample, - sample_ij, default_ij, ""); } -static void declare_input_fs( - struct radeon_llvm_context *radeon_bld, - unsigned input_index, - const struct tgsi_full_declaration *decl) +/** + * Interpolate a fragment shader input. + * + * @param ctx context + * @param input_index index of the input in hardware + * @param semantic_name TGSI_SEMANTIC_* + * @param semantic_index semantic index + * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset) + * @param colors_read_mask color components read (4 bits for each color, 8 bits in total) + * @param interp_param interpolation weights (i,j) + * @param prim_mask SI_PARAM_PRIM_MASK + * @param face SI_PARAM_FRONT_FACE + * @param result the return value (4 components) + */ +static void interp_fs_input(struct si_shader_context *ctx, + unsigned input_index, + unsigned semantic_name, + unsigned semantic_index, + unsigned num_interp_inputs, + unsigned colors_read_mask, + LLVMValueRef interp_param, + LLVMValueRef prim_mask, + LLVMValueRef face, + LLVMValueRef result[4]) { - struct lp_build_context *base = &radeon_bld->soa.bld_base.base; - struct si_shader_context *si_shader_ctx = - si_shader_context(&radeon_bld->soa.bld_base); - struct si_shader *shader = si_shader_ctx->shader; - struct lp_build_context *uint = &radeon_bld->soa.bld_base.uint_bld; + struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base; + struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld; struct gallivm_state *gallivm = base->gallivm; - LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context); - LLVMValueRef main_fn = radeon_bld->main_fn; - - LLVMValueRef interp_param = NULL; - int interp_param_idx; - const char * intr_name; - - /* This value is: - * [15:0] NewPrimMask (Bit mask for each quad. It is set it the - * quad begins a new primitive. Bit 0 always needs - * to be unset) - * [32:16] ParamOffset - * - */ - LLVMValueRef params = LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK); + const char *intr_name; LLVMValueRef attr_number; unsigned chan; - if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) { - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - unsigned soa_index = - radeon_llvm_reg_index_soa(input_index, chan); - radeon_bld->inputs[soa_index] = - LLVMGetParam(main_fn, SI_PARAM_POS_X_FLOAT + chan); - - if (chan == 3) - /* RCP for fragcoord.w */ - radeon_bld->inputs[soa_index] = - LLVMBuildFDiv(gallivm->builder, - lp_build_const_float(gallivm, 1.0f), - radeon_bld->inputs[soa_index], - ""); - } - return; - } - - if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) { - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] = - LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE); - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] = - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] = - lp_build_const_float(gallivm, 0.0f); - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] = - lp_build_const_float(gallivm, 1.0f); - - return; - } - - shader->ps_input_param_offset[input_index] = shader->nparam++; - attr_number = lp_build_const_int32(gallivm, - shader->ps_input_param_offset[input_index]); - - shader->ps_input_interpolate[input_index] = decl->Interp.Interpolate; - interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate, - decl->Interp.Location); - if (interp_param_idx == -1) - return; - else if (interp_param_idx) - interp_param = get_interp_param(si_shader_ctx, interp_param_idx); + attr_number = lp_build_const_int32(gallivm, input_index); /* fs.constant returns the param from the middle vertex, so it's not * really useful for flat shading. It's meant to be used for custom @@ -965,81 +952,127 @@ static void declare_input_fs( */ intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant"; - if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR && - si_shader_ctx->shader->key.ps.color_two_side) { + if (semantic_name == TGSI_SEMANTIC_COLOR && + ctx->shader->key.ps.prolog.color_two_side) { LLVMValueRef args[4]; - LLVMValueRef face, is_face_positive; - LLVMValueRef back_attr_number = - lp_build_const_int32(gallivm, - shader->ps_input_param_offset[input_index] + 1); + LLVMValueRef is_face_positive; + LLVMValueRef back_attr_number; - face = LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE); + /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", + * otherwise it's at offset "num_inputs". + */ + unsigned back_attr_offset = num_interp_inputs; + if (semantic_index == 1 && colors_read_mask & 0xf) + back_attr_offset += 1; - is_face_positive = LLVMBuildFCmp(gallivm->builder, - LLVMRealOGT, face, - lp_build_const_float(gallivm, 0.0f), - ""); + back_attr_number = lp_build_const_int32(gallivm, back_attr_offset); - args[2] = params; + is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE, + face, uint->zero, ""); + + args[2] = prim_mask; args[3] = interp_param; for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); - unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan); LLVMValueRef front, back; args[0] = llvm_chan; args[1] = attr_number; front = lp_build_intrinsic(gallivm->builder, intr_name, - input_type, args, args[3] ? 4 : 3, + ctx->f32, args, args[3] ? 4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); args[1] = back_attr_number; back = lp_build_intrinsic(gallivm->builder, intr_name, - input_type, args, args[3] ? 4 : 3, + ctx->f32, args, args[3] ? 4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); - radeon_bld->inputs[soa_index] = - LLVMBuildSelect(gallivm->builder, + result[chan] = LLVMBuildSelect(gallivm->builder, is_face_positive, front, back, ""); } - - shader->nparam++; - } else if (decl->Semantic.Name == TGSI_SEMANTIC_FOG) { + } else if (semantic_name == TGSI_SEMANTIC_FOG) { LLVMValueRef args[4]; args[0] = uint->zero; args[1] = attr_number; - args[2] = params; + args[2] = prim_mask; args[3] = interp_param; - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] = - lp_build_intrinsic(gallivm->builder, intr_name, - input_type, args, args[3] ? 4 : 3, + result[0] = lp_build_intrinsic(gallivm->builder, intr_name, + ctx->f32, args, args[3] ? 4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] = - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] = - lp_build_const_float(gallivm, 0.0f); - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] = - lp_build_const_float(gallivm, 1.0f); + result[1] = + result[2] = lp_build_const_float(gallivm, 0.0f); + result[3] = lp_build_const_float(gallivm, 1.0f); } else { for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { LLVMValueRef args[4]; LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); - unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan); + args[0] = llvm_chan; args[1] = attr_number; - args[2] = params; + args[2] = prim_mask; args[3] = interp_param; - radeon_bld->inputs[soa_index] = - lp_build_intrinsic(gallivm->builder, intr_name, - input_type, args, args[3] ? 4 : 3, + result[chan] = lp_build_intrinsic(gallivm->builder, intr_name, + ctx->f32, args, args[3] ? 4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); } } } +static void declare_input_fs( + struct radeon_llvm_context *radeon_bld, + unsigned input_index, + const struct tgsi_full_declaration *decl) +{ + struct lp_build_context *base = &radeon_bld->soa.bld_base.base; + struct si_shader_context *ctx = + si_shader_context(&radeon_bld->soa.bld_base); + struct si_shader *shader = ctx->shader; + LLVMValueRef main_fn = radeon_bld->main_fn; + LLVMValueRef interp_param = NULL; + int interp_param_idx; + + /* Get colors from input VGPRs (set by the prolog). */ + if (!ctx->is_monolithic && + decl->Semantic.Name == TGSI_SEMANTIC_COLOR) { + unsigned i = decl->Semantic.Index; + unsigned colors_read = shader->selector->info.colors_read; + unsigned mask = colors_read >> (i * 4); + unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + + (i ? util_bitcount(colors_read & 0xf) : 0); + + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] = + mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] = + mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] = + mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] = + mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef; + return; + } + + interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate, + decl->Interp.Location); + if (interp_param_idx == -1) + return; + else if (interp_param_idx) { + interp_param_idx = select_interp_param(ctx, + interp_param_idx); + interp_param = LLVMGetParam(main_fn, interp_param_idx); + } + + interp_fs_input(ctx, input_index, decl->Semantic.Name, + decl->Semantic.Index, shader->selector->info.num_inputs, + shader->selector->info.colors_read, interp_param, + LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK), + LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE), + &radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]); +} + static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld) { return unpack_param(si_shader_context(&radeon_bld->soa.bld_base), @@ -1060,22 +1093,22 @@ static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resou static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id) { - struct si_shader_context *si_shader_ctx = + struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld; struct gallivm_state *gallivm = &radeon_bld->gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); + LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF); - LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index); + LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index); /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8); LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), ""); LLVMValueRef pos[4] = { - buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type), - buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type), + buffer_load_const(builder, resource, offset0, ctx->f32), + buffer_load_const(builder, resource, offset1, ctx->f32), lp_build_const_float(gallivm, 0), lp_build_const_float(gallivm, 0) }; @@ -1084,34 +1117,33 @@ static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, } static void declare_system_value( - struct radeon_llvm_context * radeon_bld, + struct radeon_llvm_context *radeon_bld, unsigned index, const struct tgsi_full_declaration *decl) { - struct si_shader_context *si_shader_ctx = + struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); struct lp_build_context *bld = &radeon_bld->soa.bld_base.base; - struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld; struct gallivm_state *gallivm = &radeon_bld->gallivm; LLVMValueRef value = 0; switch (decl->Semantic.Name) { case TGSI_SEMANTIC_INSTANCEID: value = LLVMGetParam(radeon_bld->main_fn, - si_shader_ctx->param_instance_id); + ctx->param_instance_id); break; case TGSI_SEMANTIC_VERTEXID: value = LLVMBuildAdd(gallivm->builder, LLVMGetParam(radeon_bld->main_fn, - si_shader_ctx->param_vertex_id), + ctx->param_vertex_id), LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BASE_VERTEX), ""); break; case TGSI_SEMANTIC_VERTEXID_NOBASE: value = LLVMGetParam(radeon_bld->main_fn, - si_shader_ctx->param_vertex_id); + ctx->param_vertex_id); break; case TGSI_SEMANTIC_BASEVERTEX: @@ -1120,43 +1152,70 @@ static void declare_system_value( break; case TGSI_SEMANTIC_INVOCATIONID: - if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL) - value = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5); - else if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) + if (ctx->type == TGSI_PROCESSOR_TESS_CTRL) + value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); + else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GS_INSTANCE_ID); else assert(!"INVOCATIONID not implemented"); break; + case TGSI_SEMANTIC_POSITION: + { + LLVMValueRef pos[4] = { + LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT), + LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT), + LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT), + lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP, + LLVMGetParam(radeon_bld->main_fn, + SI_PARAM_POS_W_FLOAT)), + }; + value = lp_build_gather_values(gallivm, pos, 4); + break; + } + + case TGSI_SEMANTIC_FACE: + value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE); + break; + case TGSI_SEMANTIC_SAMPLEID: value = get_sample_id(radeon_bld); break; - case TGSI_SEMANTIC_SAMPLEPOS: - value = load_sample_position(radeon_bld, get_sample_id(radeon_bld)); + case TGSI_SEMANTIC_SAMPLEPOS: { + LLVMValueRef pos[4] = { + LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT), + LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT), + lp_build_const_float(gallivm, 0), + lp_build_const_float(gallivm, 0) + }; + pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, + TGSI_OPCODE_FRC, pos[0]); + pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, + TGSI_OPCODE_FRC, pos[1]); + value = lp_build_gather_values(gallivm, pos, 4); break; + } case TGSI_SEMANTIC_SAMPLEMASK: - /* Smoothing isn't MSAA in GL, but it's MSAA in hardware. - * Therefore, force gl_SampleMaskIn to 1 for GL. */ - if (si_shader_ctx->shader->key.ps.poly_line_smoothing) - value = uint_bld->one; - else - value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE); + /* This can only occur with the OpenGL Core profile, which + * doesn't support smoothing. + */ + value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE); break; case TGSI_SEMANTIC_TESSCOORD: { LLVMValueRef coord[4] = { - LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_u), - LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_v), + LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u), + LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v), bld->zero, bld->zero }; /* For triangles, the vector should be (u, v, 1-u-v). */ - if (si_shader_ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == + if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_TRIANGLES) coord[2] = lp_build_sub(bld, bld->one, lp_build_add(bld, coord[0], coord[1])); @@ -1166,7 +1225,7 @@ static void declare_system_value( } case TGSI_SEMANTIC_VERTICESIN: - value = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6); + value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6); break; case TGSI_SEMANTIC_TESSINNER: @@ -1175,7 +1234,7 @@ static void declare_system_value( LLVMValueRef dw_addr; int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0); - dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx); + dw_addr = get_tcs_out_current_patch_data_offset(ctx); dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr, lp_build_const_int32(gallivm, param * 4), ""); @@ -1197,13 +1256,13 @@ static void declare_system_value( } static LLVMValueRef fetch_constant( - struct lp_build_tgsi_context * bld_base, + struct lp_build_tgsi_context *bld_base, const struct tgsi_full_src_register *reg, enum tgsi_opcode_type type, unsigned swizzle) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); - struct lp_build_context * base = &bld_base->base; + struct si_shader_context *ctx = si_shader_context(bld_base); + struct lp_build_context *base = &bld_base->base; const struct tgsi_ind_register *ireg = ®->Indirect; unsigned buf, idx; @@ -1224,44 +1283,44 @@ static LLVMValueRef fetch_constant( if (!reg->Register.Indirect && !reg->Dimension.Indirect) { if (type != TGSI_TYPE_DOUBLE) - return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]); + return bitcast(bld_base, type, ctx->constants[buf][idx]); else { return radeon_llvm_emit_fetch_double(bld_base, - si_shader_ctx->constants[buf][idx], - si_shader_ctx->constants[buf][idx + 1]); + ctx->constants[buf][idx], + ctx->constants[buf][idx + 1]); } } if (reg->Register.Dimension && reg->Dimension.Indirect) { - LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); + LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); LLVMValueRef index; - index = get_indirect_index(si_shader_ctx, ®->DimIndirect, + index = get_indirect_index(ctx, ®->DimIndirect, reg->Dimension.Index); - bufp = build_indexed_load_const(si_shader_ctx, ptr, index); + bufp = build_indexed_load_const(ctx, ptr, index); } else - bufp = si_shader_ctx->const_buffers[buf]; + bufp = ctx->const_buffers[buf]; - addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle]; + addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle]; addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg"); addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16); addr = lp_build_add(&bld_base->uint_bld, addr, lp_build_const_int32(base->gallivm, idx * 4)); result = buffer_load_const(base->gallivm->builder, bufp, - addr, bld_base->base.elem_type); + addr, ctx->f32); if (type != TGSI_TYPE_DOUBLE) result = bitcast(bld_base, type, result); else { LLVMValueRef addr2, result2; - addr2 = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1]; + addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1]; addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2"); addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16); addr2 = lp_build_add(&bld_base->uint_bld, addr2, lp_build_const_int32(base->gallivm, idx * 4)); - result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_buffers[buf], - addr2, bld_base->base.elem_type); + result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf], + addr2, ctx->f32); result = radeon_llvm_emit_fetch_double(bld_base, result, result2); @@ -1269,26 +1328,47 @@ static LLVMValueRef fetch_constant( return result; } +/* Upper 16 bits must be zero. */ +static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm, + LLVMValueRef val[2]) +{ + return LLVMBuildOr(gallivm->builder, val[0], + LLVMBuildShl(gallivm->builder, val[1], + lp_build_const_int32(gallivm, 16), + ""), ""); +} + +/* Upper 16 bits are ignored and will be dropped. */ +static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm, + LLVMValueRef val[2]) +{ + LLVMValueRef v[2] = { + LLVMBuildAnd(gallivm->builder, val[0], + lp_build_const_int32(gallivm, 0xffff), ""), + val[1], + }; + return si_llvm_pack_two_int16(gallivm, v); +} + /* Initialize arguments for the shader export intrinsic */ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, LLVMValueRef *values, unsigned target, LLVMValueRef *args) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct lp_build_context *uint = - &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld; + &ctx->radeon_bld.soa.bld_base.uint_bld; struct lp_build_context *base = &bld_base->base; - unsigned compressed = 0; + struct gallivm_state *gallivm = base->gallivm; + LLVMBuilderRef builder = base->gallivm->builder; + LLVMValueRef val[4]; + unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR; unsigned chan; + bool is_int8; - /* XXX: This controls which components of the output - * registers actually get exported. (e.g bit 0 means export - * X component, bit 1 means export Y component, etc.) I'm - * hard coding this to 0xf for now. In the future, we might - * want to do something else. - */ - args[0] = lp_build_const_int32(base->gallivm, 0xf); + /* Default is 0xf. Adjusted below depending on the format. */ + args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */ /* Specify whether the EXEC mask represents the valid mask */ args[1] = uint->zero; @@ -1299,18 +1379,48 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, /* Specify the target we are exporting */ args[3] = lp_build_const_int32(base->gallivm, target); - if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) { + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + const union si_shader_key *key = &ctx->shader->key; + unsigned col_formats = key->ps.epilog.spi_shader_col_format; int cbuf = target - V_008DFC_SQ_EXP_MRT; - if (cbuf >= 0 && cbuf < 8) - compressed = (si_shader_ctx->shader->key.ps.export_16bpc >> cbuf) & 0x1; + assert(cbuf >= 0 && cbuf < 8); + spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; + is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1; } - /* Set COMPR flag */ - args[4] = compressed ? uint->one : uint->zero; + args[4] = uint->zero; /* COMPR flag */ + args[5] = base->undef; + args[6] = base->undef; + args[7] = base->undef; + args[8] = base->undef; + + switch (spi_shader_col_format) { + case V_028714_SPI_SHADER_ZERO: + args[0] = uint->zero; /* writemask */ + args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL); + break; + + case V_028714_SPI_SHADER_32_R: + args[0] = uint->one; /* writemask */ + args[5] = values[0]; + break; + + case V_028714_SPI_SHADER_32_GR: + args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */ + args[5] = values[0]; + args[6] = values[1]; + break; + + case V_028714_SPI_SHADER_32_AR: + args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */ + args[5] = values[0]; + args[8] = values[3]; + break; + + case V_028714_SPI_SHADER_FP16_ABGR: + args[4] = uint->one; /* COMPR flag */ - if (compressed) { - /* Pixel shader needs to pack output values before export */ for (chan = 0; chan < 2; chan++) { LLVMValueRef pack_args[2] = { values[2 * chan], @@ -1320,33 +1430,122 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, packed = lp_build_intrinsic(base->gallivm->builder, "llvm.SI.packf16", - LLVMInt32TypeInContext(base->gallivm->context), - pack_args, 2, + ctx->i32, pack_args, 2, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); args[chan + 5] = LLVMBuildBitCast(base->gallivm->builder, - packed, - LLVMFloatTypeInContext(base->gallivm->context), - ""); - args[chan + 7] = base->undef; + packed, ctx->f32, ""); } - } else + break; + + case V_028714_SPI_SHADER_UNORM16_ABGR: + for (chan = 0; chan < 4; chan++) { + val[chan] = radeon_llvm_saturate(bld_base, values[chan]); + val[chan] = LLVMBuildFMul(builder, val[chan], + lp_build_const_float(gallivm, 65535), ""); + val[chan] = LLVMBuildFAdd(builder, val[chan], + lp_build_const_float(gallivm, 0.5), ""); + val[chan] = LLVMBuildFPToUI(builder, val[chan], + ctx->i32, ""); + } + + args[4] = uint->one; /* COMPR flag */ + args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int16(gallivm, val)); + args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int16(gallivm, val+2)); + break; + + case V_028714_SPI_SHADER_SNORM16_ABGR: + for (chan = 0; chan < 4; chan++) { + /* Clamp between [-1, 1]. */ + val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN, + values[chan], + lp_build_const_float(gallivm, 1)); + val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX, + val[chan], + lp_build_const_float(gallivm, -1)); + /* Convert to a signed integer in [-32767, 32767]. */ + val[chan] = LLVMBuildFMul(builder, val[chan], + lp_build_const_float(gallivm, 32767), ""); + /* If positive, add 0.5, else add -0.5. */ + val[chan] = LLVMBuildFAdd(builder, val[chan], + LLVMBuildSelect(builder, + LLVMBuildFCmp(builder, LLVMRealOGE, + val[chan], base->zero, ""), + lp_build_const_float(gallivm, 0.5), + lp_build_const_float(gallivm, -0.5), ""), ""); + val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, ""); + } + + args[4] = uint->one; /* COMPR flag */ + args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int32_as_int16(gallivm, val)); + args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int32_as_int16(gallivm, val+2)); + break; + + case V_028714_SPI_SHADER_UINT16_ABGR: { + LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ? + 255 : 65535); + /* Clamp. */ + for (chan = 0; chan < 4; chan++) { + val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]); + val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN, + val[chan], max); + } + + args[4] = uint->one; /* COMPR flag */ + args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int16(gallivm, val)); + args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int16(gallivm, val+2)); + break; + } + + case V_028714_SPI_SHADER_SINT16_ABGR: { + LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ? + 127 : 32767); + LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ? + -128 : -32768); + /* Clamp. */ + for (chan = 0; chan < 4; chan++) { + val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]); + val[chan] = lp_build_emit_llvm_binary(bld_base, + TGSI_OPCODE_IMIN, + val[chan], max); + val[chan] = lp_build_emit_llvm_binary(bld_base, + TGSI_OPCODE_IMAX, + val[chan], min); + } + + args[4] = uint->one; /* COMPR flag */ + args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int32_as_int16(gallivm, val)); + args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, + si_llvm_pack_two_int32_as_int16(gallivm, val+2)); + break; + } + + case V_028714_SPI_SHADER_32_ABGR: memcpy(&args[5], values, sizeof(values[0]) * 4); + break; + } } static void si_alpha_test(struct lp_build_tgsi_context *bld_base, LLVMValueRef alpha) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - if (si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_NEVER) { - LLVMValueRef alpha_ref = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { + LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_ALPHA_REF); LLVMValueRef alpha_pass = lp_build_cmp(&bld_base->base, - si_shader_ctx->shader->key.ps.alpha_func, + ctx->shader->key.ps.epilog.alpha_func, alpha, alpha_ref); LLVMValueRef arg = lp_build_select(&bld_base->base, @@ -1354,36 +1553,33 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base, lp_build_const_float(gallivm, 1.0f), lp_build_const_float(gallivm, -1.0f)); - lp_build_intrinsic(gallivm->builder, - "llvm.AMDGPU.kill", - LLVMVoidTypeInContext(gallivm->context), - &arg, 1, 0); + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", + ctx->voidt, &arg, 1, 0); } else { - lp_build_intrinsic(gallivm->builder, - "llvm.AMDGPU.kilp", - LLVMVoidTypeInContext(gallivm->context), - NULL, 0, 0); + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp", + ctx->voidt, NULL, 0, 0); } } static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base, - LLVMValueRef alpha) + LLVMValueRef alpha, + unsigned samplemask_param) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef coverage; /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ - coverage = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - SI_PARAM_SAMPLE_COVERAGE); + coverage = LLVMGetParam(ctx->radeon_bld.main_fn, + samplemask_param); coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage); coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32", - bld_base->int_bld.elem_type, + ctx->i32, &coverage, 1, LLVMReadNoneAttribute); coverage = LLVMBuildUIToFP(gallivm->builder, coverage, - bld_base->base.elem_type, ""); + ctx->f32, ""); coverage = LLVMBuildFMul(gallivm->builder, coverage, lp_build_const_float(gallivm, @@ -1392,19 +1588,19 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context * return LLVMBuildFMul(gallivm->builder, alpha, coverage, ""); } -static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base, +static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base, LLVMValueRef (*pos)[9], LLVMValueRef *out_elts) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct lp_build_context *base = &bld_base->base; - struct lp_build_context *uint = &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld; + struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld; unsigned reg_index; unsigned chan; unsigned const_chan; LLVMValueRef base_elt; - LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); + LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, SI_DRIVER_STATE_CONST_BUF); - LLVMValueRef const_resource = build_indexed_load_const(si_shader_ctx, ptr, constbuf_index); + LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index); for (reg_index = 0; reg_index < 2; reg_index ++) { LLVMValueRef *args = pos[2 + reg_index]; @@ -1421,7 +1617,7 @@ static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base, ((reg_index * 4 + chan) * 4 + const_chan) * 4); base_elt = buffer_load_const(base->gallivm->builder, const_resource, - args[1], base->elem_type); + args[1], ctx->f32); args[5 + chan] = lp_build_add(base, args[5 + chan], lp_build_mul(base, base_elt, @@ -1462,7 +1658,7 @@ static void si_dump_streamout(struct pipe_stream_output_info *so) /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4. * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2), * or v4i32 (num_channels=3,4). */ -static void build_tbuffer_store(struct si_shader_context *shader, +static void build_tbuffer_store(struct si_shader_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, unsigned num_channels, @@ -1477,22 +1673,21 @@ static void build_tbuffer_store(struct si_shader_context *shader, unsigned slc, unsigned tfe) { - struct gallivm_state *gallivm = &shader->radeon_bld.gallivm; - LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; LLVMValueRef args[] = { rsrc, vdata, - LLVMConstInt(i32, num_channels, 0), + LLVMConstInt(ctx->i32, num_channels, 0), vaddr, soffset, - LLVMConstInt(i32, inst_offset, 0), - LLVMConstInt(i32, dfmt, 0), - LLVMConstInt(i32, nfmt, 0), - LLVMConstInt(i32, offen, 0), - LLVMConstInt(i32, idxen, 0), - LLVMConstInt(i32, glc, 0), - LLVMConstInt(i32, slc, 0), - LLVMConstInt(i32, tfe, 0) + LLVMConstInt(ctx->i32, inst_offset, 0), + LLVMConstInt(ctx->i32, dfmt, 0), + LLVMConstInt(ctx->i32, nfmt, 0), + LLVMConstInt(ctx->i32, offen, 0), + LLVMConstInt(ctx->i32, idxen, 0), + LLVMConstInt(ctx->i32, glc, 0), + LLVMConstInt(ctx->i32, slc, 0), + LLVMConstInt(ctx->i32, tfe, 0) }; /* The instruction offset field has 12 bits */ @@ -1504,12 +1699,11 @@ static void build_tbuffer_store(struct si_shader_context *shader, char name[256]; snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]); - lp_build_intrinsic(gallivm->builder, name, - LLVMVoidTypeInContext(gallivm->context), + lp_build_intrinsic(gallivm->builder, name, ctx->voidt, args, Elements(args), 0); } -static void build_tbuffer_store_dwords(struct si_shader_context *shader, +static void build_tbuffer_store_dwords(struct si_shader_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, unsigned num_channels, @@ -1525,30 +1719,28 @@ static void build_tbuffer_store_dwords(struct si_shader_context *shader, }; assert(num_channels >= 1 && num_channels <= 4); - build_tbuffer_store(shader, rsrc, vdata, num_channels, vaddr, soffset, + build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset, inst_offset, dfmt[num_channels-1], V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0); } /* On SI, the vertex shader is responsible for writing streamout data * to buffers. */ -static void si_llvm_emit_streamout(struct si_shader_context *shader, +static void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs, unsigned noutput) { - struct pipe_stream_output_info *so = &shader->shader->selector->so; - struct gallivm_state *gallivm = &shader->radeon_bld.gallivm; + struct pipe_stream_output_info *so = &ctx->shader->selector->so; + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; LLVMBuilderRef builder = gallivm->builder; int i, j; struct lp_build_if_state if_ctx; - LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); - /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ LLVMValueRef so_vtx_count = - unpack_param(shader, shader->param_streamout_config, 16, 7); + unpack_param(ctx, ctx->param_streamout_config, 16, 7); - LLVMValueRef tid = lp_build_intrinsic(builder, "llvm.SI.tid", i32, + LLVMValueRef tid = lp_build_intrinsic(builder, "llvm.SI.tid", ctx->i32, NULL, 0, LLVMReadNoneAttribute); /* can_emit = tid < so_vtx_count; */ @@ -1556,7 +1748,7 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); LLVMValueRef stream_id = - unpack_param(shader, shader->param_streamout_config, 24, 2); + unpack_param(ctx, ctx->param_streamout_config, 24, 2); /* Emit the streamout code conditionally. This actually avoids * out-of-bounds buffer access. The hw tells us via the SGPR @@ -1570,8 +1762,8 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, */ LLVMValueRef so_write_index = - LLVMGetParam(shader->radeon_bld.main_fn, - shader->param_streamout_write_index); + LLVMGetParam(ctx->radeon_bld.main_fn, + ctx->param_streamout_write_index); /* Compute (streamout_write_index + thread_id). */ so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); @@ -1582,12 +1774,12 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, if (!so->stride[i]) continue; - LLVMValueRef so_offset = LLVMGetParam(shader->radeon_bld.main_fn, - shader->param_streamout_offset[i]); - so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(i32, 4, 0), ""); + LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn, + ctx->param_streamout_offset[i]); + so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), ""); so_write_offset[i] = LLVMBuildMul(builder, so_write_index, - LLVMConstInt(i32, so->stride[i]*4, 0), ""); + LLVMConstInt(ctx->i32, so->stride[i]*4, 0), ""); so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, ""); } @@ -1612,7 +1804,7 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, for (j = 0; j < num_comps; j++) { out[j] = LLVMBuildBitCast(builder, outputs[reg].values[start+j], - i32, ""); + ctx->i32, ""); } /* Pack the output. */ @@ -1625,10 +1817,10 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, case 2: /* as v2i32 */ case 3: /* as v4i32 (aligned to 4) */ case 4: /* as v4i32 */ - vdata = LLVMGetUndef(LLVMVectorType(i32, util_next_power_of_two(num_comps))); + vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps))); for (j = 0; j < num_comps; j++) { vdata = LLVMBuildInsertElement(builder, vdata, out[j], - LLVMConstInt(i32, j, 0), ""); + LLVMConstInt(ctx->i32, j, 0), ""); } break; } @@ -1639,10 +1831,10 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader, lp_build_const_int32(gallivm, stream), ""); lp_build_if(&if_ctx_stream, gallivm, can_emit_stream); - build_tbuffer_store_dwords(shader, shader->so_buffers[buf_idx], + build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx], vdata, num_comps, so_write_offset[buf_idx], - LLVMConstInt(i32, 0, 0), + LLVMConstInt(ctx->i32, 0, 0), so->output[i].dst_offset*4); lp_build_endif(&if_ctx_stream); } @@ -1656,11 +1848,11 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base, struct si_shader_output_values *outputs, unsigned noutput) { - struct si_shader_context * si_shader_ctx = si_shader_context(bld_base); - struct si_shader * shader = si_shader_ctx->shader; - struct lp_build_context * base = &bld_base->base; - struct lp_build_context * uint = - &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld; + struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader *shader = ctx->shader; + struct lp_build_context *base = &bld_base->base; + struct lp_build_context *uint = + &ctx->radeon_bld.soa.bld_base.uint_bld; LLVMValueRef args[9]; LLVMValueRef pos_args[4][9] = { { 0 } }; LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; @@ -1670,8 +1862,8 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base, unsigned pos_idx; int i; - if (outputs && si_shader_ctx->shader->selector->so.num_outputs) { - si_llvm_emit_streamout(si_shader_ctx, outputs, noutput); + if (outputs && ctx->shader->selector->so.num_outputs) { + si_llvm_emit_streamout(ctx, outputs, noutput); } for (i = 0; i < noutput; i++) { @@ -1733,8 +1925,7 @@ handle_semantic: args, sizeof(args)); } else { lp_build_intrinsic(base->gallivm->builder, - "llvm.SI.export", - LLVMVoidTypeInContext(base->gallivm->context), + "llvm.SI.export", ctx->voidt, args, 9, 0); } @@ -1786,7 +1977,7 @@ handle_semantic: * with the first bit containing the edge flag. */ edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder, edgeflag_value, - bld_base->uint_bld.elem_type, ""); + ctx->i32, ""); edgeflag_value = lp_build_min(&bld_base->int_bld, edgeflag_value, bld_base->int_bld.one); @@ -1794,7 +1985,7 @@ handle_semantic: /* The LLVM intrinsic expects a float. */ pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder, edgeflag_value, - base->elem_type, ""); + ctx->f32, ""); } if (shader->selector->info.writes_layer) @@ -1820,28 +2011,25 @@ handle_semantic: /* Specify that this is the last export */ pos_args[i][2] = uint->one; - lp_build_intrinsic(base->gallivm->builder, - "llvm.SI.export", - LLVMVoidTypeInContext(base->gallivm->context), - pos_args[i], 9, 0); + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", + ctx->voidt, pos_args[i], 9, 0); } } -/* This only writes the tessellation factor levels. */ -static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) +static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, + LLVMValueRef rel_patch_id, + LLVMValueRef invocation_id, + LLVMValueRef tcs_out_current_patch_data_offset) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - struct si_shader *shader = si_shader_ctx->shader; + struct si_shader *shader = ctx->shader; unsigned tess_inner_index, tess_outer_index; - LLVMValueRef lds_base, lds_inner, lds_outer; - LLVMValueRef tf_base, rel_patch_id, byteoffset, buffer, rw_buffers; - LLVMValueRef out[6], vec0, vec1, invocation_id; + LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; + LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base; unsigned stride, outer_comps, inner_comps, i; struct lp_build_if_state if_ctx; - invocation_id = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5); - /* Do this only for invocation 0, because the tess levels are per-patch, * not per-vertex. * @@ -1853,7 +2041,7 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) invocation_id, bld_base->uint_bld.zero, "")); /* Determine the layout of one tess factor element in the buffer. */ - switch (shader->key.tcs.prim_mode) { + switch (shader->key.tcs.epilog.prim_mode) { case PIPE_PRIM_LINES: stride = 2; /* 2 dwords, 1 vec2 store */ outer_comps = 2; @@ -1880,7 +2068,7 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0); tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0); - lds_base = get_tcs_out_current_patch_data_offset(si_shader_ctx); + lds_base = tcs_out_current_patch_data_offset; lds_inner = LLVMBuildAdd(gallivm->builder, lds_base, lp_build_const_int32(gallivm, tess_inner_index * 4), ""); @@ -1901,45 +2089,95 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) vec1 = lp_build_gather_values(gallivm, out+4, stride - 4); /* Get the buffer. */ - rw_buffers = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS); - buffer = build_indexed_load_const(si_shader_ctx, rw_buffers, + buffer = build_indexed_load_const(ctx, rw_buffers, lp_build_const_int32(gallivm, SI_RING_TESS_FACTOR)); /* Get the offset. */ - tf_base = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + tf_base = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_TESS_FACTOR_OFFSET); - rel_patch_id = get_rel_patch_id(si_shader_ctx); byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id, lp_build_const_int32(gallivm, 4 * stride), ""); /* Store the outputs. */ - build_tbuffer_store_dwords(si_shader_ctx, buffer, vec0, + build_tbuffer_store_dwords(ctx, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, 0); if (vec1) - build_tbuffer_store_dwords(si_shader_ctx, buffer, vec1, + build_tbuffer_store_dwords(ctx, buffer, vec1, stride - 4, byteoffset, tf_base, 16); lp_build_endif(&if_ctx); } -static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base) +/* This only writes the tessellation factor levels. */ +static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; + + rel_patch_id = get_rel_patch_id(ctx); + invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); + tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); + + if (!ctx->is_monolithic) { + /* Return epilog parameters from this function. */ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef ret = ctx->return_value; + LLVMValueRef rw_buffers, rw0, rw1, tf_soffset; + unsigned vgpr; + + /* RW_BUFFERS pointer */ + rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_RW_BUFFERS); + rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, ""); + rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, ""); + rw0 = LLVMBuildExtractElement(builder, rw_buffers, + bld_base->uint_bld.zero, ""); + rw1 = LLVMBuildExtractElement(builder, rw_buffers, + bld_base->uint_bld.one, ""); + ret = LLVMBuildInsertValue(builder, ret, rw0, 0, ""); + ret = LLVMBuildInsertValue(builder, ret, rw1, 1, ""); + + /* Tess factor buffer soffset is after user SGPRs. */ + tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_TESS_FACTOR_OFFSET); + ret = LLVMBuildInsertValue(builder, ret, tf_soffset, + SI_TCS_NUM_USER_SGPR, ""); + + /* VGPRs */ + rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id); + invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id); + tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset); + + vgpr = SI_TCS_NUM_USER_SGPR + 1; + ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + ctx->return_value = ret; + return; + } + + si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset); +} + +static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); - struct si_shader *shader = si_shader_ctx->shader; + struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader *shader = ctx->shader; struct tgsi_shader_info *info = &shader->selector->info; struct gallivm_state *gallivm = bld_base->base.gallivm; unsigned i, chan; - LLVMValueRef vertex_id = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - si_shader_ctx->param_rel_auto_id); + LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn, + ctx->param_rel_auto_id); LLVMValueRef vertex_dw_stride = - unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8); + unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8); LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id, vertex_dw_stride, ""); /* Write outputs to LDS. The next shader (TCS aka HS) will read * its inputs from it. */ for (i = 0; i < info->num_outputs; i++) { - LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i]; + LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i]; unsigned name = info->output_semantic_name[i]; unsigned index = info->output_semantic_index[i]; int param = si_shader_io_get_unique_index(name, index); @@ -1953,21 +2191,20 @@ static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base) } } -static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base) +static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - struct si_shader *es = si_shader_ctx->shader; + struct si_shader *es = ctx->shader; struct tgsi_shader_info *info = &es->selector->info; - LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); - LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - si_shader_ctx->param_es2gs_offset); + LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn, + ctx->param_es2gs_offset); unsigned chan; int i; for (i = 0; i < info->num_outputs; i++) { LLVMValueRef *out_ptr = - si_shader_ctx->radeon_bld.soa.outputs[i]; + ctx->radeon_bld.soa.outputs[i]; int param_index; if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || @@ -1979,12 +2216,12 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base) for (chan = 0; chan < 4; chan++) { LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""); - out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, ""); + out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, ""); - build_tbuffer_store(si_shader_ctx, - si_shader_ctx->esgs_ring, + build_tbuffer_store(ctx, + ctx->esgs_ring, out_val, 1, - LLVMGetUndef(i32), soffset, + LLVMGetUndef(ctx->i32), soffset, (4 * param_index + chan) * 4, V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_NUM_FORMAT_UINT, @@ -1995,25 +2232,26 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base) static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef args[2]; args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE); - args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); + args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", - LLVMVoidTypeInContext(gallivm->context), args, 2, - LLVMNoUnwindAttribute); + ctx->voidt, args, 2, LLVMNoUnwindAttribute); } -static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) +static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - struct tgsi_shader_info *info = &si_shader_ctx->shader->selector->info; + struct tgsi_shader_info *info = &ctx->shader->selector->info; struct si_shader_output_values *outputs = NULL; int i,j; + assert(!ctx->is_gs_copy_shader); + outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); /* Vertex color clamping. @@ -2022,8 +2260,7 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) * an IF statement is added that clamps all colors if the constant * is true. */ - if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && - !si_shader_ctx->shader->is_gs_copy_shader) { + if (ctx->type == TGSI_PROCESSOR_VERTEX) { struct lp_build_if_state if_ctx; LLVMValueRef cond = NULL; LLVMValueRef addr, val; @@ -2036,15 +2273,15 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) /* We've found a color. */ if (!cond) { /* The state is in the first bit of the user SGPR. */ - cond = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + cond = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VS_STATE_BITS); cond = LLVMBuildTrunc(gallivm->builder, cond, - LLVMInt1TypeInContext(gallivm->context), ""); + ctx->i1, ""); lp_build_if(&if_ctx, gallivm, cond); } for (j = 0; j < 4; j++) { - addr = si_shader_ctx->radeon_bld.soa.outputs[i][j]; + addr = ctx->radeon_bld.soa.outputs[i][j]; val = LLVMBuildLoad(gallivm->builder, addr, ""); val = radeon_llvm_saturate(bld_base, val); LLVMBuildStore(gallivm->builder, val, addr); @@ -2062,20 +2299,30 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) for (j = 0; j < 4; j++) outputs[i].values[j] = LLVMBuildLoad(gallivm->builder, - si_shader_ctx->radeon_bld.soa.outputs[i][j], + ctx->radeon_bld.soa.outputs[i][j], ""); } - /* Export PrimitiveID when PS needs it. */ - if (si_vs_exports_prim_id(si_shader_ctx->shader)) { - outputs[i].name = TGSI_SEMANTIC_PRIMID; - outputs[i].sid = 0; - outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, - get_primitive_id(bld_base, 0)); - outputs[i].values[1] = bld_base->base.undef; - outputs[i].values[2] = bld_base->base.undef; - outputs[i].values[3] = bld_base->base.undef; - i++; + if (ctx->is_monolithic) { + /* Export PrimitiveID when PS needs it. */ + if (si_vs_exports_prim_id(ctx->shader)) { + outputs[i].name = TGSI_SEMANTIC_PRIMID; + outputs[i].sid = 0; + outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, + get_primitive_id(bld_base, 0)); + outputs[i].values[1] = bld_base->base.undef; + outputs[i].values[2] = bld_base->base.undef; + outputs[i].values[3] = bld_base->base.undef; + i++; + } + } else { + /* Return the primitive ID from the LLVM function. */ + ctx->return_value = + LLVMBuildInsertValue(gallivm->builder, + ctx->return_value, + bitcast(bld_base, TGSI_TYPE_FLOAT, + get_primitive_id(bld_base, 0)), + VS_EPILOG_PRIMID_LOC, ""); } si_llvm_export_vs(bld_base, outputs, i); @@ -2086,7 +2333,7 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, LLVMValueRef depth, LLVMValueRef stencil, LLVMValueRef samplemask) { - struct si_screen *sscreen = si_shader_context(bld_base)->screen; + struct si_shader_context *ctx = si_shader_context(bld_base); struct lp_build_context *base = &bld_base->base; struct lp_build_context *uint = &bld_base->uint_bld; LLVMValueRef args[9]; @@ -2123,71 +2370,89 @@ static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, /* SI (except OLAND) has a bug that it only looks * at the X writemask component. */ - if (sscreen->b.chip_class == SI && - sscreen->b.family != CHIP_OLAND) + if (ctx->screen->b.chip_class == SI && + ctx->screen->b.family != CHIP_OLAND) mask |= 0x1; /* Specify which components to enable */ args[0] = lp_build_const_int32(base->gallivm, mask); lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", - LLVMVoidTypeInContext(base->gallivm->context), - args, 9, 0); + ctx->voidt, args, 9, 0); } static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, LLVMValueRef *color, unsigned index, + unsigned samplemask_param, bool is_last) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct lp_build_context *base = &bld_base->base; - LLVMValueRef args[9]; int i; /* Clamp color */ - if (si_shader_ctx->shader->key.ps.clamp_color) + if (ctx->shader->key.ps.epilog.clamp_color) for (i = 0; i < 4; i++) color[i] = radeon_llvm_saturate(bld_base, color[i]); /* Alpha to one */ - if (si_shader_ctx->shader->key.ps.alpha_to_one) + if (ctx->shader->key.ps.epilog.alpha_to_one) color[3] = base->one; /* Alpha test */ if (index == 0 && - si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS) + ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) si_alpha_test(bld_base, color[3]); /* Line & polygon smoothing */ - if (si_shader_ctx->shader->key.ps.poly_line_smoothing) - color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3]); + if (ctx->shader->key.ps.epilog.poly_line_smoothing) + color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3], + samplemask_param); /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (index == 0 && - si_shader_ctx->shader->key.ps.last_cbuf > 0) { - for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) { + if (ctx->shader->key.ps.epilog.last_cbuf > 0) { + LLVMValueRef args[8][9]; + int c, last = -1; + + /* Get the export arguments, also find out what the last one is. */ + for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) { si_llvm_init_export_args(bld_base, color, - V_008DFC_SQ_EXP_MRT + c, args); - lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", - LLVMVoidTypeInContext(base->gallivm->context), - args, 9, 0); + V_008DFC_SQ_EXP_MRT + c, args[c]); + if (args[c][0] != bld_base->uint_bld.zero) + last = c; } - } - /* Export */ - si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index, - args); - if (is_last) { - args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */ - args[2] = bld_base->uint_bld.one; /* DONE bit */ + /* Emit all exports. */ + for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) { + if (is_last && last == c) { + args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */ + args[c][2] = bld_base->uint_bld.one; /* DONE bit */ + } else if (args[c][0] == bld_base->uint_bld.zero) + continue; /* unnecessary NULL export */ + + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", + ctx->voidt, args[c], 9, 0); + } + } else { + LLVMValueRef args[9]; + + /* Export */ + si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index, + args); + if (is_last) { + args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */ + args[2] = bld_base->uint_bld.one; /* DONE bit */ + } else if (args[0] == bld_base->uint_bld.zero) + return; /* unnecessary NULL export */ + + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", + ctx->voidt, args, 9, 0); } - lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", - LLVMVoidTypeInContext(base->gallivm->context), - args, 9, 0); } static void si_export_null(struct lp_build_tgsi_context *bld_base) { + struct si_shader_context *ctx = si_shader_context(bld_base); struct lp_build_context *base = &bld_base->base; struct lp_build_context *uint = &bld_base->uint_bld; LLVMValueRef args[9]; @@ -2203,34 +2468,57 @@ static void si_export_null(struct lp_build_tgsi_context *bld_base) args[8] = uint->undef; /* A */ lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", - LLVMVoidTypeInContext(base->gallivm->context), - args, 9, 0); + ctx->voidt, args, 9, 0); } -static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) +static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) { - struct si_shader_context * si_shader_ctx = si_shader_context(bld_base); - struct si_shader * shader = si_shader_ctx->shader; - struct lp_build_context * base = &bld_base->base; + struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader *shader = ctx->shader; + struct lp_build_context *base = &bld_base->base; struct tgsi_shader_info *info = &shader->selector->info; LLVMBuilderRef builder = base->gallivm->builder; LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; int last_color_export = -1; int i; - /* If there are no outputs, add a dummy export. */ - if (!info->num_outputs) { - si_export_null(bld_base); - return; - } - /* Determine the last export. If MRTZ is present, it's always last. * Otherwise, find the last color export. */ - if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) - for (i = 0; i < info->num_outputs; i++) - if (info->output_semantic_name[i] == TGSI_SEMANTIC_COLOR) + if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) { + unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format; + + /* Don't export NULL and return if alpha-test is enabled. */ + if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS && + shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER && + (spi_format & 0xf) == 0) + spi_format |= V_028714_SPI_SHADER_32_AR; + + for (i = 0; i < info->num_outputs; i++) { + unsigned index = info->output_semantic_index[i]; + + if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR) + continue; + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ + if (shader->key.ps.epilog.last_cbuf > 0) { + /* Just set this if any of the colorbuffers are enabled. */ + if (spi_format & + ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1)) + last_color_export = i; + continue; + } + + if ((spi_format >> (index * 4)) & 0xf) last_color_export = i; + } + + /* If there are no outputs, export NULL. */ + if (last_color_export == -1) { + si_export_null(bld_base); + return; + } + } for (i = 0; i < info->num_outputs; i++) { unsigned semantic_name = info->output_semantic_name[i]; @@ -2242,22 +2530,23 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) switch (semantic_name) { case TGSI_SEMANTIC_POSITION: depth = LLVMBuildLoad(builder, - si_shader_ctx->radeon_bld.soa.outputs[i][2], ""); + ctx->radeon_bld.soa.outputs[i][2], ""); break; case TGSI_SEMANTIC_STENCIL: stencil = LLVMBuildLoad(builder, - si_shader_ctx->radeon_bld.soa.outputs[i][1], ""); + ctx->radeon_bld.soa.outputs[i][1], ""); break; case TGSI_SEMANTIC_SAMPLEMASK: samplemask = LLVMBuildLoad(builder, - si_shader_ctx->radeon_bld.soa.outputs[i][0], ""); + ctx->radeon_bld.soa.outputs[i][0], ""); break; case TGSI_SEMANTIC_COLOR: for (j = 0; j < 4; j++) color[j] = LLVMBuildLoad(builder, - si_shader_ctx->radeon_bld.soa.outputs[i][j], ""); + ctx->radeon_bld.soa.outputs[i][j], ""); si_export_mrt_color(bld_base, color, semantic_index, + SI_PARAM_SAMPLE_COVERAGE, last_color_export == i); break; default: @@ -2271,9 +2560,103 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) si_export_mrt_z(bld_base, depth, stencil, samplemask); } -static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data); +/** + * Return PS outputs in this order: + * + * v[0:3] = color0.xyzw + * v[4:7] = color1.xyzw + * ... + * vN+0 = Depth + * vN+1 = Stencil + * vN+2 = SampleMask + * vN+3 = SampleMaskIn (used for OpenGL smoothing) + * + * The alpha-ref SGPR is returned via its original location. + */ +static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader *shader = ctx->shader; + struct lp_build_context *base = &bld_base->base; + struct tgsi_shader_info *info = &shader->selector->info; + LLVMBuilderRef builder = base->gallivm->builder; + unsigned i, j, first_vgpr, vgpr; + + LLVMValueRef color[8][4] = {}; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + LLVMValueRef ret; + + /* Read the output values. */ + for (i = 0; i < info->num_outputs; i++) { + unsigned semantic_name = info->output_semantic_name[i]; + unsigned semantic_index = info->output_semantic_index[i]; + + switch (semantic_name) { + case TGSI_SEMANTIC_COLOR: + assert(semantic_index < 8); + for (j = 0; j < 4; j++) { + LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j]; + LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); + color[semantic_index][j] = result; + } + break; + case TGSI_SEMANTIC_POSITION: + depth = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][2], ""); + break; + case TGSI_SEMANTIC_STENCIL: + stencil = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][1], ""); + break; + case TGSI_SEMANTIC_SAMPLEMASK: + samplemask = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][0], ""); + break; + default: + fprintf(stderr, "Warning: SI unhandled fs output type:%d\n", + semantic_name); + } + } + + /* Fill the return structure. */ + ret = ctx->return_value; + + /* Set SGPRs. */ + ret = LLVMBuildInsertValue(builder, ret, + bitcast(bld_base, TGSI_TYPE_SIGNED, + LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_ALPHA_REF)), + SI_SGPR_ALPHA_REF, ""); + + /* Set VGPRs */ + first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; + for (i = 0; i < ARRAY_SIZE(color); i++) { + if (!color[i][0]) + continue; + + for (j = 0; j < 4; j++) + ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); + } + if (depth) + ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); + if (stencil) + ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); + if (samplemask) + ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); + + /* Add the input sample mask for smoothing at the end. */ + if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) + vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; + ret = LLVMBuildInsertValue(builder, ret, + LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); + + ctx->return_value = ret; +} + +static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data); static bool tgsi_is_array_sampler(unsigned target) { @@ -2286,20 +2669,20 @@ static bool tgsi_is_array_sampler(unsigned target) target == TGSI_TEXTURE_2D_ARRAY_MSAA; } -static void set_tex_fetch_args(struct gallivm_state *gallivm, +static void set_tex_fetch_args(struct si_shader_context *ctx, struct lp_build_emit_data *emit_data, unsigned opcode, unsigned target, LLVMValueRef res_ptr, LLVMValueRef samp_ptr, LLVMValueRef *param, unsigned count, unsigned dmask) { + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; unsigned num_args; unsigned is_rect = target == TGSI_TEXTURE_RECT; - LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); /* Pad to power of two vector */ while (count < util_next_power_of_two(count)) - param[count++] = LLVMGetUndef(i32); + param[count++] = LLVMGetUndef(ctx->i32); /* Texture coordinates. */ if (count > 1) @@ -2312,10 +2695,9 @@ static void set_tex_fetch_args(struct gallivm_state *gallivm, num_args = 2; if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ) - emit_data->dst_type = LLVMVectorType(i32, 4); + emit_data->dst_type = ctx->v4i32; else { - emit_data->dst_type = LLVMVectorType( - LLVMFloatTypeInContext(gallivm->context), 4); + emit_data->dst_type = ctx->v4f32; emit_data->args[num_args++] = samp_ptr; } @@ -2335,14 +2717,66 @@ static void set_tex_fetch_args(struct gallivm_state *gallivm, static const struct lp_build_tgsi_action tex_action; -static void tex_fetch_ptrs( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data, +enum desc_type { + DESC_IMAGE, + DESC_FMASK, + DESC_SAMPLER +}; + +static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) +{ + return LLVMPointerType(LLVMArrayType(elem_type, num_elements), + CONST_ADDR_SPACE); +} + +/** + * Load an image view, fmask view. or sampler state descriptor. + */ +static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx, + LLVMValueRef list, LLVMValueRef index, + enum desc_type type) +{ + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + LLVMBuilderRef builder = gallivm->builder; + + switch (type) { + case DESC_IMAGE: + /* The image is at [0:7]. */ + index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); + break; + case DESC_FMASK: + /* The FMASK is at [8:15]. */ + index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); + index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), ""); + break; + case DESC_SAMPLER: + /* The sampler state is at [12:15]. */ + index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); + index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), ""); + list = LLVMBuildPointerCast(builder, list, + const_array(ctx->v4i32, 0), ""); + break; + } + + return build_indexed_load_const(ctx, list, index); +} + +static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx, + LLVMValueRef index, enum desc_type type) +{ + LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_SAMPLERS); + + return get_sampler_desc_custom(ctx, list, index, type); +} + +static void tex_fetch_ptrs( + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data, LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); - struct gallivm_state *gallivm = bld_base->base.gallivm; - const struct tgsi_full_instruction * inst = emit_data->inst; + struct si_shader_context *ctx = si_shader_context(bld_base); + const struct tgsi_full_instruction *inst = emit_data->inst; unsigned target = inst->Texture.Texture; unsigned sampler_src; unsigned sampler_index; @@ -2354,37 +2788,33 @@ static void tex_fetch_ptrs( const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src]; LLVMValueRef ind_index; - ind_index = get_indirect_index(si_shader_ctx, ®->Indirect, reg->Register.Index); - - *res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS); - *res_ptr = build_indexed_load_const(si_shader_ctx, *res_ptr, ind_index); + ind_index = get_indirect_index(ctx, ®->Indirect, reg->Register.Index); - *samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES); - *samp_ptr = build_indexed_load_const(si_shader_ctx, *samp_ptr, ind_index); + *res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE); if (target == TGSI_TEXTURE_2D_MSAA || target == TGSI_TEXTURE_2D_ARRAY_MSAA) { - ind_index = LLVMBuildAdd(gallivm->builder, ind_index, - lp_build_const_int32(gallivm, - SI_FMASK_TEX_OFFSET), ""); - *fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS); - *fmask_ptr = build_indexed_load_const(si_shader_ctx, *fmask_ptr, ind_index); + *samp_ptr = NULL; + *fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK); + } else { + *samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER); + *fmask_ptr = NULL; } } else { - *res_ptr = si_shader_ctx->sampler_views[sampler_index]; - *samp_ptr = si_shader_ctx->sampler_states[sampler_index]; - *fmask_ptr = si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + sampler_index]; + *res_ptr = ctx->sampler_views[sampler_index]; + *samp_ptr = ctx->sampler_states[sampler_index]; + *fmask_ptr = ctx->fmasks[sampler_index]; } } static void tex_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMBuilderRef builder = gallivm->builder; - const struct tgsi_full_instruction * inst = emit_data->inst; + const struct tgsi_full_instruction *inst = emit_data->inst; unsigned opcode = inst->Instruction.Opcode; unsigned target = inst->Texture.Texture; LLVMValueRef coords[5], derivs[6]; @@ -2396,21 +2826,18 @@ static void tex_fetch_args( unsigned num_deriv_channels = 0; bool has_offset = inst->Texture.NumOffsets > 0; LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL; - LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); unsigned dmask = 0xf; tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr); if (opcode == TGSI_OPCODE_TXQ) { if (target == TGSI_TEXTURE_BUFFER) { - LLVMTypeRef v8i32 = LLVMVectorType(i32, 8); - /* Read the size from the buffer descriptor directly. */ - LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, v8i32, ""); + LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, ""); LLVMValueRef size = LLVMBuildExtractElement(builder, res, lp_build_const_int32(gallivm, 6), ""); - if (si_shader_ctx->screen->b.chip_class >= VI) { + if (ctx->screen->b.chip_class >= VI) { /* On VI, the descriptor contains the size in bytes, * but TXQ must return the size in elements. * The stride is always non-zero for resources using TXQ. @@ -2433,24 +2860,21 @@ static void tex_fetch_args( /* Textures - set the mip level. */ address[count++] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X); - set_tex_fetch_args(gallivm, emit_data, opcode, target, res_ptr, + set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr, NULL, address, count, 0xf); return; } if (target == TGSI_TEXTURE_BUFFER) { - LLVMTypeRef i128 = LLVMIntTypeInContext(gallivm->context, 128); - LLVMTypeRef v2i128 = LLVMVectorType(i128, 2); - LLVMTypeRef i8 = LLVMInt8TypeInContext(gallivm->context); - LLVMTypeRef v16i8 = LLVMVectorType(i8, 16); + LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2); /* Bitcast and truncate v8i32 to v16i8. */ LLVMValueRef res = res_ptr; res = LLVMBuildBitCast(gallivm->builder, res, v2i128, ""); res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, ""); - res = LLVMBuildBitCast(gallivm->builder, res, v16i8, ""); + res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, ""); - emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); + emit_data->dst_type = ctx->v4f32; emit_data->args[0] = res; emit_data->args[1] = bld_base->uint_bld.zero; emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); @@ -2587,7 +3011,7 @@ static void tex_fetch_args( for (chan = 0; chan < count; chan++ ) { address[chan] = LLVMBuildBitCast(gallivm->builder, - address[chan], i32, ""); + address[chan], ctx->i32, ""); } /* Adjust the sample index according to FMASK. @@ -2624,14 +3048,14 @@ static void tex_fetch_args( inst.Texture.Texture = target; txf_emit_data.inst = &inst; txf_emit_data.chan = 0; - set_tex_fetch_args(gallivm, &txf_emit_data, TGSI_OPCODE_TXF, + set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF, target, fmask_ptr, NULL, txf_address, txf_count, 0xf); build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data); /* Initialize some constants. */ - LLVMValueRef four = LLVMConstInt(uint_bld->elem_type, 4, 0); - LLVMValueRef F = LLVMConstInt(uint_bld->elem_type, 0xF, 0); + LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0); + LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0); /* Apply the formula. */ LLVMValueRef fmask = @@ -2655,7 +3079,7 @@ static void tex_fetch_args( */ LLVMValueRef fmask_desc = LLVMBuildBitCast(gallivm->builder, fmask_ptr, - LLVMVectorType(uint_bld->elem_type, 8), ""); + ctx->v8i32, ""); LLVMValueRef fmask_word1 = LLVMBuildExtractElement(gallivm->builder, fmask_desc, @@ -2676,7 +3100,7 @@ static void tex_fetch_args( if (inst->Texture.NumOffsets) { struct lp_build_context *uint_bld = &bld_base->uint_bld; struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); - const struct tgsi_texture_offset * off = inst->TexOffsets; + const struct tgsi_texture_offset *off = inst->TexOffsets; assert(inst->Texture.NumOffsets == 1); @@ -2735,15 +3159,15 @@ static void tex_fetch_args( dmask = 1 << gather_comp; } - set_tex_fetch_args(gallivm, emit_data, opcode, target, res_ptr, + set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr, samp_ptr, address, count, dmask); } -static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) +static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { - struct lp_build_context * base = &bld_base->base; + struct lp_build_context *base = &bld_base->base; unsigned opcode = emit_data->inst->Instruction.Opcode; unsigned target = emit_data->inst->Texture.Texture; char intr_name[127]; @@ -2844,14 +3268,13 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action, } static void si_llvm_emit_txqs( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); - LLVMTypeRef v8i32 = LLVMVectorType(i32, 8); LLVMValueRef res, samples; LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL; @@ -2859,7 +3282,7 @@ static void si_llvm_emit_txqs( /* Read the samples from the descriptor directly. */ - res = LLVMBuildBitCast(builder, res_ptr, v8i32, ""); + res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, ""); samples = LLVMBuildExtractElement( builder, res, lp_build_const_int32(gallivm, 3), ""); @@ -2903,30 +3326,26 @@ static void si_llvm_emit_txqs( #define TID_MASK_LEFT 0xfffffffe static void si_llvm_emit_ddxy( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - struct lp_build_context * base = &bld_base->base; const struct tgsi_full_instruction *inst = emit_data->inst; unsigned opcode = inst->Instruction.Opcode; LLVMValueRef indices[2]; LLVMValueRef store_ptr, load_ptr0, load_ptr1; LLVMValueRef tl, trbl, result[4]; - LLVMTypeRef i32; unsigned swizzle[4]; unsigned c; int idx; unsigned mask; - i32 = LLVMInt32TypeInContext(gallivm->context); - indices[0] = bld_base->uint_bld.zero; - indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32, + indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", ctx->i32, NULL, 0, LLVMReadNoneAttribute); - store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds, indices, 2, ""); if (opcode == TGSI_OPCODE_DDX_FINE) @@ -2938,14 +3357,14 @@ static void si_llvm_emit_ddxy( indices[1] = LLVMBuildAnd(gallivm->builder, indices[1], lp_build_const_int32(gallivm, mask), ""); - load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds, indices, 2, ""); /* for DDX we want to next X pixel, DDY next Y pixel. */ idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2; indices[1] = LLVMBuildAdd(gallivm->builder, indices[1], lp_build_const_int32(gallivm, idx), ""); - load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds, indices, 2, ""); for (c = 0; c < 4; ++c) { @@ -2964,14 +3383,14 @@ static void si_llvm_emit_ddxy( LLVMBuildStore(gallivm->builder, LLVMBuildBitCast(gallivm->builder, lp_build_emit_fetch(bld_base, inst, 0, c), - i32, ""), + ctx->i32, ""), store_ptr); tl = LLVMBuildLoad(gallivm->builder, load_ptr0, ""); - tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, ""); + tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, ""); trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, ""); - trbl = LLVMBuildBitCast(gallivm->builder, trbl, base->elem_type, ""); + trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, ""); result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, ""); } @@ -2988,21 +3407,17 @@ static LLVMValueRef si_llvm_emit_ddxy_interp( struct lp_build_tgsi_context *bld_base, LLVMValueRef interp_ij) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - struct lp_build_context *base = &bld_base->base; LLVMValueRef indices[2]; LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2; LLVMValueRef tl, tr, bl, result[4]; - LLVMTypeRef i32; unsigned c; - i32 = LLVMInt32TypeInContext(gallivm->context); - indices[0] = bld_base->uint_bld.zero; - indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32, + indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", ctx->i32, NULL, 0, LLVMReadNoneAttribute); - store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds, indices, 2, ""); temp = LLVMBuildAnd(gallivm->builder, indices[1], @@ -3012,21 +3427,21 @@ static LLVMValueRef si_llvm_emit_ddxy_interp( lp_build_const_int32(gallivm, TID_MASK_TOP), ""); indices[1] = temp; - load_ptr_x = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds, indices, 2, ""); indices[1] = temp2; - load_ptr_y = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds, indices, 2, ""); indices[1] = LLVMBuildAdd(gallivm->builder, temp, lp_build_const_int32(gallivm, 1), ""); - load_ptr_ddx = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds, indices, 2, ""); indices[1] = LLVMBuildAdd(gallivm->builder, temp2, lp_build_const_int32(gallivm, 2), ""); - load_ptr_ddy = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds, + load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds, indices, 2, ""); for (c = 0; c < 2; ++c) { @@ -3040,18 +3455,18 @@ static LLVMValueRef si_llvm_emit_ddxy_interp( store_ptr); tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, ""); - tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, ""); + tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, ""); tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, ""); - tr = LLVMBuildBitCast(gallivm->builder, tr, base->elem_type, ""); + tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, ""); result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, ""); tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, ""); - tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, ""); + tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, ""); bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, ""); - bl = LLVMBuildBitCast(gallivm->builder, bl, base->elem_type, ""); + bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, ""); result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, ""); } @@ -3063,7 +3478,7 @@ static void interp_fetch_args( struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; const struct tgsi_full_instruction *inst = emit_data->inst; @@ -3087,9 +3502,8 @@ static void interp_fetch_args( sample_id = lp_build_emit_fetch(bld_base, emit_data->inst, 1, TGSI_CHAN_X); sample_id = LLVMBuildBitCast(gallivm->builder, sample_id, - LLVMInt32TypeInContext(gallivm->context), - ""); - sample_position = load_sample_position(&si_shader_ctx->radeon_bld, sample_id); + ctx->i32, ""); + sample_position = load_sample_position(&ctx->radeon_bld, sample_id); emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder, sample_position, @@ -3108,23 +3522,22 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); - struct si_shader *shader = si_shader_ctx->shader; + struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader *shader = ctx->shader; struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef interp_param; const struct tgsi_full_instruction *inst = emit_data->inst; const char *intr_name; - int input_index; + int input_index = inst->Src[0].Register.Index; int chan; int i; LLVMValueRef attr_number; - LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context); - LLVMValueRef params = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK); + LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK); int interp_param_idx; + unsigned interp = shader->selector->info.input_interpolate[input_index]; unsigned location; assert(inst->Src[0].Register.File == TGSI_FILE_INPUT); - input_index = inst->Src[0].Register.Index; if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) @@ -3132,17 +3545,15 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, else location = TGSI_INTERPOLATE_LOC_CENTROID; - interp_param_idx = lookup_interp_param_index(shader->ps_input_interpolate[input_index], - location); + interp_param_idx = lookup_interp_param_index(interp, location); if (interp_param_idx == -1) return; else if (interp_param_idx) - interp_param = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, interp_param_idx); + interp_param = LLVMGetParam(ctx->radeon_bld.main_fn, interp_param_idx); else interp_param = NULL; - attr_number = lp_build_const_int32(gallivm, - shader->ps_input_param_offset[input_index]); + attr_number = lp_build_const_int32(gallivm, input_index); if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { @@ -3169,7 +3580,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, LLVMValueRef temp1, temp2; interp_el = LLVMBuildBitCast(gallivm->builder, interp_el, - LLVMFloatTypeInContext(gallivm->context), ""); + ctx->f32, ""); temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], ""); @@ -3180,8 +3591,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, ""); ij_out[i] = LLVMBuildBitCast(gallivm->builder, - temp2, - LLVMIntTypeInContext(gallivm->context, 32), ""); + temp2, ctx->i32, ""); } interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2); } @@ -3202,7 +3612,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, emit_data->output[chan] = lp_build_intrinsic(gallivm->builder, intr_name, - input_type, args, args[3] ? 4 : 3, + ctx->f32, args, args[3] ? 4 : 3, LLVMReadNoneAttribute | LLVMNoUnwindAttribute); } } @@ -3226,13 +3636,12 @@ static void si_llvm_emit_vertex( struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct lp_build_context *uint = &bld_base->uint_bld; - struct si_shader *shader = si_shader_ctx->shader; + struct si_shader *shader = ctx->shader; struct tgsi_shader_info *info = &shader->selector->info; struct gallivm_state *gallivm = bld_base->base.gallivm; - LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); - LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS2VS_OFFSET); LLVMValueRef gs_next_vertex; LLVMValueRef can_emit, kill; @@ -3245,7 +3654,7 @@ static void si_llvm_emit_vertex( /* Write vertex attribute values to GSVS ring */ gs_next_vertex = LLVMBuildLoad(gallivm->builder, - si_shader_ctx->gs_next_vertex[stream], + ctx->gs_next_vertex[stream], ""); /* If this thread has already emitted the declared maximum number of @@ -3261,11 +3670,11 @@ static void si_llvm_emit_vertex( lp_build_const_float(gallivm, -1.0f)); lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", - LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0); + ctx->voidt, &kill, 1, 0); for (i = 0; i < info->num_outputs; i++) { LLVMValueRef *out_ptr = - si_shader_ctx->radeon_bld.soa.outputs[i]; + ctx->radeon_bld.soa.outputs[i]; for (chan = 0; chan < 4; chan++) { LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""); @@ -3276,10 +3685,10 @@ static void si_llvm_emit_vertex( voffset = lp_build_add(uint, voffset, gs_next_vertex); voffset = lp_build_mul_imm(uint, voffset, 4); - out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, ""); + out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, ""); - build_tbuffer_store(si_shader_ctx, - si_shader_ctx->gsvs_ring[stream], + build_tbuffer_store(ctx, + ctx->gsvs_ring[stream], out_val, 1, voffset, soffset, 0, V_008F0C_BUF_DATA_FORMAT_32, @@ -3290,14 +3699,13 @@ static void si_llvm_emit_vertex( gs_next_vertex = lp_build_add(uint, gs_next_vertex, lp_build_const_int32(gallivm, 1)); - LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex[stream]); + LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]); /* Signal vertex emission */ args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8)); - args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); + args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", - LLVMVoidTypeInContext(gallivm->context), args, 2, - LLVMNoUnwindAttribute); + ctx->voidt, args, 2, LLVMNoUnwindAttribute); } /* Cut one primitive from the geometry shader */ @@ -3306,7 +3714,7 @@ static void si_llvm_emit_primitive( struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data) { - struct si_shader_context *si_shader_ctx = si_shader_context(bld_base); + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef args[2]; unsigned stream; @@ -3314,21 +3722,22 @@ static void si_llvm_emit_primitive( /* Signal primitive cut */ stream = si_llvm_get_stream(bld_base, emit_data); args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8)); - args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); + args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID); lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", - LLVMVoidTypeInContext(gallivm->context), args, 2, - LLVMNoUnwindAttribute); + ctx->voidt, args, 2, LLVMNoUnwindAttribute); } static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data) { + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; - lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.barrier.local", - LLVMVoidTypeInContext(gallivm->context), NULL, 0, - LLVMNoUnwindAttribute); + lp_build_intrinsic(gallivm->builder, + HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier" + : "llvm.AMDGPU.barrier.local", + ctx->voidt, NULL, 0, LLVMNoUnwindAttribute); } static const struct lp_build_tgsi_action tex_action = { @@ -3341,25 +3750,43 @@ static const struct lp_build_tgsi_action interp_action = { .emit = build_interp_intrinsic, }; -static void create_meta_data(struct si_shader_context *si_shader_ctx) +static void si_create_function(struct si_shader_context *ctx, + LLVMTypeRef *returns, unsigned num_returns, + LLVMTypeRef *params, unsigned num_params, + int last_array_pointer, int last_sgpr) { - struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm; + int i; + + radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns, + params, num_params); + radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type); + ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type); + + for (i = 0; i <= last_sgpr; ++i) { + LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i); + + /* We tell llvm that array inputs are passed by value to allow Sinking pass + * to move load. Inputs are constant so this is fine. */ + if (i <= last_array_pointer) + LLVMAddAttribute(P, LLVMByValAttribute); + else + LLVMAddAttribute(P, LLVMInRegAttribute); + } +} + +static void create_meta_data(struct si_shader_context *ctx) +{ + struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm; LLVMValueRef args[3]; args[0] = LLVMMDStringInContext(gallivm->context, "const", 5); args[1] = 0; args[2] = lp_build_const_int32(gallivm, 1); - si_shader_ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3); -} - -static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) -{ - return LLVMPointerType(LLVMArrayType(elem_type, num_elements), - CONST_ADDR_SPACE); + ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3); } -static void declare_streamout_params(struct si_shader_context *si_shader_ctx, +static void declare_streamout_params(struct si_shader_context *ctx, struct pipe_stream_output_info *so, LLVMTypeRef *params, LLVMTypeRef i32, unsigned *num_params) @@ -3368,149 +3795,243 @@ static void declare_streamout_params(struct si_shader_context *si_shader_ctx, /* Streamout SGPRs. */ if (so->num_outputs) { - params[si_shader_ctx->param_streamout_config = (*num_params)++] = i32; - params[si_shader_ctx->param_streamout_write_index = (*num_params)++] = i32; + params[ctx->param_streamout_config = (*num_params)++] = i32; + params[ctx->param_streamout_write_index = (*num_params)++] = i32; } /* A streamout buffer offset is loaded if the stride is non-zero. */ for (i = 0; i < 4; i++) { if (!so->stride[i]) continue; - params[si_shader_ctx->param_streamout_offset[i] = (*num_params)++] = i32; + params[ctx->param_streamout_offset[i] = (*num_params)++] = i32; + } +} + +static unsigned llvm_get_type_size(LLVMTypeRef type) +{ + LLVMTypeKind kind = LLVMGetTypeKind(type); + + switch (kind) { + case LLVMIntegerTypeKind: + return LLVMGetIntTypeWidth(type) / 8; + case LLVMFloatTypeKind: + return 4; + case LLVMPointerTypeKind: + return 8; + case LLVMVectorTypeKind: + return LLVMGetVectorSize(type) * + llvm_get_type_size(LLVMGetElementType(type)); + default: + assert(0); + return 0; } } -static void create_function(struct si_shader_context *si_shader_ctx) +static void declare_tess_lds(struct si_shader_context *ctx) +{ + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type; + + /* This is the upper bound, maximum is 32 inputs times 32 vertices */ + unsigned vertex_data_dw_size = 32*32*4; + unsigned patch_data_dw_size = 32*4; + /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */ + unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size; + unsigned lds_dwords = patch_dw_size; + + /* The actual size is computed outside of the shader to reduce + * the number of shader variants. */ + ctx->lds = + LLVMAddGlobalInAddressSpace(gallivm->module, + LLVMArrayType(i32, lds_dwords), + "tess_lds", + LOCAL_ADDR_SPACE); +} + +static void create_function(struct si_shader_context *ctx) { - struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; + struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; struct gallivm_state *gallivm = bld_base->base.gallivm; - struct si_shader *shader = si_shader_ctx->shader; - LLVMTypeRef params[SI_NUM_PARAMS], f32, i8, i32, v2i32, v3i32, v16i8, v4i32, v8i32; - unsigned i, last_array_pointer, last_sgpr, num_params; - - i8 = LLVMInt8TypeInContext(gallivm->context); - i32 = LLVMInt32TypeInContext(gallivm->context); - f32 = LLVMFloatTypeInContext(gallivm->context); - v2i32 = LLVMVectorType(i32, 2); - v3i32 = LLVMVectorType(i32, 3); - v4i32 = LLVMVectorType(i32, 4); - v8i32 = LLVMVectorType(i32, 8); - v16i8 = LLVMVectorType(i8, 16); - - params[SI_PARAM_RW_BUFFERS] = const_array(v16i8, SI_NUM_RW_BUFFERS); - params[SI_PARAM_CONST_BUFFERS] = const_array(v16i8, SI_NUM_CONST_BUFFERS); - params[SI_PARAM_SAMPLER_STATES] = const_array(v4i32, SI_NUM_SAMPLER_STATES); - params[SI_PARAM_SAMPLER_VIEWS] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS); - last_array_pointer = SI_PARAM_SAMPLER_VIEWS; - - switch (si_shader_ctx->type) { + struct si_shader *shader = ctx->shader; + LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32; + LLVMTypeRef returns[16+32*4]; + unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs; + unsigned num_returns = 0; + + v3i32 = LLVMVectorType(ctx->i32, 3); + + params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS); + params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS); + params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS); + params[SI_PARAM_UNUSED] = LLVMPointerType(ctx->i32, CONST_ADDR_SPACE); + last_array_pointer = SI_PARAM_UNUSED; + + switch (ctx->type) { case TGSI_PROCESSOR_VERTEX: - params[SI_PARAM_VERTEX_BUFFERS] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS); + params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS); last_array_pointer = SI_PARAM_VERTEX_BUFFERS; - params[SI_PARAM_BASE_VERTEX] = i32; - params[SI_PARAM_START_INSTANCE] = i32; + params[SI_PARAM_BASE_VERTEX] = ctx->i32; + params[SI_PARAM_START_INSTANCE] = ctx->i32; num_params = SI_PARAM_START_INSTANCE+1; if (shader->key.vs.as_es) { - params[si_shader_ctx->param_es2gs_offset = num_params++] = i32; + params[ctx->param_es2gs_offset = num_params++] = ctx->i32; } else if (shader->key.vs.as_ls) { - params[SI_PARAM_LS_OUT_LAYOUT] = i32; + params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32; num_params = SI_PARAM_LS_OUT_LAYOUT+1; } else { - if (shader->is_gs_copy_shader) { + if (ctx->is_gs_copy_shader) { last_array_pointer = SI_PARAM_CONST_BUFFERS; num_params = SI_PARAM_CONST_BUFFERS+1; } else { - params[SI_PARAM_VS_STATE_BITS] = i32; + params[SI_PARAM_VS_STATE_BITS] = ctx->i32; num_params = SI_PARAM_VS_STATE_BITS+1; } /* The locations of the other parameters are assigned dynamically. */ - declare_streamout_params(si_shader_ctx, &shader->selector->so, - params, i32, &num_params); + declare_streamout_params(ctx, &shader->selector->so, + params, ctx->i32, &num_params); } last_sgpr = num_params-1; /* VGPRs */ - params[si_shader_ctx->param_vertex_id = num_params++] = i32; - params[si_shader_ctx->param_rel_auto_id = num_params++] = i32; - params[si_shader_ctx->param_vs_prim_id = num_params++] = i32; - params[si_shader_ctx->param_instance_id = num_params++] = i32; + params[ctx->param_vertex_id = num_params++] = ctx->i32; + params[ctx->param_rel_auto_id = num_params++] = ctx->i32; + params[ctx->param_vs_prim_id = num_params++] = ctx->i32; + params[ctx->param_instance_id = num_params++] = ctx->i32; + + if (!ctx->is_monolithic && + !ctx->is_gs_copy_shader) { + /* Vertex load indices. */ + ctx->param_vertex_index0 = num_params; + + for (i = 0; i < shader->selector->info.num_inputs; i++) + params[num_params++] = ctx->i32; + + /* PrimitiveID output. */ + if (!shader->key.vs.as_es && !shader->key.vs.as_ls) + for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) + returns[num_returns++] = ctx->f32; + } break; case TGSI_PROCESSOR_TESS_CTRL: - params[SI_PARAM_TCS_OUT_OFFSETS] = i32; - params[SI_PARAM_TCS_OUT_LAYOUT] = i32; - params[SI_PARAM_TCS_IN_LAYOUT] = i32; - params[SI_PARAM_TESS_FACTOR_OFFSET] = i32; + params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32; + params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32; + params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32; + params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32; last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; /* VGPRs */ - params[SI_PARAM_PATCH_ID] = i32; - params[SI_PARAM_REL_IDS] = i32; + params[SI_PARAM_PATCH_ID] = ctx->i32; + params[SI_PARAM_REL_IDS] = ctx->i32; num_params = SI_PARAM_REL_IDS+1; + + if (!ctx->is_monolithic) { + /* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */ + for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++) + returns[num_returns++] = ctx->i32; /* SGPRs */ + + for (i = 0; i < 3; i++) + returns[num_returns++] = ctx->f32; /* VGPRs */ + } break; case TGSI_PROCESSOR_TESS_EVAL: - params[SI_PARAM_TCS_OUT_OFFSETS] = i32; - params[SI_PARAM_TCS_OUT_LAYOUT] = i32; + params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32; + params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32; num_params = SI_PARAM_TCS_OUT_LAYOUT+1; if (shader->key.tes.as_es) { - params[si_shader_ctx->param_es2gs_offset = num_params++] = i32; + params[ctx->param_es2gs_offset = num_params++] = ctx->i32; } else { - declare_streamout_params(si_shader_ctx, &shader->selector->so, - params, i32, &num_params); + declare_streamout_params(ctx, &shader->selector->so, + params, ctx->i32, &num_params); } last_sgpr = num_params - 1; /* VGPRs */ - params[si_shader_ctx->param_tes_u = num_params++] = f32; - params[si_shader_ctx->param_tes_v = num_params++] = f32; - params[si_shader_ctx->param_tes_rel_patch_id = num_params++] = i32; - params[si_shader_ctx->param_tes_patch_id = num_params++] = i32; + params[ctx->param_tes_u = num_params++] = ctx->f32; + params[ctx->param_tes_v = num_params++] = ctx->f32; + params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32; + params[ctx->param_tes_patch_id = num_params++] = ctx->i32; + + /* PrimitiveID output. */ + if (!ctx->is_monolithic && !shader->key.tes.as_es) + for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) + returns[num_returns++] = ctx->f32; break; case TGSI_PROCESSOR_GEOMETRY: - params[SI_PARAM_GS2VS_OFFSET] = i32; - params[SI_PARAM_GS_WAVE_ID] = i32; + params[SI_PARAM_GS2VS_OFFSET] = ctx->i32; + params[SI_PARAM_GS_WAVE_ID] = ctx->i32; last_sgpr = SI_PARAM_GS_WAVE_ID; /* VGPRs */ - params[SI_PARAM_VTX0_OFFSET] = i32; - params[SI_PARAM_VTX1_OFFSET] = i32; - params[SI_PARAM_PRIMITIVE_ID] = i32; - params[SI_PARAM_VTX2_OFFSET] = i32; - params[SI_PARAM_VTX3_OFFSET] = i32; - params[SI_PARAM_VTX4_OFFSET] = i32; - params[SI_PARAM_VTX5_OFFSET] = i32; - params[SI_PARAM_GS_INSTANCE_ID] = i32; + params[SI_PARAM_VTX0_OFFSET] = ctx->i32; + params[SI_PARAM_VTX1_OFFSET] = ctx->i32; + params[SI_PARAM_PRIMITIVE_ID] = ctx->i32; + params[SI_PARAM_VTX2_OFFSET] = ctx->i32; + params[SI_PARAM_VTX3_OFFSET] = ctx->i32; + params[SI_PARAM_VTX4_OFFSET] = ctx->i32; + params[SI_PARAM_VTX5_OFFSET] = ctx->i32; + params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32; num_params = SI_PARAM_GS_INSTANCE_ID+1; break; case TGSI_PROCESSOR_FRAGMENT: - params[SI_PARAM_ALPHA_REF] = f32; - params[SI_PARAM_PS_STATE_BITS] = i32; - params[SI_PARAM_PRIM_MASK] = i32; + params[SI_PARAM_ALPHA_REF] = ctx->f32; + params[SI_PARAM_PRIM_MASK] = ctx->i32; last_sgpr = SI_PARAM_PRIM_MASK; - params[SI_PARAM_PERSP_SAMPLE] = v2i32; - params[SI_PARAM_PERSP_CENTER] = v2i32; - params[SI_PARAM_PERSP_CENTROID] = v2i32; + params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32; + params[SI_PARAM_PERSP_CENTER] = ctx->v2i32; + params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32; params[SI_PARAM_PERSP_PULL_MODEL] = v3i32; - params[SI_PARAM_LINEAR_SAMPLE] = v2i32; - params[SI_PARAM_LINEAR_CENTER] = v2i32; - params[SI_PARAM_LINEAR_CENTROID] = v2i32; - params[SI_PARAM_LINE_STIPPLE_TEX] = f32; - params[SI_PARAM_POS_X_FLOAT] = f32; - params[SI_PARAM_POS_Y_FLOAT] = f32; - params[SI_PARAM_POS_Z_FLOAT] = f32; - params[SI_PARAM_POS_W_FLOAT] = f32; - params[SI_PARAM_FRONT_FACE] = f32; - params[SI_PARAM_ANCILLARY] = i32; - params[SI_PARAM_SAMPLE_COVERAGE] = f32; - params[SI_PARAM_POS_FIXED_PT] = f32; + params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32; + params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32; + params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32; + params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32; + params[SI_PARAM_POS_X_FLOAT] = ctx->f32; + params[SI_PARAM_POS_Y_FLOAT] = ctx->f32; + params[SI_PARAM_POS_Z_FLOAT] = ctx->f32; + params[SI_PARAM_POS_W_FLOAT] = ctx->f32; + params[SI_PARAM_FRONT_FACE] = ctx->i32; + params[SI_PARAM_ANCILLARY] = ctx->i32; + params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32; + params[SI_PARAM_POS_FIXED_PT] = ctx->i32; num_params = SI_PARAM_POS_FIXED_PT+1; + + if (!ctx->is_monolithic) { + /* Color inputs from the prolog. */ + if (shader->selector->info.colors_read) { + unsigned num_color_elements = + util_bitcount(shader->selector->info.colors_read); + + assert(num_params + num_color_elements <= ARRAY_SIZE(params)); + for (i = 0; i < num_color_elements; i++) + params[num_params++] = ctx->f32; + } + + /* Outputs for the epilog. */ + num_return_sgprs = SI_SGPR_ALPHA_REF + 1; + num_returns = + num_return_sgprs + + util_bitcount(shader->selector->info.colors_written) * 4 + + shader->selector->info.writes_z + + shader->selector->info.writes_stencil + + shader->selector->info.writes_samplemask + + 1 /* SampleMaskIn */; + + num_returns = MAX2(num_returns, + num_return_sgprs + + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + for (i = 0; i < num_return_sgprs; i++) + returns[i] = ctx->i32; + for (; i < num_returns; i++) + returns[i] = ctx->f32; + } break; default: @@ -3519,23 +4040,37 @@ static void create_function(struct si_shader_context *si_shader_ctx) } assert(num_params <= Elements(params)); - radeon_llvm_create_func(&si_shader_ctx->radeon_bld, params, num_params); - radeon_llvm_shader_type(si_shader_ctx->radeon_bld.main_fn, si_shader_ctx->type); - if (shader->dx10_clamp_mode) - LLVMAddTargetDependentFunctionAttr(si_shader_ctx->radeon_bld.main_fn, - "enable-no-nans-fp-math", "true"); + si_create_function(ctx, returns, num_returns, params, + num_params, last_array_pointer, last_sgpr); + + /* Reserve register locations for VGPR inputs the PS prolog may need. */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT && + !ctx->is_monolithic) { + radeon_llvm_add_attribute(ctx->radeon_bld.main_fn, + "InitialPSInputAddr", + S_0286D0_PERSP_SAMPLE_ENA(1) | + S_0286D0_PERSP_CENTER_ENA(1) | + S_0286D0_PERSP_CENTROID_ENA(1) | + S_0286D0_LINEAR_SAMPLE_ENA(1) | + S_0286D0_LINEAR_CENTER_ENA(1) | + S_0286D0_LINEAR_CENTROID_ENA(1) | + S_0286D0_FRONT_FACE_ENA(1) | + S_0286D0_POS_FIXED_PT_ENA(1)); + } - for (i = 0; i <= last_sgpr; ++i) { - LLVMValueRef P = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, i); + shader->num_input_sgprs = 0; + shader->num_input_vgprs = 0; - /* We tell llvm that array inputs are passed by value to allow Sinking pass - * to move load. Inputs are constant so this is fine. */ - if (i <= last_array_pointer) - LLVMAddAttribute(P, LLVMByValAttribute); - else - LLVMAddAttribute(P, LLVMInRegAttribute); - } + for (i = 0; i <= last_sgpr; ++i) + shader->num_input_sgprs += llvm_get_type_size(params[i]) / 4; + + /* Unused fragment shader inputs are eliminated by the compiler, + * so we don't know yet how many there will be. + */ + if (ctx->type != TGSI_PROCESSOR_FRAGMENT) + for (; i < num_params; ++i) + shader->num_input_vgprs += llvm_get_type_size(params[i]) / 4; if (bld_base->info && (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 || @@ -3544,39 +4079,25 @@ static void create_function(struct si_shader_context *si_shader_ctx) bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 || bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 || bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0)) - si_shader_ctx->lds = + ctx->lds = LLVMAddGlobalInAddressSpace(gallivm->module, - LLVMArrayType(i32, 64), + LLVMArrayType(ctx->i32, 64), "ddxy_lds", LOCAL_ADDR_SPACE); - if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) || - si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL || - si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL) { - /* This is the upper bound, maximum is 32 inputs times 32 vertices */ - unsigned vertex_data_dw_size = 32*32*4; - unsigned patch_data_dw_size = 32*4; - /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */ - unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size; - unsigned lds_dwords = patch_dw_size; - - /* The actual size is computed outside of the shader to reduce - * the number of shader variants. */ - si_shader_ctx->lds = - LLVMAddGlobalInAddressSpace(gallivm->module, - LLVMArrayType(i32, lds_dwords), - "tess_lds", - LOCAL_ADDR_SPACE); - } + if ((ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) || + ctx->type == TGSI_PROCESSOR_TESS_CTRL || + ctx->type == TGSI_PROCESSOR_TESS_EVAL) + declare_tess_lds(ctx); } -static void preload_constants(struct si_shader_context *si_shader_ctx) +static void preload_constants(struct si_shader_context *ctx) { - struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; - struct gallivm_state * gallivm = bld_base->base.gallivm; - const struct tgsi_shader_info * info = bld_base->info; + struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + const struct tgsi_shader_info *info = bld_base->info; unsigned buf; - LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); + LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS); for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) { unsigned i, num_const = info->const_file_max[buf] + 1; @@ -3585,84 +4106,76 @@ static void preload_constants(struct si_shader_context *si_shader_ctx) continue; /* Allocate space for the constant values */ - si_shader_ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef)); + ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef)); /* Load the resource descriptor */ - si_shader_ctx->const_buffers[buf] = - build_indexed_load_const(si_shader_ctx, ptr, lp_build_const_int32(gallivm, buf)); + ctx->const_buffers[buf] = + build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf)); /* Load the constants, we rely on the code sinking to do the rest */ for (i = 0; i < num_const * 4; ++i) { - si_shader_ctx->constants[buf][i] = + ctx->constants[buf][i] = buffer_load_const(gallivm->builder, - si_shader_ctx->const_buffers[buf], + ctx->const_buffers[buf], lp_build_const_int32(gallivm, i * 4), - bld_base->base.elem_type); + ctx->f32); } } } -static void preload_samplers(struct si_shader_context *si_shader_ctx) +static void preload_samplers(struct si_shader_context *ctx) { - struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; - struct gallivm_state * gallivm = bld_base->base.gallivm; - const struct tgsi_shader_info * info = bld_base->info; - + struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + const struct tgsi_shader_info *info = bld_base->info; unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1; - - LLVMValueRef res_ptr, samp_ptr; LLVMValueRef offset; if (num_samplers == 0) return; - res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_VIEWS); - samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER_STATES); - /* Load the resources and samplers, we rely on the code sinking to do the rest */ for (i = 0; i < num_samplers; ++i) { /* Resource */ offset = lp_build_const_int32(gallivm, i); - si_shader_ctx->sampler_views[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset); - - /* Sampler */ - offset = lp_build_const_int32(gallivm, i); - si_shader_ctx->sampler_states[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset); + ctx->sampler_views[i] = + get_sampler_desc(ctx, offset, DESC_IMAGE); /* FMASK resource */ - if (info->is_msaa_sampler[i]) { - offset = lp_build_const_int32(gallivm, SI_FMASK_TEX_OFFSET + i); - si_shader_ctx->sampler_views[SI_FMASK_TEX_OFFSET + i] = - build_indexed_load_const(si_shader_ctx, res_ptr, offset); - } + if (info->is_msaa_sampler[i]) + ctx->fmasks[i] = + get_sampler_desc(ctx, offset, DESC_FMASK); + else + ctx->sampler_states[i] = + get_sampler_desc(ctx, offset, DESC_SAMPLER); } } -static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx) +static void preload_streamout_buffers(struct si_shader_context *ctx) { - struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; - struct gallivm_state * gallivm = bld_base->base.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; unsigned i; /* Streamout can only be used if the shader is compiled as VS. */ - if (!si_shader_ctx->shader->selector->so.num_outputs || - (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && - (si_shader_ctx->shader->key.vs.as_es || - si_shader_ctx->shader->key.vs.as_ls)) || - (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL && - si_shader_ctx->shader->key.tes.as_es)) + if (!ctx->shader->selector->so.num_outputs || + (ctx->type == TGSI_PROCESSOR_VERTEX && + (ctx->shader->key.vs.as_es || + ctx->shader->key.vs.as_ls)) || + (ctx->type == TGSI_PROCESSOR_TESS_EVAL && + ctx->shader->key.tes.as_es)) return; - LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS); /* Load the resources, we rely on the code sinking to do the rest */ for (i = 0; i < 4; ++i) { - if (si_shader_ctx->shader->selector->so.stride[i]) { + if (ctx->shader->selector->so.stride[i]) { LLVMValueRef offset = lp_build_const_int32(gallivm, SI_SO_BUF_OFFSET + i); - si_shader_ctx->so_buffers[i] = build_indexed_load_const(si_shader_ctx, buf_ptr, offset); + ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset); } } } @@ -3671,42 +4184,85 @@ static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx) * Load ESGS and GSVS ring buffer resource descriptors and save the variables * for later use. */ -static void preload_ring_buffers(struct si_shader_context *si_shader_ctx) +static void preload_ring_buffers(struct si_shader_context *ctx) { struct gallivm_state *gallivm = - si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm; + ctx->radeon_bld.soa.bld_base.base.gallivm; - LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS); - if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && - si_shader_ctx->shader->key.vs.as_es) || - (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL && - si_shader_ctx->shader->key.tes.as_es) || - si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) { + if ((ctx->type == TGSI_PROCESSOR_VERTEX && + ctx->shader->key.vs.as_es) || + (ctx->type == TGSI_PROCESSOR_TESS_EVAL && + ctx->shader->key.tes.as_es) || + ctx->type == TGSI_PROCESSOR_GEOMETRY) { LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_ESGS); - si_shader_ctx->esgs_ring = - build_indexed_load_const(si_shader_ctx, buf_ptr, offset); + ctx->esgs_ring = + build_indexed_load_const(ctx, buf_ptr, offset); } - if (si_shader_ctx->shader->is_gs_copy_shader) { + if (ctx->is_gs_copy_shader) { LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS); - si_shader_ctx->gsvs_ring[0] = - build_indexed_load_const(si_shader_ctx, buf_ptr, offset); + ctx->gsvs_ring[0] = + build_indexed_load_const(ctx, buf_ptr, offset); } - if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) { + if (ctx->type == TGSI_PROCESSOR_GEOMETRY) { int i; for (i = 0; i < 4; i++) { LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS + i); - si_shader_ctx->gsvs_ring[i] = - build_indexed_load_const(si_shader_ctx, buf_ptr, offset); + ctx->gsvs_ring[i] = + build_indexed_load_const(ctx, buf_ptr, offset); } } } +static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, + LLVMValueRef param_sampler_views, + unsigned param_pos_fixed_pt) +{ + struct lp_build_tgsi_context *bld_base = + &ctx->radeon_bld.soa.bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_emit_data result = {}; + struct tgsi_full_instruction inst = {}; + LLVMValueRef desc, sampler_index, address[2], pix; + + /* Use the fixed-point gl_FragCoord input. + * Since the stipple pattern is 32x32 and it repeats, just get 5 bits + * per coordinate to get the repeating effect. + */ + address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5); + address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5); + + /* Load the sampler view descriptor. */ + sampler_index = lp_build_const_int32(gallivm, SI_POLY_STIPPLE_SAMPLER); + desc = get_sampler_desc_custom(ctx, param_sampler_views, + sampler_index, DESC_IMAGE); + + /* Load the texel. */ + inst.Instruction.Opcode = TGSI_OPCODE_TXF; + inst.Texture.Texture = TGSI_TEXTURE_2D_MSAA; /* = use load, not load_mip */ + result.inst = &inst; + set_tex_fetch_args(ctx, &result, TGSI_OPCODE_TXF, + inst.Texture.Texture, + desc, NULL, address, ARRAY_SIZE(address), 0xf); + build_tex_intrinsic(&tex_action, bld_base, &result); + + /* Kill the thread accordingly. */ + pix = LLVMBuildExtractElement(gallivm->builder, result.output[0], + lp_build_const_int32(gallivm, 3), ""); + pix = bitcast(bld_base, TGSI_TYPE_FLOAT, pix); + pix = LLVMBuildFNeg(gallivm->builder, pix, ""); + + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", + LLVMVoidTypeInContext(gallivm->context), + &pix, 1, 0); +} + void si_shader_binary_read_config(struct radeon_shader_binary *binary, struct si_shader_config *conf, unsigned symbol_offset) @@ -3742,6 +4298,9 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary, case R_0286CC_SPI_PS_INPUT_ENA: conf->spi_ps_input_ena = value; break; + case R_0286D0_SPI_PS_INPUT_ADDR: + conf->spi_ps_input_addr = value; + break; case R_0286E8_SPI_TMPRING_SIZE: case R_00B860_COMPUTE_TMPRING_SIZE: /* WAVESIZE is in units of 256 dwords. */ @@ -3749,10 +4308,20 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary, G_00B860_WAVESIZE(value) * 256 * 4 * 1; break; default: - fprintf(stderr, "Warning: Compiler emitted unknown " - "config register: 0x%x\n", reg); + { + static bool printed; + + if (!printed) { + fprintf(stderr, "Warning: LLVM emitted unknown " + "config register: 0x%x\n", reg); + printed = true; + } + } break; } + + if (!conf->spi_ps_input_addr) + conf->spi_ps_input_addr = conf->spi_ps_input_ena; } } @@ -3779,41 +4348,70 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx, } } +static unsigned si_get_shader_binary_size(struct si_shader *shader) +{ + unsigned size = shader->binary.code_size; + + if (shader->prolog) + size += shader->prolog->binary.code_size; + if (shader->epilog) + size += shader->epilog->binary.code_size; + return size; +} + int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) { - const struct radeon_shader_binary *binary = &shader->binary; - unsigned code_size = binary->code_size + binary->rodata_size; + const struct radeon_shader_binary *prolog = + shader->prolog ? &shader->prolog->binary : NULL; + const struct radeon_shader_binary *epilog = + shader->epilog ? &shader->epilog->binary : NULL; + const struct radeon_shader_binary *mainb = &shader->binary; + unsigned bo_size = si_get_shader_binary_size(shader) + + (!epilog ? mainb->rodata_size : 0); unsigned char *ptr; + assert(!prolog || !prolog->rodata_size); + assert((!prolog && !epilog) || !mainb->rodata_size); + assert(!epilog || !epilog->rodata_size); + r600_resource_reference(&shader->bo, NULL); shader->bo = si_resource_create_custom(&sscreen->b.b, PIPE_USAGE_IMMUTABLE, - code_size); + bo_size); if (!shader->bo) return -ENOMEM; + /* Upload. */ ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL, PIPE_TRANSFER_READ_WRITE); - util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size); - if (binary->rodata_size > 0) { - ptr += binary->code_size; - util_memcpy_cpu_to_le32(ptr, binary->rodata, - binary->rodata_size); + + if (prolog) { + util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size); + ptr += prolog->code_size; } + util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size); + ptr += mainb->code_size; + + if (epilog) + util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size); + else if (mainb->rodata_size > 0) + util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size); + sscreen->b.ws->buffer_unmap(shader->bo->buf); return 0; } static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary, - struct pipe_debug_callback *debug) + struct pipe_debug_callback *debug, + const char *name) { char *line, *p; unsigned i, count; if (binary->disasm_string) { - fprintf(stderr, "\nShader Disassembly:\n\n"); - fprintf(stderr, "%s\n", binary->disasm_string); + fprintf(stderr, "Shader %s disassembly:\n", name); + fprintf(stderr, "%s", binary->disasm_string); if (debug && debug->debug_message) { /* Very long debug messages are cut off, so send the @@ -3843,7 +4441,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary "Shader Disassembly End"); } } else { - fprintf(stderr, "SI CODE:\n"); + fprintf(stderr, "Shader %s binary:\n", name); for (i = 0; i < binary->code_size; i += 4) { fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3], binary->code[i + 2], @@ -3854,33 +4452,128 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader_config *conf, + unsigned num_inputs, unsigned code_size, struct pipe_debug_callback *debug, unsigned processor) { + unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256; + unsigned lds_per_wave = 0; + unsigned max_simd_waves = 10; + + /* Compute LDS usage for PS. */ + if (processor == TGSI_PROCESSOR_FRAGMENT) { + /* The minimum usage per wave is (num_inputs * 36). The maximum + * usage is (num_inputs * 36 * 16). + * We can get anything in between and it varies between waves. + * + * Other stages don't know the size at compile time or don't + * allocate LDS per wave, but instead they do it per thread group. + */ + lds_per_wave = conf->lds_size * lds_increment + + align(num_inputs * 36, lds_increment); + } + + /* Compute the per-SIMD wave counts. */ + if (conf->num_sgprs) { + if (sscreen->b.chip_class >= VI) + max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs); + else + max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs); + } + + if (conf->num_vgprs) + max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs); + + /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD + * that PS can use. + */ + if (lds_per_wave) + max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); + if (r600_can_dump_shader(&sscreen->b, processor)) { + if (processor == TGSI_PROCESSOR_FRAGMENT) { + fprintf(stderr, "*** SHADER CONFIG ***\n" + "SPI_PS_INPUT_ADDR = 0x%04x\n" + "SPI_PS_INPUT_ENA = 0x%04x\n", + conf->spi_ps_input_addr, conf->spi_ps_input_ena); + } + fprintf(stderr, "*** SHADER STATS ***\n" - "SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n" - "Scratch: %d bytes per wave\n********************\n", + "SGPRS: %d\n" + "VGPRS: %d\n" + "Code Size: %d bytes\n" + "LDS: %d blocks\n" + "Scratch: %d bytes per wave\n" + "Max Waves: %d\n" + "********************\n", conf->num_sgprs, conf->num_vgprs, code_size, - conf->lds_size, conf->scratch_bytes_per_wave); + conf->lds_size, conf->scratch_bytes_per_wave, + max_simd_waves); } pipe_debug_message(debug, SHADER_INFO, - "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d", + "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " + "LDS: %d Scratch: %d Max Waves: %d", conf->num_sgprs, conf->num_vgprs, code_size, - conf->lds_size, conf->scratch_bytes_per_wave); + conf->lds_size, conf->scratch_bytes_per_wave, + max_simd_waves); +} + +static const char *si_get_shader_name(struct si_shader *shader, + unsigned processor) +{ + switch (processor) { + case TGSI_PROCESSOR_VERTEX: + if (shader->key.vs.as_es) + return "Vertex Shader as ES"; + else if (shader->key.vs.as_ls) + return "Vertex Shader as LS"; + else + return "Vertex Shader as VS"; + case TGSI_PROCESSOR_TESS_CTRL: + return "Tessellation Control Shader"; + case TGSI_PROCESSOR_TESS_EVAL: + if (shader->key.tes.as_es) + return "Tessellation Evaluation Shader as ES"; + else + return "Tessellation Evaluation Shader as VS"; + case TGSI_PROCESSOR_GEOMETRY: + if (shader->gs_copy_shader == NULL) + return "GS Copy Shader as VS"; + else + return "Geometry Shader"; + case TGSI_PROCESSOR_FRAGMENT: + return "Pixel Shader"; + case TGSI_PROCESSOR_COMPUTE: + return "Compute Shader"; + default: + return "Unknown Shader"; + } } void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, struct pipe_debug_callback *debug, unsigned processor) { - if (r600_can_dump_shader(&sscreen->b, processor)) - if (!(sscreen->b.debug_flags & DBG_NO_ASM)) - si_shader_dump_disassembly(&shader->binary, debug); + if (r600_can_dump_shader(&sscreen->b, processor) && + !(sscreen->b.debug_flags & DBG_NO_ASM)) { + fprintf(stderr, "\n%s:\n", si_get_shader_name(shader, processor)); + + if (shader->prolog) + si_shader_dump_disassembly(&shader->prolog->binary, + debug, "prolog"); + + si_shader_dump_disassembly(&shader->binary, debug, "main"); + + if (shader->epilog) + si_shader_dump_disassembly(&shader->epilog->binary, + debug, "epilog"); + fprintf(stderr, "\n"); + } si_shader_dump_stats(sscreen, &shader->config, - shader->binary.code_size, debug, processor); + shader->selector ? shader->selector->info.num_inputs : 0, + si_get_shader_binary_size(shader), debug, processor); } int si_compile_llvm(struct si_screen *sscreen, @@ -3889,7 +4582,8 @@ int si_compile_llvm(struct si_screen *sscreen, LLVMTargetMachineRef tm, LLVMModuleRef mod, struct pipe_debug_callback *debug, - unsigned processor) + unsigned processor, + const char *name) { int r = 0; unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations); @@ -3897,8 +4591,11 @@ int si_compile_llvm(struct si_screen *sscreen, if (r600_can_dump_shader(&sscreen->b, processor)) { fprintf(stderr, "radeonsi: Compiling shader %d\n", count); - if (!(sscreen->b.debug_flags & DBG_NO_IR)) + if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) { + fprintf(stderr, "%s LLVM IR:\n\n", name); LLVMDumpModule(mod); + fprintf(stderr, "\n"); + } } if (!si_replace_shader(count, binary)) { @@ -3911,24 +4608,49 @@ int si_compile_llvm(struct si_screen *sscreen, si_shader_binary_read_config(binary, conf, 0); + /* Enable 64-bit and 16-bit denormals, because there is no performance + * cost. + * + * If denormals are enabled, all floating-point output modifiers are + * ignored. + * + * Don't enable denormals for 32-bit floats, because: + * - Floating-point output modifiers would be ignored by the hw. + * - Some opcodes don't support denormals, such as v_mad_f32. We would + * have to stop using those. + * - SI & CI would be very slow. + */ + conf->float_mode |= V_00B028_FP_64_DENORMS; + FREE(binary->config); FREE(binary->global_symbol_offsets); binary->config = NULL; binary->global_symbol_offsets = NULL; + + /* Some shaders can't have rodata because their binaries can be + * concatenated. + */ + if (binary->rodata_size && + (processor == TGSI_PROCESSOR_VERTEX || + processor == TGSI_PROCESSOR_TESS_CTRL || + processor == TGSI_PROCESSOR_TESS_EVAL || + processor == TGSI_PROCESSOR_FRAGMENT)) { + fprintf(stderr, "radeonsi: The shader can't have rodata."); + return -EINVAL; + } + return r; } /* Generate code for the hardware VS shader stage to go with a geometry shader */ static int si_generate_gs_copy_shader(struct si_screen *sscreen, - struct si_shader_context *si_shader_ctx, - struct si_shader *gs, bool dump, + struct si_shader_context *ctx, + struct si_shader *gs, struct pipe_debug_callback *debug) { - struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm; - struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base; - struct lp_build_context *base = &bld_base->base; + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; struct lp_build_context *uint = &bld_base->uint_bld; - struct si_shader *shader = si_shader_ctx->shader; struct si_shader_output_values *outputs; struct tgsi_shader_info *gsinfo = &gs->selector->info; LLVMValueRef args[9]; @@ -3936,20 +4658,19 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0])); - si_shader_ctx->type = TGSI_PROCESSOR_VERTEX; - shader->is_gs_copy_shader = true; - - radeon_llvm_context_init(&si_shader_ctx->radeon_bld); + si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm); + ctx->type = TGSI_PROCESSOR_VERTEX; + ctx->is_gs_copy_shader = true; - create_meta_data(si_shader_ctx); - create_function(si_shader_ctx); - preload_streamout_buffers(si_shader_ctx); - preload_ring_buffers(si_shader_ctx); + create_meta_data(ctx); + create_function(ctx); + preload_streamout_buffers(ctx); + preload_ring_buffers(ctx); - args[0] = si_shader_ctx->gsvs_ring[0]; + args[0] = ctx->gsvs_ring[0]; args[1] = lp_build_mul_imm(uint, - LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, - si_shader_ctx->param_vertex_id), + LLVMGetParam(ctx->radeon_bld.main_fn, + ctx->param_vertex_id), 4); args[3] = uint->zero; args[4] = uint->one; /* OFFEN */ @@ -3974,31 +4695,37 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, LLVMBuildBitCast(gallivm->builder, lp_build_intrinsic(gallivm->builder, "llvm.SI.buffer.load.dword.i32.i32", - LLVMInt32TypeInContext(gallivm->context), - args, 9, + ctx->i32, args, 9, LLVMReadOnlyAttribute | LLVMNoUnwindAttribute), - base->elem_type, ""); + ctx->f32, ""); } } si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs); - radeon_llvm_finalize_module(&si_shader_ctx->radeon_bld); + LLVMBuildRet(gallivm->builder, ctx->return_value); - if (dump) - fprintf(stderr, "Copy Vertex Shader for Geometry Shader:\n\n"); + /* Dump LLVM IR before any optimization passes */ + if (sscreen->b.debug_flags & DBG_PREOPT_IR && + r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY)) + LLVMDumpModule(bld_base->base.gallivm->module); - r = si_compile_llvm(sscreen, &si_shader_ctx->shader->binary, - &si_shader_ctx->shader->config, si_shader_ctx->tm, + radeon_llvm_finalize_module(&ctx->radeon_bld); + + r = si_compile_llvm(sscreen, &ctx->shader->binary, + &ctx->shader->config, ctx->tm, bld_base->base.gallivm->module, - debug, TGSI_PROCESSOR_GEOMETRY); + debug, TGSI_PROCESSOR_GEOMETRY, + "GS Copy Shader"); if (!r) { - si_shader_dump(sscreen, si_shader_ctx->shader, debug, + if (r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY)) + fprintf(stderr, "GS Copy Shader:\n"); + si_shader_dump(sscreen, ctx->shader, debug, TGSI_PROCESSOR_GEOMETRY); - r = si_shader_binary_upload(sscreen, si_shader_ctx->shader); + r = si_shader_binary_upload(sscreen, ctx->shader); } - radeon_llvm_dispose(&si_shader_ctx->radeon_bld); + radeon_llvm_dispose(&ctx->radeon_bld); FREE(outputs); return r; @@ -4013,35 +4740,38 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) switch (shader) { case PIPE_SHADER_VERTEX: fprintf(f, " instance_divisors = {"); - for (i = 0; i < Elements(key->vs.instance_divisors); i++) + for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++) fprintf(f, !i ? "%u" : ", %u", - key->vs.instance_divisors[i]); + key->vs.prolog.instance_divisors[i]); fprintf(f, "}\n"); fprintf(f, " as_es = %u\n", key->vs.as_es); fprintf(f, " as_ls = %u\n", key->vs.as_ls); - fprintf(f, " export_prim_id = %u\n", key->vs.export_prim_id); + fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id); break; case PIPE_SHADER_TESS_CTRL: - fprintf(f, " prim_mode = %u\n", key->tcs.prim_mode); + fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode); break; case PIPE_SHADER_TESS_EVAL: fprintf(f, " as_es = %u\n", key->tes.as_es); - fprintf(f, " export_prim_id = %u\n", key->tes.export_prim_id); + fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id); break; case PIPE_SHADER_GEOMETRY: break; case PIPE_SHADER_FRAGMENT: - fprintf(f, " export_16bpc = 0x%X\n", key->ps.export_16bpc); - fprintf(f, " last_cbuf = %u\n", key->ps.last_cbuf); - fprintf(f, " color_two_side = %u\n", key->ps.color_two_side); - fprintf(f, " alpha_func = %u\n", key->ps.alpha_func); - fprintf(f, " alpha_to_one = %u\n", key->ps.alpha_to_one); - fprintf(f, " poly_stipple = %u\n", key->ps.poly_stipple); - fprintf(f, " clamp_color = %u\n", key->ps.clamp_color); + fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side); + fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple); + fprintf(f, " prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp); + fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format); + fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8); + fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf); + fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func); + fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one); + fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing); + fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color); break; default: @@ -4049,47 +4779,39 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) } } -int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, - struct si_shader *shader, - struct pipe_debug_callback *debug) +static void si_init_shader_ctx(struct si_shader_context *ctx, + struct si_screen *sscreen, + struct si_shader *shader, + LLVMTargetMachineRef tm) { - struct si_shader_selector *sel = shader->selector; - struct tgsi_token *tokens = sel->tokens; - struct si_shader_context si_shader_ctx; - struct lp_build_tgsi_context * bld_base; - struct tgsi_shader_info stipple_shader_info; - LLVMModuleRef mod; - int r = 0; - bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT && - shader->key.ps.poly_stipple; - bool dump = r600_can_dump_shader(&sscreen->b, sel->info.processor); - - if (poly_stipple) { - tokens = util_pstipple_create_fragment_shader(tokens, NULL, - SI_POLY_STIPPLE_SAMPLER, - TGSI_FILE_INPUT); - tgsi_scan_shader(tokens, &stipple_shader_info); - } - - /* Dump TGSI code before doing TGSI->LLVM conversion in case the - * conversion fails. */ - if (dump && !(sscreen->b.debug_flags & DBG_NO_TGSI)) { - si_dump_shader_key(sel->type, &shader->key, stderr); - tgsi_dump(tokens, 0); - si_dump_streamout(&sel->so); - } - - assert(shader->nparam == 0); - - memset(&si_shader_ctx, 0, sizeof(si_shader_ctx)); - radeon_llvm_context_init(&si_shader_ctx.radeon_bld); - bld_base = &si_shader_ctx.radeon_bld.soa.bld_base; - - if (sel->type != PIPE_SHADER_COMPUTE) - shader->dx10_clamp_mode = true; - - shader->uses_instanceid = sel->info.uses_instanceid; - bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info; + struct lp_build_tgsi_context *bld_base; + + memset(ctx, 0, sizeof(*ctx)); + radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--"); + ctx->tm = tm; + ctx->screen = sscreen; + if (shader && shader->selector) + ctx->type = shader->selector->info.processor; + else + ctx->type = -1; + ctx->shader = shader; + + ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context); + ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context); + ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context); + ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context); + ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context); + ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128); + ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context); + ctx->v16i8 = LLVMVectorType(ctx->i8, 16); + ctx->v2i32 = LLVMVectorType(ctx->i32, 2); + ctx->v4i32 = LLVMVectorType(ctx->i32, 4); + ctx->v4f32 = LLVMVectorType(ctx->f32, 4); + ctx->v8i32 = LLVMVectorType(ctx->i32, 8); + + bld_base = &ctx->radeon_bld.soa.bld_base; + if (shader && shader->selector) + bld_base->info = &shader->selector->info; bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action; @@ -4119,22 +4841,44 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive; bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier; - if (HAVE_LLVM >= 0x0306) { - bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32"; - bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32"; + bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem; + bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32"; + bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem; + bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32"; +} + +int si_compile_tgsi_shader(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + bool is_monolithic, + struct pipe_debug_callback *debug) +{ + struct si_shader_selector *sel = shader->selector; + struct si_shader_context ctx; + struct lp_build_tgsi_context *bld_base; + LLVMModuleRef mod; + int r = 0; + + /* Dump TGSI code before doing TGSI->LLVM conversion in case the + * conversion fails. */ + if (r600_can_dump_shader(&sscreen->b, sel->info.processor) && + !(sscreen->b.debug_flags & DBG_NO_TGSI)) { + si_dump_shader_key(sel->type, &shader->key, stderr); + tgsi_dump(sel->tokens, 0); + si_dump_streamout(&sel->so); } - si_shader_ctx.radeon_bld.load_system_value = declare_system_value; - si_shader_ctx.shader = shader; - si_shader_ctx.type = tgsi_get_processor_type(tokens); - si_shader_ctx.screen = sscreen; - si_shader_ctx.tm = tm; + si_init_shader_ctx(&ctx, sscreen, shader, tm); + ctx.is_monolithic = is_monolithic; + + shader->uses_instanceid = sel->info.uses_instanceid; + + bld_base = &ctx.radeon_bld.soa.bld_base; + ctx.radeon_bld.load_system_value = declare_system_value; - switch (si_shader_ctx.type) { + switch (ctx.type) { case TGSI_PROCESSOR_VERTEX: - si_shader_ctx.radeon_bld.load_input = declare_input_vs; + ctx.radeon_bld.load_input = declare_input_vs; if (shader->key.vs.as_ls) bld_base->emit_epilogue = si_llvm_emit_ls_epilogue; else if (shader->key.vs.as_es) @@ -4160,62 +4904,112 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, bld_base->emit_epilogue = si_llvm_emit_gs_epilogue; break; case TGSI_PROCESSOR_FRAGMENT: - si_shader_ctx.radeon_bld.load_input = declare_input_fs; - bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; + ctx.radeon_bld.load_input = declare_input_fs; + if (is_monolithic) + bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; + else + bld_base->emit_epilogue = si_llvm_return_fs_outputs; break; default: assert(!"Unsupported shader type"); return -1; } - create_meta_data(&si_shader_ctx); - create_function(&si_shader_ctx); - preload_constants(&si_shader_ctx); - preload_samplers(&si_shader_ctx); - preload_streamout_buffers(&si_shader_ctx); - preload_ring_buffers(&si_shader_ctx); + create_meta_data(&ctx); + create_function(&ctx); + preload_constants(&ctx); + preload_samplers(&ctx); + preload_streamout_buffers(&ctx); + preload_ring_buffers(&ctx); + + if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT && + shader->key.ps.prolog.poly_stipple) { + LLVMValueRef views = LLVMGetParam(ctx.radeon_bld.main_fn, + SI_PARAM_SAMPLERS); + si_llvm_emit_polygon_stipple(&ctx, views, + SI_PARAM_POS_FIXED_PT); + } - if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) { + if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { int i; for (i = 0; i < 4; i++) { - si_shader_ctx.gs_next_vertex[i] = + ctx.gs_next_vertex[i] = lp_build_alloca(bld_base->base.gallivm, - bld_base->uint_bld.elem_type, ""); + ctx.i32, ""); } } - if (!lp_build_tgsi_llvm(bld_base, tokens)) { + if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) { fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n"); goto out; } - radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld); - + LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value); mod = bld_base->base.gallivm->module; + + /* Dump LLVM IR before any optimization passes */ + if (sscreen->b.debug_flags & DBG_PREOPT_IR && + r600_can_dump_shader(&sscreen->b, ctx.type)) + LLVMDumpModule(mod); + + radeon_llvm_finalize_module(&ctx.radeon_bld); + r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm, - mod, debug, si_shader_ctx.type); + mod, debug, ctx.type, "TGSI shader"); if (r) { fprintf(stderr, "LLVM failed to compile shader\n"); goto out; } - si_shader_dump(sscreen, shader, debug, si_shader_ctx.type); - - r = si_shader_binary_upload(sscreen, shader); - if (r) { - fprintf(stderr, "LLVM failed to upload shader\n"); - goto out; + radeon_llvm_dispose(&ctx.radeon_bld); + + /* Calculate the number of fragment input VGPRs. */ + if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { + shader->num_input_vgprs = 0; + shader->face_vgpr_index = -1; + + if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 2; + if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 3; + if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 2; + if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 1; + if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 1; + if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 1; + if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 1; + if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 1; + if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) { + shader->face_vgpr_index = shader->num_input_vgprs; + shader->num_input_vgprs += 1; + } + if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 1; + if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 1; + if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)) + shader->num_input_vgprs += 1; } - radeon_llvm_dispose(&si_shader_ctx.radeon_bld); - - if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) { + if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { shader->gs_copy_shader = CALLOC_STRUCT(si_shader); shader->gs_copy_shader->selector = shader->selector; - shader->gs_copy_shader->key = shader->key; - si_shader_ctx.shader = shader->gs_copy_shader; - if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx, - shader, dump, debug))) { + ctx.shader = shader->gs_copy_shader; + if ((r = si_generate_gs_copy_shader(sscreen, &ctx, + shader, debug))) { free(shader->gs_copy_shader); shader->gs_copy_shader = NULL; goto out; @@ -4224,18 +5018,966 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, out: for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++) - FREE(si_shader_ctx.constants[i]); - if (poly_stipple) - tgsi_free_tokens(tokens); + FREE(ctx.constants[i]); return r; } -void si_shader_destroy_binary(struct radeon_shader_binary *binary) +/** + * Create, compile and return a shader part (prolog or epilog). + * + * \param sscreen screen + * \param list list of shader parts of the same category + * \param key shader part key + * \param tm LLVM target machine + * \param debug debug callback + * \param compile the callback responsible for compilation + * \return non-NULL on success + */ +static struct si_shader_part * +si_get_shader_part(struct si_screen *sscreen, + struct si_shader_part **list, + union si_shader_part_key *key, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + bool (*compile)(struct si_screen *, + LLVMTargetMachineRef, + struct pipe_debug_callback *, + struct si_shader_part *)) +{ + struct si_shader_part *result; + + pipe_mutex_lock(sscreen->shader_parts_mutex); + + /* Find existing. */ + for (result = *list; result; result = result->next) { + if (memcmp(&result->key, key, sizeof(*key)) == 0) { + pipe_mutex_unlock(sscreen->shader_parts_mutex); + return result; + } + } + + /* Compile a new one. */ + result = CALLOC_STRUCT(si_shader_part); + result->key = *key; + if (!compile(sscreen, tm, debug, result)) { + FREE(result); + pipe_mutex_unlock(sscreen->shader_parts_mutex); + return NULL; + } + + result->next = *list; + *list = result; + pipe_mutex_unlock(sscreen->shader_parts_mutex); + return result; +} + +/** + * Create a vertex shader prolog. + * + * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). + * All inputs are returned unmodified. The vertex load indices are + * stored after them, which will used by the API VS for fetching inputs. + * + * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: + * input_v0, + * input_v1, + * input_v2, + * input_v3, + * (VertexID + BaseVertex), + * (InstanceID + StartInstance), + * (InstanceID / 2 + StartInstance) + */ +static bool si_compile_vs_prolog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + LLVMTypeRef *params, *returns; + LLVMValueRef ret, func; + int last_sgpr, num_params, num_returns, i; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_VERTEX; + ctx.param_vertex_id = key->vs_prolog.num_input_sgprs; + ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3; + + /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ + params = alloca((key->vs_prolog.num_input_sgprs + 4) * + sizeof(LLVMTypeRef)); + returns = alloca((key->vs_prolog.num_input_sgprs + 4 + + key->vs_prolog.last_input + 1) * + sizeof(LLVMTypeRef)); + num_params = 0; + num_returns = 0; + + /* Declare input and output SGPRs. */ + num_params = 0; + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + params[num_params++] = ctx.i32; + returns[num_returns++] = ctx.i32; + } + last_sgpr = num_params - 1; + + /* 4 preloaded VGPRs (outputs must be floats) */ + for (i = 0; i < 4; i++) { + params[num_params++] = ctx.i32; + returns[num_returns++] = ctx.f32; + } + + /* Vertex load indices. */ + for (i = 0; i <= key->vs_prolog.last_input; i++) + returns[num_returns++] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, returns, num_returns, params, + num_params, -1, last_sgpr); + func = ctx.radeon_bld.main_fn; + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx.return_value; + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + } + for (i = num_params - 4; i < num_params; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, ""); + ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + } + + /* Compute vertex load indices from instance divisors. */ + for (i = 0; i <= key->vs_prolog.last_input; i++) { + unsigned divisor = key->vs_prolog.states.instance_divisors[i]; + LLVMValueRef index; + + if (divisor) { + /* InstanceID / Divisor + StartInstance */ + index = get_instance_index_for_fetch(&ctx.radeon_bld, + SI_SGPR_START_INSTANCE, + divisor); + } else { + /* VertexID + BaseVertex */ + index = LLVMBuildAdd(gallivm->builder, + LLVMGetParam(func, ctx.param_vertex_id), + LLVMGetParam(func, SI_SGPR_BASE_VERTEX), ""); + } + + index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, ""); + ret = LLVMBuildInsertValue(gallivm->builder, ret, index, + num_params++, ""); + } + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ret); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Vertex Shader Prolog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Compile the vertex shader epilog. This is also used by the tessellation + * evaluation shader compiled as VS. + * + * The input is PrimitiveID. + * + * If PrimitiveID is required by the pixel shader, export it. + * Otherwise, do nothing. + */ +static bool si_compile_vs_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[5]; + int num_params, i; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, NULL, tm); + ctx.type = TGSI_PROCESSOR_VERTEX; + + /* Declare input VGPRs. */ + num_params = key->vs_epilog.states.export_prim_id ? + (VS_EPILOG_PRIMID_LOC + 1) : 0; + assert(num_params <= ARRAY_SIZE(params)); + + for (i = 0; i < num_params; i++) + params[i] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + -1, -1); + + /* Emit exports. */ + if (key->vs_epilog.states.export_prim_id) { + struct lp_build_context *base = &bld_base->base; + struct lp_build_context *uint = &bld_base->uint_bld; + LLVMValueRef args[9]; + + args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */ + args[1] = uint->zero; /* whether the EXEC mask is valid */ + args[2] = uint->zero; /* DONE bit */ + args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM + + key->vs_epilog.prim_id_param_offset); + args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */ + args[5] = LLVMGetParam(ctx.radeon_bld.main_fn, + VS_EPILOG_PRIMID_LOC); /* X */ + args[6] = uint->undef; /* Y */ + args[7] = uint->undef; /* Z */ + args[8] = uint->undef; /* W */ + + lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", + LLVMVoidTypeInContext(base->gallivm->context), + args, 9, 0); + } + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ctx.return_value); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Vertex Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Create & compile a vertex shader epilog. This a helper used by VS and TES. + */ +static bool si_get_vs_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug, + struct si_vs_epilog_bits *states) +{ + union si_shader_part_key epilog_key; + + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.vs_epilog.states = *states; + + /* Set up the PrimitiveID output. */ + if (shader->key.vs.epilog.export_prim_id) { + unsigned index = shader->selector->info.num_outputs; + unsigned offset = shader->nr_param_exports++; + + epilog_key.vs_epilog.prim_id_param_offset = offset; + shader->vs_output_param_offset[index] = offset; + } + + shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs, + &epilog_key, tm, debug, + si_compile_vs_epilog); + return shader->epilog != NULL; +} + +/** + * Select and compile (or reuse) vertex shader parts (prolog & epilog). + */ +static bool si_shader_select_vs_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + struct tgsi_shader_info *info = &shader->selector->info; + union si_shader_part_key prolog_key; + unsigned i; + + /* Get the prolog. */ + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.vs_prolog.states = shader->key.vs.prolog; + prolog_key.vs_prolog.num_input_sgprs = shader->num_input_sgprs; + prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; + + /* The prolog is a no-op if there are no inputs. */ + if (info->num_inputs) { + shader->prolog = + si_get_shader_part(sscreen, &sscreen->vs_prologs, + &prolog_key, tm, debug, + si_compile_vs_prolog); + if (!shader->prolog) + return false; + } + + /* Get the epilog. */ + if (!shader->key.vs.as_es && !shader->key.vs.as_ls && + !si_get_vs_epilog(sscreen, tm, shader, debug, + &shader->key.vs.epilog)) + return false; + + /* Set the instanceID flag. */ + for (i = 0; i < info->num_inputs; i++) + if (prolog_key.vs_prolog.states.instance_divisors[i]) + shader->uses_instanceid = true; + + return true; +} + +/** + * Select and compile (or reuse) TES parts (epilog). + */ +static bool si_shader_select_tes_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + if (shader->key.tes.as_es) + return true; + + /* TES compiled as VS. */ + return si_get_vs_epilog(sscreen, tm, shader, debug, + &shader->key.tes.epilog); +} + +/** + * Compile the TCS epilog. This writes tesselation factors to memory based on + * the output primitive type of the tesselator (determined by TES). + */ +static bool si_compile_tcs_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[16]; + LLVMValueRef func; + int last_array_pointer, last_sgpr, num_params; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_TESS_CTRL; + shader.key.tcs.epilog = key->tcs_epilog.states; + + /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */ + params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS); + last_array_pointer = SI_PARAM_RW_BUFFERS; + params[SI_PARAM_CONST_BUFFERS] = ctx.i64; + params[SI_PARAM_SAMPLERS] = ctx.i64; + params[SI_PARAM_UNUSED] = ctx.i64; + params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32; + params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32; + params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32; + params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32; + last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; + num_params = last_sgpr + 1; + + params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */ + params[num_params++] = ctx.i32; /* invocation ID within the patch */ + params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */ + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + last_array_pointer, last_sgpr); + declare_tess_lds(&ctx); + func = ctx.radeon_bld.main_fn; + + si_write_tess_factors(bld_base, + LLVMGetParam(func, last_sgpr + 1), + LLVMGetParam(func, last_sgpr + 2), + LLVMGetParam(func, last_sgpr + 3)); + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ctx.return_value); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Tessellation Control Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Select and compile (or reuse) TCS parts (epilog). + */ +static bool si_shader_select_tcs_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + union si_shader_part_key epilog_key; + + /* Get the epilog. */ + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.tcs_epilog.states = shader->key.tcs.epilog; + + shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, + &epilog_key, tm, debug, + si_compile_tcs_epilog); + return shader->epilog != NULL; +} + +/** + * Compile the pixel shader prolog. This handles: + * - two-side color selection and interpolation + * - overriding interpolation parameters for the API PS + * - polygon stippling + * + * All preloaded SGPRs and VGPRs are passed through unmodified unless they are + * overriden by other states. (e.g. per-sample interpolation) + * Interpolated colors are stored after the preloaded VGPRs. + */ +static bool si_compile_ps_prolog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + LLVMTypeRef *params; + LLVMValueRef ret, func; + int last_sgpr, num_params, num_returns, i, num_color_channels; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_FRAGMENT; + shader.key.ps.prolog = key->ps_prolog.states; + + /* Number of inputs + 8 color elements. */ + params = alloca((key->ps_prolog.num_input_sgprs + + key->ps_prolog.num_input_vgprs + 8) * + sizeof(LLVMTypeRef)); + + /* Declare inputs. */ + num_params = 0; + for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) + params[num_params++] = ctx.i32; + last_sgpr = num_params - 1; + + for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) + params[num_params++] = ctx.f32; + + /* Declare outputs (same as inputs + add colors if needed) */ + num_returns = num_params; + num_color_channels = util_bitcount(key->ps_prolog.colors_read); + for (i = 0; i < num_color_channels; i++) + params[num_returns++] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, params, num_returns, params, + num_params, -1, last_sgpr); + func = ctx.radeon_bld.main_fn; + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx.return_value; + for (i = 0; i < num_params; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + } + + /* Polygon stippling. */ + if (key->ps_prolog.states.poly_stipple) { + /* POS_FIXED_PT is always last. */ + unsigned pos = key->ps_prolog.num_input_sgprs + + key->ps_prolog.num_input_vgprs - 1; + LLVMValueRef ptr[2], views; + + /* Get the pointer to sampler views. */ + ptr[0] = LLVMGetParam(func, SI_SGPR_SAMPLERS); + ptr[1] = LLVMGetParam(func, SI_SGPR_SAMPLERS+1); + views = lp_build_gather_values(gallivm, ptr, 2); + views = LLVMBuildBitCast(gallivm->builder, views, ctx.i64, ""); + views = LLVMBuildIntToPtr(gallivm->builder, views, + const_array(ctx.v8i32, SI_NUM_SAMPLERS), ""); + + si_llvm_emit_polygon_stipple(&ctx, views, pos); + } + + /* Interpolate colors. */ + for (i = 0; i < 2; i++) { + unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; + unsigned face_vgpr = key->ps_prolog.num_input_sgprs + + key->ps_prolog.face_vgpr_index; + LLVMValueRef interp[2], color[4]; + LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; + + if (!writemask) + continue; + + /* If the interpolation qualifier is not CONSTANT (-1). */ + if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { + unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + + key->ps_prolog.color_interp_vgpr_index[i]; + + interp[0] = LLVMGetParam(func, interp_vgpr); + interp[1] = LLVMGetParam(func, interp_vgpr + 1); + interp_ij = lp_build_gather_values(gallivm, interp, 2); + interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij, + ctx.v2i32, ""); + } + + /* Use the absolute location of the input. */ + prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + + if (key->ps_prolog.states.color_two_side) { + face = LLVMGetParam(func, face_vgpr); + face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, ""); + } + + interp_fs_input(&ctx, + key->ps_prolog.color_attr_index[i], + TGSI_SEMANTIC_COLOR, i, + key->ps_prolog.num_interp_inputs, + key->ps_prolog.colors_read, interp_ij, + prim_mask, face, color); + + while (writemask) { + unsigned chan = u_bit_scan(&writemask); + ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan], + num_params++, ""); + } + } + + /* Force per-sample interpolation. */ + if (key->ps_prolog.states.force_persample_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef persp_sample[2], linear_sample[2]; + + /* Read PERSP_SAMPLE. */ + for (i = 0; i < 2; i++) + persp_sample[i] = LLVMGetParam(func, base + i); + /* Overwrite PERSP_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + persp_sample[i], base + 2 + i, ""); + /* Overwrite PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + persp_sample[i], base + 4 + i, ""); + /* Read LINEAR_SAMPLE. */ + for (i = 0; i < 2; i++) + linear_sample[i] = LLVMGetParam(func, base + 6 + i); + /* Overwrite LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + linear_sample[i], base + 8 + i, ""); + /* Overwrite LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + linear_sample[i], base + 10 + i, ""); + } + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ret); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Fragment Shader Prolog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Compile the pixel shader epilog. This handles everything that must be + * emulated for pixel shader exports. (alpha-test, format conversions, etc) + */ +static bool si_compile_ps_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[16+8*4+3]; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + int last_array_pointer, last_sgpr, num_params, i; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_FRAGMENT; + shader.key.ps.epilog = key->ps_epilog.states; + + /* Declare input SGPRs. */ + params[SI_PARAM_RW_BUFFERS] = ctx.i64; + params[SI_PARAM_CONST_BUFFERS] = ctx.i64; + params[SI_PARAM_SAMPLERS] = ctx.i64; + params[SI_PARAM_UNUSED] = ctx.i64; + params[SI_PARAM_ALPHA_REF] = ctx.f32; + last_array_pointer = -1; + last_sgpr = SI_PARAM_ALPHA_REF; + + /* Declare input VGPRs. */ + num_params = (last_sgpr + 1) + + util_bitcount(key->ps_epilog.colors_written) * 4 + + key->ps_epilog.writes_z + + key->ps_epilog.writes_stencil + + key->ps_epilog.writes_samplemask; + + num_params = MAX2(num_params, + last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + assert(num_params <= ARRAY_SIZE(params)); + + for (i = last_sgpr + 1; i < num_params; i++) + params[i] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + last_array_pointer, last_sgpr); + /* Disable elimination of unused inputs. */ + radeon_llvm_add_attribute(ctx.radeon_bld.main_fn, + "InitialPSInputAddr", 0xffffff); + + /* Process colors. */ + unsigned vgpr = last_sgpr + 1; + unsigned colors_written = key->ps_epilog.colors_written; + int last_color_export = -1; + + /* Find the last color export. */ + if (!key->ps_epilog.writes_z && + !key->ps_epilog.writes_stencil && + !key->ps_epilog.writes_samplemask) { + unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ + if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { + /* Just set this if any of the colorbuffers are enabled. */ + if (spi_format & + ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) + last_color_export = 0; + } else { + for (i = 0; i < 8; i++) + if (colors_written & (1 << i) && + (spi_format >> (i * 4)) & 0xf) + last_color_export = i; + } + } + + while (colors_written) { + LLVMValueRef color[4]; + int mrt = u_bit_scan(&colors_written); + + for (i = 0; i < 4; i++) + color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + + si_export_mrt_color(bld_base, color, mrt, + num_params - 1, + mrt == last_color_export); + } + + /* Process depth, stencil, samplemask. */ + if (key->ps_epilog.writes_z) + depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + if (key->ps_epilog.writes_stencil) + stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + if (key->ps_epilog.writes_samplemask) + samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + + if (depth || stencil || samplemask) + si_export_mrt_z(bld_base, depth, stencil, samplemask); + else if (last_color_export == -1) + si_export_null(bld_base); + + /* Compile. */ + LLVMBuildRetVoid(gallivm->builder); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Fragment Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Select and compile (or reuse) pixel shader parts (prolog & epilog). + */ +static bool si_shader_select_ps_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + struct tgsi_shader_info *info = &shader->selector->info; + union si_shader_part_key prolog_key; + union si_shader_part_key epilog_key; + unsigned i; + + /* Get the prolog. */ + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.ps_prolog.states = shader->key.ps.prolog; + prolog_key.ps_prolog.colors_read = info->colors_read; + prolog_key.ps_prolog.num_input_sgprs = shader->num_input_sgprs; + prolog_key.ps_prolog.num_input_vgprs = shader->num_input_vgprs; + + if (info->colors_read) { + unsigned *color = shader->selector->color_attr_index; + + if (shader->key.ps.prolog.color_two_side) { + /* BCOLORs are stored after the last input. */ + prolog_key.ps_prolog.num_interp_inputs = info->num_inputs; + prolog_key.ps_prolog.face_vgpr_index = shader->face_vgpr_index; + shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); + } + + for (i = 0; i < 2; i++) { + unsigned location = info->input_interpolate_loc[color[i]]; + + if (!(info->colors_read & (0xf << i*4))) + continue; + + prolog_key.ps_prolog.color_attr_index[i] = color[i]; + + /* Force per-sample interpolation for the colors here. */ + if (shader->key.ps.prolog.force_persample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + + switch (info->input_interpolate[color[i]]) { + case TGSI_INTERPOLATE_CONSTANT: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + case TGSI_INTERPOLATE_COLOR: + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_SAMPLE_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTER: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTER_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTROID_ENA(1); + break; + default: + assert(0); + } + break; + case TGSI_INTERPOLATE_LINEAR: + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_SAMPLE_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTER: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTER_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTROID_ENA(1); + break; + default: + assert(0); + } + break; + default: + assert(0); + } + } + } + + /* The prolog is a no-op if these aren't set. */ + if (prolog_key.ps_prolog.colors_read || + prolog_key.ps_prolog.states.force_persample_interp || + prolog_key.ps_prolog.states.poly_stipple) { + shader->prolog = + si_get_shader_part(sscreen, &sscreen->ps_prologs, + &prolog_key, tm, debug, + si_compile_ps_prolog); + if (!shader->prolog) + return false; + } + + /* Get the epilog. */ + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.ps_epilog.colors_written = info->colors_written; + epilog_key.ps_epilog.writes_z = info->writes_z; + epilog_key.ps_epilog.writes_stencil = info->writes_stencil; + epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask; + epilog_key.ps_epilog.states = shader->key.ps.epilog; + + shader->epilog = + si_get_shader_part(sscreen, &sscreen->ps_epilogs, + &epilog_key, tm, debug, + si_compile_ps_epilog); + if (!shader->epilog) + return false; + + /* Enable POS_FIXED_PT if polygon stippling is enabled. */ + if (shader->key.ps.prolog.poly_stipple) { + shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); + assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); + } + + /* Set up the enable bits for per-sample shading if needed. */ + if (shader->key.ps.prolog.force_persample_interp) { + if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) { + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); + } + if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) { + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); + } + } + + /* POW_W_FLOAT requires that one of the perspective weights is enabled. */ + if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && + !(shader->config.spi_ps_input_ena & 0xf)) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* At least one pair of interpolation weights must be enabled. */ + if (!(shader->config.spi_ps_input_ena & 0x7f)) { + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* The sample mask input is always enabled, because the API shader always + * passes it through to the epilog. Disable it here if it's unused. + */ + if (!shader->key.ps.epilog.poly_line_smoothing && + !shader->selector->info.reads_samplemask) + shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; + + return true; +} + +int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) { - FREE(binary->code); - FREE(binary->rodata); - FREE(binary->relocs); - FREE(binary->disasm_string); + struct si_shader *mainp = shader->selector->main_shader_part; + int r; + + /* LS and ES are always compiled on demand. */ + if (!mainp || + (shader->selector->type == PIPE_SHADER_VERTEX && + (shader->key.vs.as_es || shader->key.vs.as_ls)) || + (shader->selector->type == PIPE_SHADER_TESS_EVAL && + shader->key.tes.as_es)) { + /* Monolithic shader (compiled as a whole, has many variants, + * may take a long time to compile). + */ + r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug); + if (r) + return r; + } else { + /* The shader consists of 2-3 parts: + * + * - the middle part is the user shader, it has 1 variant only + * and it was compiled during the creation of the shader + * selector + * - the prolog part is inserted at the beginning + * - the epilog part is inserted at the end + * + * The prolog and epilog have many (but simple) variants. + */ + + /* Copy the compiled TGSI shader data over. */ + shader->is_binary_shared = true; + shader->binary = mainp->binary; + shader->config = mainp->config; + shader->num_input_sgprs = mainp->num_input_sgprs; + shader->num_input_vgprs = mainp->num_input_vgprs; + shader->face_vgpr_index = mainp->face_vgpr_index; + memcpy(shader->vs_output_param_offset, + mainp->vs_output_param_offset, + sizeof(mainp->vs_output_param_offset)); + shader->uses_instanceid = mainp->uses_instanceid; + shader->nr_pos_exports = mainp->nr_pos_exports; + shader->nr_param_exports = mainp->nr_param_exports; + + /* Select prologs and/or epilogs. */ + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + if (!si_shader_select_vs_parts(sscreen, tm, shader, debug)) + return -1; + break; + case PIPE_SHADER_TESS_CTRL: + if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug)) + return -1; + break; + case PIPE_SHADER_TESS_EVAL: + if (!si_shader_select_tes_parts(sscreen, tm, shader, debug)) + return -1; + break; + case PIPE_SHADER_FRAGMENT: + if (!si_shader_select_ps_parts(sscreen, tm, shader, debug)) + return -1; + + /* Make sure we have at least as many VGPRs as there + * are allocated inputs. + */ + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->num_input_vgprs); + break; + } + + /* Update SGPR and VGPR counts. */ + if (shader->prolog) { + shader->config.num_sgprs = MAX2(shader->config.num_sgprs, + shader->prolog->config.num_sgprs); + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->prolog->config.num_vgprs); + } + if (shader->epilog) { + shader->config.num_sgprs = MAX2(shader->config.num_sgprs, + shader->epilog->config.num_sgprs); + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->epilog->config.num_vgprs); + } + } + + si_shader_dump(sscreen, shader, debug, shader->selector->info.processor); + + /* Upload. */ + r = si_shader_binary_upload(sscreen, shader); + if (r) { + fprintf(stderr, "LLVM failed to upload shader\n"); + return r; + } + + return 0; } void si_shader_destroy(struct si_shader *shader) @@ -4249,5 +5991,7 @@ void si_shader_destroy(struct si_shader *shader) r600_resource_reference(&shader->scratch_bo, NULL); r600_resource_reference(&shader->bo, NULL); - si_shader_destroy_binary(&shader->binary); + + if (!shader->is_binary_shared) + radeon_shader_binary_clean(&shader->binary); }