X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_shader.c;h=9183852a8568e65004c6104ed50903fa3a5285cb;hb=754cf171e90cc61d135e7c45f8be319ee2db02a5;hp=ea1a658eddc910cf0d8857e5f5587ed5d58beacf;hpb=e1b21696a3c2c23ba401fab58ef042d31f7f6273;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index ea1a658eddc..9183852a856 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -109,9 +109,11 @@ struct si_shader_context LLVMTypeRef i1; LLVMTypeRef i8; LLVMTypeRef i32; + LLVMTypeRef i64; LLVMTypeRef i128; LLVMTypeRef f32; LLVMTypeRef v16i8; + LLVMTypeRef v2i32; LLVMTypeRef v4i32; LLVMTypeRef v4f32; LLVMTypeRef v8i32; @@ -126,8 +128,12 @@ static struct si_shader_context *si_shader_context( static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, struct si_shader *shader, - LLVMTargetMachineRef tm, - struct tgsi_shader_info *info); + LLVMTargetMachineRef tm); + +/* Ideally pass the sample mask input to the PS epilog as v13, which + * is its usual location, so that the shader doesn't have to add v_mov. + */ +#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13 /* The VS location of the PrimitiveID input is the same in the epilog, * so that the main shader part doesn't have to move it. 
@@ -208,6 +214,10 @@ static LLVMValueRef unpack_param(struct si_shader_context *ctx, LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn, param); + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) + value = bitcast(&ctx->radeon_bld.soa.bld_base, + TGSI_TYPE_UNSIGNED, value); + if (rshift) value = LLVMBuildLShr(gallivm->builder, value, lp_build_const_int32(gallivm, rshift), ""); @@ -872,7 +882,8 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location) static unsigned select_interp_param(struct si_shader_context *ctx, unsigned param) { - if (!ctx->shader->key.ps.prolog.force_persample_interp) + if (!ctx->shader->key.ps.prolog.force_persample_interp || + !ctx->is_monolithic) return param; /* If the shader doesn't use center/centroid, just return the parameter. @@ -1016,6 +1027,7 @@ static void declare_input_fs( unsigned input_index, const struct tgsi_full_declaration *decl) { + struct lp_build_context *base = &radeon_bld->soa.bld_base.base; struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); struct si_shader *shader = ctx->shader; @@ -1023,6 +1035,26 @@ static void declare_input_fs( LLVMValueRef interp_param = NULL; int interp_param_idx; + /* Get colors from input VGPRs (set by the prolog). */ + if (!ctx->is_monolithic && + decl->Semantic.Name == TGSI_SEMANTIC_COLOR) { + unsigned i = decl->Semantic.Index; + unsigned colors_read = shader->selector->info.colors_read; + unsigned mask = colors_read >> (i * 4); + unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + + (i ? util_bitcount(colors_read & 0xf) : 0); + + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] = + mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] = + mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] = + mask & 0x4 ? 
LLVMGetParam(main_fn, offset++) : base->undef; + radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] = + mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef; + return; + } + interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate, decl->Interp.Location); if (interp_param_idx == -1) @@ -2081,14 +2113,51 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); - LLVMValueRef invocation_id; + LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; + rel_patch_id = get_rel_patch_id(ctx); invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); + tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); - si_write_tess_factors(bld_base, - get_rel_patch_id(ctx), - invocation_id, - get_tcs_out_current_patch_data_offset(ctx)); + if (!ctx->is_monolithic) { + /* Return epilog parameters from this function. */ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef ret = ctx->return_value; + LLVMValueRef rw_buffers, rw0, rw1, tf_soffset; + unsigned vgpr; + + /* RW_BUFFERS pointer */ + rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_RW_BUFFERS); + rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, ""); + rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, ""); + rw0 = LLVMBuildExtractElement(builder, rw_buffers, + bld_base->uint_bld.zero, ""); + rw1 = LLVMBuildExtractElement(builder, rw_buffers, + bld_base->uint_bld.one, ""); + ret = LLVMBuildInsertValue(builder, ret, rw0, 0, ""); + ret = LLVMBuildInsertValue(builder, ret, rw1, 1, ""); + + /* Tess factor buffer soffset is after user SGPRs. 
*/ + tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_TESS_FACTOR_OFFSET); + ret = LLVMBuildInsertValue(builder, ret, tf_soffset, + SI_TCS_NUM_USER_SGPR, ""); + + /* VGPRs */ + rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id); + invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id); + tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset); + + vgpr = SI_TCS_NUM_USER_SGPR + 1; + ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + ctx->return_value = ret; + return; + } + + si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset); } static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base) @@ -2491,6 +2560,100 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base) si_export_mrt_z(bld_base, depth, stencil, samplemask); } +/** + * Return PS outputs in this order: + * + * v[0:3] = color0.xyzw + * v[4:7] = color1.xyzw + * ... + * vN+0 = Depth + * vN+1 = Stencil + * vN+2 = SampleMask + * vN+3 = SampleMaskIn (used for OpenGL smoothing) + * + * The alpha-ref SGPR is returned via its original location. + */ +static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + struct si_shader *shader = ctx->shader; + struct lp_build_context *base = &bld_base->base; + struct tgsi_shader_info *info = &shader->selector->info; + LLVMBuilderRef builder = base->gallivm->builder; + unsigned i, j, first_vgpr, vgpr; + + LLVMValueRef color[8][4] = {}; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + LLVMValueRef ret; + + /* Read the output values. 
*/ + for (i = 0; i < info->num_outputs; i++) { + unsigned semantic_name = info->output_semantic_name[i]; + unsigned semantic_index = info->output_semantic_index[i]; + + switch (semantic_name) { + case TGSI_SEMANTIC_COLOR: + assert(semantic_index < 8); + for (j = 0; j < 4; j++) { + LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j]; + LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); + color[semantic_index][j] = result; + } + break; + case TGSI_SEMANTIC_POSITION: + depth = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][2], ""); + break; + case TGSI_SEMANTIC_STENCIL: + stencil = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][1], ""); + break; + case TGSI_SEMANTIC_SAMPLEMASK: + samplemask = LLVMBuildLoad(builder, + ctx->radeon_bld.soa.outputs[i][0], ""); + break; + default: + fprintf(stderr, "Warning: SI unhandled fs output type:%d\n", + semantic_name); + } + } + + /* Fill the return structure. */ + ret = ctx->return_value; + + /* Set SGPRs. */ + ret = LLVMBuildInsertValue(builder, ret, + bitcast(bld_base, TGSI_TYPE_SIGNED, + LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_ALPHA_REF)), + SI_SGPR_ALPHA_REF, ""); + + /* Set VGPRs */ + first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; + for (i = 0; i < ARRAY_SIZE(color); i++) { + if (!color[i][0]) + continue; + + for (j = 0; j < 4; j++) + ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); + } + if (depth) + ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); + if (stencil) + ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); + if (samplemask) + ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); + + /* Add the input sample mask for smoothing at the end. 
*/ + if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) + vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; + ret = LLVMBuildInsertValue(builder, ret, + LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); + + ctx->return_value = ret; +} + static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data); @@ -2569,13 +2732,12 @@ static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) /** * Load an image view, fmask view. or sampler state descriptor. */ -static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx, - LLVMValueRef index, enum desc_type type) +static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx, + LLVMValueRef list, LLVMValueRef index, + enum desc_type type) { struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, - SI_PARAM_SAMPLERS); switch (type) { case DESC_IMAGE: @@ -2591,12 +2753,21 @@ static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx, /* The sampler state is at [12:15]. 
*/ index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), ""); - ptr = LLVMBuildPointerCast(builder, ptr, - const_array(ctx->v4i32, 0), ""); + list = LLVMBuildPointerCast(builder, list, + const_array(ctx->v4i32, 0), ""); break; } - return build_indexed_load_const(ctx, ptr, index); + return build_indexed_load_const(ctx, list, index); +} + +static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx, + LLVMValueRef index, enum desc_type type) +{ + LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn, + SI_PARAM_SAMPLERS); + + return get_sampler_desc_custom(ctx, list, index, type); } static void tex_fetch_ptrs( @@ -3682,12 +3853,11 @@ static void create_function(struct si_shader_context *ctx) struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; struct gallivm_state *gallivm = bld_base->base.gallivm; struct si_shader *shader = ctx->shader; - LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v2i32, v3i32; + LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32; LLVMTypeRef returns[16+32*4]; - unsigned i, last_array_pointer, last_sgpr, num_params; + unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs; unsigned num_returns = 0; - v2i32 = LLVMVectorType(ctx->i32, 2); v3i32 = LLVMVectorType(ctx->i32, 3); params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS); @@ -3757,6 +3927,15 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_PATCH_ID] = ctx->i32; params[SI_PARAM_REL_IDS] = ctx->i32; num_params = SI_PARAM_REL_IDS+1; + + if (!ctx->is_monolithic) { + /* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. 
*/ + for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++) + returns[num_returns++] = ctx->i32; /* SGPRs */ + + for (i = 0; i < 3; i++) + returns[num_returns++] = ctx->f32; /* VGPRs */ + } break; case TGSI_PROCESSOR_TESS_EVAL: @@ -3805,13 +3984,13 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_ALPHA_REF] = ctx->f32; params[SI_PARAM_PRIM_MASK] = ctx->i32; last_sgpr = SI_PARAM_PRIM_MASK; - params[SI_PARAM_PERSP_SAMPLE] = v2i32; - params[SI_PARAM_PERSP_CENTER] = v2i32; - params[SI_PARAM_PERSP_CENTROID] = v2i32; + params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32; + params[SI_PARAM_PERSP_CENTER] = ctx->v2i32; + params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32; params[SI_PARAM_PERSP_PULL_MODEL] = v3i32; - params[SI_PARAM_LINEAR_SAMPLE] = v2i32; - params[SI_PARAM_LINEAR_CENTER] = v2i32; - params[SI_PARAM_LINEAR_CENTROID] = v2i32; + params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32; + params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32; + params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32; params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32; params[SI_PARAM_POS_X_FLOAT] = ctx->f32; params[SI_PARAM_POS_Y_FLOAT] = ctx->f32; @@ -3820,8 +3999,39 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_FRONT_FACE] = ctx->i32; params[SI_PARAM_ANCILLARY] = ctx->i32; params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32; - params[SI_PARAM_POS_FIXED_PT] = ctx->f32; + params[SI_PARAM_POS_FIXED_PT] = ctx->i32; num_params = SI_PARAM_POS_FIXED_PT+1; + + if (!ctx->is_monolithic) { + /* Color inputs from the prolog. */ + if (shader->selector->info.colors_read) { + unsigned num_color_elements = + util_bitcount(shader->selector->info.colors_read); + + assert(num_params + num_color_elements <= ARRAY_SIZE(params)); + for (i = 0; i < num_color_elements; i++) + params[num_params++] = ctx->f32; + } + + /* Outputs for the epilog. 
*/ + num_return_sgprs = SI_SGPR_ALPHA_REF + 1; + num_returns = + num_return_sgprs + + util_bitcount(shader->selector->info.colors_written) * 4 + + shader->selector->info.writes_z + + shader->selector->info.writes_stencil + + shader->selector->info.writes_samplemask + + 1 /* SampleMaskIn */; + + num_returns = MAX2(num_returns, + num_return_sgprs + + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + for (i = 0; i < num_return_sgprs; i++) + returns[i] = ctx->i32; + for (; i < num_returns; i++) + returns[i] = ctx->f32; + } break; default: @@ -3834,6 +4044,21 @@ static void create_function(struct si_shader_context *ctx) si_create_function(ctx, returns, num_returns, params, num_params, last_array_pointer, last_sgpr); + /* Reserve register locations for VGPR inputs the PS prolog may need. */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT && + !ctx->is_monolithic) { + radeon_llvm_add_attribute(ctx->radeon_bld.main_fn, + "InitialPSInputAddr", + S_0286D0_PERSP_SAMPLE_ENA(1) | + S_0286D0_PERSP_CENTER_ENA(1) | + S_0286D0_PERSP_CENTROID_ENA(1) | + S_0286D0_LINEAR_SAMPLE_ENA(1) | + S_0286D0_LINEAR_CENTER_ENA(1) | + S_0286D0_LINEAR_CENTROID_ENA(1) | + S_0286D0_FRONT_FACE_ENA(1) | + S_0286D0_POS_FIXED_PT_ENA(1)); + } + shader->num_input_sgprs = 0; shader->num_input_vgprs = 0; @@ -3995,6 +4220,49 @@ static void preload_ring_buffers(struct si_shader_context *ctx) } } +static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, + LLVMValueRef param_sampler_views, + unsigned param_pos_fixed_pt) +{ + struct lp_build_tgsi_context *bld_base = + &ctx->radeon_bld.soa.bld_base; + struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_emit_data result = {}; + struct tgsi_full_instruction inst = {}; + LLVMValueRef desc, sampler_index, address[2], pix; + + /* Use the fixed-point gl_FragCoord input. + * Since the stipple pattern is 32x32 and it repeats, just get 5 bits + * per coordinate to get the repeating effect. 
+ */ + address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5); + address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5); + + /* Load the sampler view descriptor. */ + sampler_index = lp_build_const_int32(gallivm, SI_POLY_STIPPLE_SAMPLER); + desc = get_sampler_desc_custom(ctx, param_sampler_views, + sampler_index, DESC_IMAGE); + + /* Load the texel. */ + inst.Instruction.Opcode = TGSI_OPCODE_TXF; + inst.Texture.Texture = TGSI_TEXTURE_2D_MSAA; /* = use load, not load_mip */ + result.inst = &inst; + set_tex_fetch_args(ctx, &result, TGSI_OPCODE_TXF, + inst.Texture.Texture, + desc, NULL, address, ARRAY_SIZE(address), 0xf); + build_tex_intrinsic(&tex_action, bld_base, &result); + + /* Kill the thread accordingly. */ + pix = LLVMBuildExtractElement(gallivm->builder, result.output[0], + lp_build_const_int32(gallivm, 3), ""); + pix = bitcast(bld_base, TGSI_TYPE_FLOAT, pix); + pix = LLVMBuildFNeg(gallivm->builder, pix, ""); + + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", + LLVMVoidTypeInContext(gallivm->context), + &pix, 1, 0); +} + void si_shader_binary_read_config(struct radeon_shader_binary *binary, struct si_shader_config *conf, unsigned symbol_offset) @@ -4252,12 +4520,44 @@ static void si_shader_dump_stats(struct si_screen *sscreen, max_simd_waves); } +static const char *si_get_shader_name(struct si_shader *shader, + unsigned processor) +{ + switch (processor) { + case TGSI_PROCESSOR_VERTEX: + if (shader->key.vs.as_es) + return "Vertex Shader as ES"; + else if (shader->key.vs.as_ls) + return "Vertex Shader as LS"; + else + return "Vertex Shader as VS"; + case TGSI_PROCESSOR_TESS_CTRL: + return "Tessellation Control Shader"; + case TGSI_PROCESSOR_TESS_EVAL: + if (shader->key.tes.as_es) + return "Tessellation Evaluation Shader as ES"; + else + return "Tessellation Evaluation Shader as VS"; + case TGSI_PROCESSOR_GEOMETRY: + if (shader->gs_copy_shader == NULL) + return "GS Copy Shader as VS"; + else + return "Geometry Shader"; + case 
TGSI_PROCESSOR_FRAGMENT: + return "Pixel Shader"; + case TGSI_PROCESSOR_COMPUTE: + return "Compute Shader"; + default: + return "Unknown Shader"; + } +} + void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, struct pipe_debug_callback *debug, unsigned processor) { if (r600_can_dump_shader(&sscreen->b, processor) && !(sscreen->b.debug_flags & DBG_NO_ASM)) { - fprintf(stderr, "\n"); + fprintf(stderr, "\n%s:\n", si_get_shader_name(shader, processor)); if (shader->prolog) si_shader_dump_disassembly(&shader->prolog->binary, @@ -4358,7 +4658,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen, outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0])); - si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm, gsinfo); + si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm); ctx->type = TGSI_PROCESSOR_VERTEX; ctx->is_gs_copy_shader = true; @@ -4482,8 +4782,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) static void si_init_shader_ctx(struct si_shader_context *ctx, struct si_screen *sscreen, struct si_shader *shader, - LLVMTargetMachineRef tm, - struct tgsi_shader_info *info) + LLVMTargetMachineRef tm) { struct lp_build_tgsi_context *bld_base; @@ -4501,15 +4800,18 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context); ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context); ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context); + ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context); ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128); ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context); ctx->v16i8 = LLVMVectorType(ctx->i8, 16); + ctx->v2i32 = LLVMVectorType(ctx->i32, 2); ctx->v4i32 = LLVMVectorType(ctx->i32, 4); ctx->v4f32 = LLVMVectorType(ctx->f32, 4); ctx->v8i32 = LLVMVectorType(ctx->i32, 8); bld_base = &ctx->radeon_bld.soa.bld_base; - bld_base->info = info; 
+ if (shader && shader->selector) + bld_base->info = &shader->selector->info; bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action; @@ -4545,40 +4847,28 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32"; } -static int si_compile_tgsi_shader(struct si_screen *sscreen, - LLVMTargetMachineRef tm, - struct si_shader *shader, - bool is_monolithic, - struct pipe_debug_callback *debug) +int si_compile_tgsi_shader(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + bool is_monolithic, + struct pipe_debug_callback *debug) { struct si_shader_selector *sel = shader->selector; - struct tgsi_token *tokens = sel->tokens; struct si_shader_context ctx; struct lp_build_tgsi_context *bld_base; - struct tgsi_shader_info stipple_shader_info; LLVMModuleRef mod; int r = 0; - bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT && - shader->key.ps.prolog.poly_stipple; - - if (poly_stipple) { - tokens = util_pstipple_create_fragment_shader(tokens, NULL, - SI_POLY_STIPPLE_SAMPLER, - TGSI_FILE_SYSTEM_VALUE); - tgsi_scan_shader(tokens, &stipple_shader_info); - } /* Dump TGSI code before doing TGSI->LLVM conversion in case the * conversion fails. */ if (r600_can_dump_shader(&sscreen->b, sel->info.processor) && !(sscreen->b.debug_flags & DBG_NO_TGSI)) { si_dump_shader_key(sel->type, &shader->key, stderr); - tgsi_dump(tokens, 0); + tgsi_dump(sel->tokens, 0); si_dump_streamout(&sel->so); } - si_init_shader_ctx(&ctx, sscreen, shader, tm, - poly_stipple ? 
&stipple_shader_info : &sel->info); + si_init_shader_ctx(&ctx, sscreen, shader, tm); ctx.is_monolithic = is_monolithic; shader->uses_instanceid = sel->info.uses_instanceid; @@ -4615,7 +4905,10 @@ static int si_compile_tgsi_shader(struct si_screen *sscreen, break; case TGSI_PROCESSOR_FRAGMENT: ctx.radeon_bld.load_input = declare_input_fs; - bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; + if (is_monolithic) + bld_base->emit_epilogue = si_llvm_emit_fs_epilogue; + else + bld_base->emit_epilogue = si_llvm_return_fs_outputs; break; default: assert(!"Unsupported shader type"); @@ -4629,6 +4922,14 @@ static int si_compile_tgsi_shader(struct si_screen *sscreen, preload_streamout_buffers(&ctx); preload_ring_buffers(&ctx); + if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT && + shader->key.ps.prolog.poly_stipple) { + LLVMValueRef views = LLVMGetParam(ctx.radeon_bld.main_fn, + SI_PARAM_SAMPLERS); + si_llvm_emit_polygon_stipple(&ctx, views, + SI_PARAM_POS_FIXED_PT); + } + if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { int i; for (i = 0; i < 4; i++) { @@ -4638,7 +4939,7 @@ static int si_compile_tgsi_shader(struct si_screen *sscreen, } } - if (!lp_build_tgsi_llvm(bld_base, tokens)) { + if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) { fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n"); goto out; } @@ -4718,8 +5019,6 @@ static int si_compile_tgsi_shader(struct si_screen *sscreen, out: for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++) FREE(ctx.constants[i]); - if (poly_stipple) - tgsi_free_tokens(tokens); return r; } @@ -4802,7 +5101,7 @@ static bool si_compile_vs_prolog(struct si_screen *sscreen, int last_sgpr, num_params, num_returns, i; bool status = true; - si_init_shader_ctx(&ctx, sscreen, &shader, tm, NULL); + si_init_shader_ctx(&ctx, sscreen, &shader, tm); ctx.type = TGSI_PROCESSOR_VERTEX; ctx.param_vertex_id = key->vs_prolog.num_input_sgprs; ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3; @@ -4910,7 +5209,7 @@ static bool 
si_compile_vs_epilog(struct si_screen *sscreen, int num_params, i; bool status = true; - si_init_shader_ctx(&ctx, sscreen, NULL, tm, NULL); + si_init_shader_ctx(&ctx, sscreen, NULL, tm); ctx.type = TGSI_PROCESSOR_VERTEX; /* Declare input VGPRs. */ @@ -5048,28 +5347,610 @@ static bool si_shader_select_tes_parts(struct si_screen *sscreen, &shader->key.tes.epilog); } +/** + * Compile the TCS epilog. This writes tessellation factors to memory based on + * the output primitive type of the tessellator (determined by TES). + */ +static bool si_compile_tcs_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[16]; + LLVMValueRef func; + int last_array_pointer, last_sgpr, num_params; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_TESS_CTRL; + shader.key.tcs.epilog = key->tcs_epilog.states; + + /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. 
*/ + params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS); + last_array_pointer = SI_PARAM_RW_BUFFERS; + params[SI_PARAM_CONST_BUFFERS] = ctx.i64; + params[SI_PARAM_SAMPLERS] = ctx.i64; + params[SI_PARAM_UNUSED] = ctx.i64; + params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32; + params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32; + params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32; + params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32; + last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; + num_params = last_sgpr + 1; + + params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */ + params[num_params++] = ctx.i32; /* invocation ID within the patch */ + params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */ + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + last_array_pointer, last_sgpr); + declare_tess_lds(&ctx); + func = ctx.radeon_bld.main_fn; + + si_write_tess_factors(bld_base, + LLVMGetParam(func, last_sgpr + 1), + LLVMGetParam(func, last_sgpr + 2), + LLVMGetParam(func, last_sgpr + 3)); + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ctx.return_value); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Tessellation Control Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Select and compile (or reuse) TCS parts (epilog). + */ +static bool si_shader_select_tcs_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ + union si_shader_part_key epilog_key; + + /* Get the epilog. 
*/ + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.tcs_epilog.states = shader->key.tcs.epilog; + + shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, + &epilog_key, tm, debug, + si_compile_tcs_epilog); + return shader->epilog != NULL; +} + +/** + * Compile the pixel shader prolog. This handles: + * - two-side color selection and interpolation + * - overriding interpolation parameters for the API PS + * - polygon stippling + * + * All preloaded SGPRs and VGPRs are passed through unmodified unless they are + * overridden by other states (e.g. per-sample interpolation). + * Interpolated colors are stored after the preloaded VGPRs. + */ +static bool si_compile_ps_prolog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + LLVMTypeRef *params; + LLVMValueRef ret, func; + int last_sgpr, num_params, num_returns, i, num_color_channels; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_FRAGMENT; + shader.key.ps.prolog = key->ps_prolog.states; + + /* Number of inputs + 8 color elements. */ + params = alloca((key->ps_prolog.num_input_sgprs + + key->ps_prolog.num_input_vgprs + 8) * + sizeof(LLVMTypeRef)); + + /* Declare inputs. */ + num_params = 0; + for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) + params[num_params++] = ctx.i32; + last_sgpr = num_params - 1; + + for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) + params[num_params++] = ctx.f32; + + /* Declare outputs (same as inputs + add colors if needed) */ + num_returns = num_params; + num_color_channels = util_bitcount(key->ps_prolog.colors_read); + for (i = 0; i < num_color_channels; i++) + params[num_returns++] = ctx.f32; + + /* Create the function. 
*/ + si_create_function(&ctx, params, num_returns, params, + num_params, -1, last_sgpr); + func = ctx.radeon_bld.main_fn; + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx.return_value; + for (i = 0; i < num_params; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); + } + + /* Polygon stippling. */ + if (key->ps_prolog.states.poly_stipple) { + /* POS_FIXED_PT is always last. */ + unsigned pos = key->ps_prolog.num_input_sgprs + + key->ps_prolog.num_input_vgprs - 1; + LLVMValueRef ptr[2], views; + + /* Get the pointer to sampler views. */ + ptr[0] = LLVMGetParam(func, SI_SGPR_SAMPLERS); + ptr[1] = LLVMGetParam(func, SI_SGPR_SAMPLERS+1); + views = lp_build_gather_values(gallivm, ptr, 2); + views = LLVMBuildBitCast(gallivm->builder, views, ctx.i64, ""); + views = LLVMBuildIntToPtr(gallivm->builder, views, + const_array(ctx.v8i32, SI_NUM_SAMPLERS), ""); + + si_llvm_emit_polygon_stipple(&ctx, views, pos); + } + + /* Interpolate colors. */ + for (i = 0; i < 2; i++) { + unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; + unsigned face_vgpr = key->ps_prolog.num_input_sgprs + + key->ps_prolog.face_vgpr_index; + LLVMValueRef interp[2], color[4]; + LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; + + if (!writemask) + continue; + + /* If the interpolation qualifier is not CONSTANT (-1). */ + if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { + unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + + key->ps_prolog.color_interp_vgpr_index[i]; + + interp[0] = LLVMGetParam(func, interp_vgpr); + interp[1] = LLVMGetParam(func, interp_vgpr + 1); + interp_ij = lp_build_gather_values(gallivm, interp, 2); + interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij, + ctx.v2i32, ""); + } + + /* Use the absolute location of the input. 
*/ + prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + + if (key->ps_prolog.states.color_two_side) { + face = LLVMGetParam(func, face_vgpr); + face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, ""); + } + + interp_fs_input(&ctx, + key->ps_prolog.color_attr_index[i], + TGSI_SEMANTIC_COLOR, i, + key->ps_prolog.num_interp_inputs, + key->ps_prolog.colors_read, interp_ij, + prim_mask, face, color); + + while (writemask) { + unsigned chan = u_bit_scan(&writemask); + ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan], + num_params++, ""); + } + } + + /* Force per-sample interpolation. */ + if (key->ps_prolog.states.force_persample_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef persp_sample[2], linear_sample[2]; + + /* Read PERSP_SAMPLE. */ + for (i = 0; i < 2; i++) + persp_sample[i] = LLVMGetParam(func, base + i); + /* Overwrite PERSP_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + persp_sample[i], base + 2 + i, ""); + /* Overwrite PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + persp_sample[i], base + 4 + i, ""); + /* Read LINEAR_SAMPLE. */ + for (i = 0; i < 2; i++) + linear_sample[i] = LLVMGetParam(func, base + 6 + i); + /* Overwrite LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + linear_sample[i], base + 8 + i, ""); + /* Overwrite LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(gallivm->builder, ret, + linear_sample[i], base + 10 + i, ""); + } + + /* Compile. */ + LLVMBuildRet(gallivm->builder, ret); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Fragment Shader Prolog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Compile the pixel shader epilog. 
This handles everything that must be + * emulated for pixel shader exports. (alpha-test, format conversions, etc) + */ +static bool si_compile_ps_epilog(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct pipe_debug_callback *debug, + struct si_shader_part *out) +{ /* Builds the epilog selected by out->key as its own LLVM function and compiles it with si_compile_llvm, which is handed &out->binary and &out->config. Returns true unless si_compile_llvm returns non-zero. */ + union si_shader_part_key *key = &out->key; + struct si_shader shader = {}; /* local stand-in shader: only key.ps.epilog is filled in below */ + struct si_shader_context ctx; + struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base; + LLVMTypeRef params[16+8*4+3]; /* NOTE(review): presumably 16 SGPR slots + 8 colors x 4 channels + Z, stencil, samplemask - confirm */ + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + int last_array_pointer, last_sgpr, num_params, i; + bool status = true; + + si_init_shader_ctx(&ctx, sscreen, &shader, tm); + ctx.type = TGSI_PROCESSOR_FRAGMENT; + shader.key.ps.epilog = key->ps_epilog.states; + + /* Declare input SGPRs. */ + params[SI_PARAM_RW_BUFFERS] = ctx.i64; + params[SI_PARAM_CONST_BUFFERS] = ctx.i64; + params[SI_PARAM_SAMPLERS] = ctx.i64; + params[SI_PARAM_UNUSED] = ctx.i64; + params[SI_PARAM_ALPHA_REF] = ctx.f32; + last_array_pointer = -1; + last_sgpr = SI_PARAM_ALPHA_REF; + + /* Declare input VGPRs. */ /* One f32 parameter per channel of every color set in colors_written, plus one each for Z, stencil and sample mask when the key says they are written. */ + num_params = (last_sgpr + 1) + + util_bitcount(key->ps_epilog.colors_written) * 4 + + key->ps_epilog.writes_z + + key->ps_epilog.writes_stencil + + key->ps_epilog.writes_samplemask; + + num_params = MAX2(num_params, + last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); /* pad the list so that VGPR index PS_EPILOG_SAMPLEMASK_MIN_LOC always exists after the SGPRs */ + + assert(num_params <= ARRAY_SIZE(params)); + + for (i = last_sgpr + 1; i < num_params; i++) + params[i] = ctx.f32; + + /* Create the function. */ + si_create_function(&ctx, NULL, 0, params, num_params, + last_array_pointer, last_sgpr); + /* Disable elimination of unused inputs. */ + radeon_llvm_add_attribute(ctx.radeon_bld.main_fn, + "InitialPSInputAddr", 0xffffff); + + /* Process colors. */ + unsigned vgpr = last_sgpr + 1; + unsigned colors_written = key->ps_epilog.colors_written; + int last_color_export = -1; + + /* Find the last color export.
*/ + if (!key->ps_epilog.writes_z && + !key->ps_epilog.writes_stencil && + !key->ps_epilog.writes_samplemask) { /* when any of the three is written, last_color_export stays -1 and no color export is flagged as last */ + unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ + if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { + /* Just set this if any of the colorbuffers are enabled. */ + if (spi_format & + ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) + last_color_export = 0; + } else { + for (i = 0; i < 8; i++) /* the highest written color whose 4-bit spi_format field is non-zero wins */ + if (colors_written & (1 << i) && + (spi_format >> (i * 4)) & 0xf) + last_color_export = i; + } + } + + while (colors_written) { /* each written color consumes four consecutive VGPR parameters */ + LLVMValueRef color[4]; + int mrt = u_bit_scan(&colors_written); + + for (i = 0; i < 4; i++) + color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + + si_export_mrt_color(bld_base, color, mrt, + num_params - 1, + mrt == last_color_export); /* num_params - 1 is the index of the final parameter; NOTE(review): presumably the sample mask slot - confirm in si_export_mrt_color */ + } + + /* Process depth, stencil, samplemask. */ + if (key->ps_epilog.writes_z) + depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + if (key->ps_epilog.writes_stencil) + stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + if (key->ps_epilog.writes_samplemask) + samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++); + + if (depth || stencil || samplemask) + si_export_mrt_z(bld_base, depth, stencil, samplemask); + else if (last_color_export == -1) + si_export_null(bld_base); /* no Z, stencil or samplemask export and no color flagged as the last export: emit a null export */ + + /* Compile. */ + LLVMBuildRetVoid(gallivm->builder); + radeon_llvm_finalize_module(&ctx.radeon_bld); + + if (si_compile_llvm(sscreen, &out->binary, &out->config, tm, + gallivm->module, debug, ctx.type, + "Fragment Shader Epilog")) + status = false; + + radeon_llvm_dispose(&ctx.radeon_bld); + return status; +} + +/** + * Select and compile (or reuse) pixel shader parts (prolog & epilog).
+ */ +static bool si_shader_select_ps_parts(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + struct pipe_debug_callback *debug) +{ /* Fills a prolog key and an epilog key from shader->key.ps and the selector TGSI info, fetches the matching parts with si_get_shader_part, then adjusts shader->config.spi_ps_input_ena. Returns false when a requested part comes back NULL, true otherwise. */ + struct tgsi_shader_info *info = &shader->selector->info; + union si_shader_part_key prolog_key; + union si_shader_part_key epilog_key; + unsigned i; + + /* Get the prolog. */ + memset(&prolog_key, 0, sizeof(prolog_key)); /* start from an all-zero key; NOTE(review): presumably keys are compared bytewise by si_get_shader_part - confirm */ + prolog_key.ps_prolog.states = shader->key.ps.prolog; + prolog_key.ps_prolog.colors_read = info->colors_read; + prolog_key.ps_prolog.num_input_sgprs = shader->num_input_sgprs; + prolog_key.ps_prolog.num_input_vgprs = shader->num_input_vgprs; + + if (info->colors_read) { + unsigned *color = shader->selector->color_attr_index; + + if (shader->key.ps.prolog.color_two_side) { + /* BCOLORs are stored after the last input. */ + prolog_key.ps_prolog.num_interp_inputs = info->num_inputs; + prolog_key.ps_prolog.face_vgpr_index = shader->face_vgpr_index; + shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); + } + + for (i = 0; i < 2; i++) { /* two color inputs, each owning four bits of colors_read */ + unsigned location = info->input_interpolate_loc[color[i]]; + + if (!(info->colors_read & (0xf << i*4))) + continue; /* none of the four channels of this color is read */ + + prolog_key.ps_prolog.color_attr_index[i] = color[i]; + + /* Force per-sample interpolation for the colors here.
*/ + if (shader->key.ps.prolog.force_persample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + + switch (info->input_interpolate[color[i]]) { /* Sets color_interp_vgpr_index[i] and enables the matching weights: -1 for constant, 0, 2, 4 for perspective sample, center, centroid and 6, 8, 10 for linear sample, center, centroid. */ + case TGSI_INTERPOLATE_CONSTANT: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + case TGSI_INTERPOLATE_COLOR: + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_SAMPLE_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTER: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTER_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4; + shader->config.spi_ps_input_ena |= + S_0286CC_PERSP_CENTROID_ENA(1); + break; + default: + assert(0); + } + break; + case TGSI_INTERPOLATE_LINEAR: + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_SAMPLE_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTER: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTER_ENA(1); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10; + shader->config.spi_ps_input_ena |= + S_0286CC_LINEAR_CENTROID_ENA(1); + break; + default: + assert(0); + } + break; + default: + assert(0); + } + } + } + + /* The prolog is a no-op if these aren't set. */ + if (prolog_key.ps_prolog.colors_read || + prolog_key.ps_prolog.states.force_persample_interp || + prolog_key.ps_prolog.states.poly_stipple) { + shader->prolog = + si_get_shader_part(sscreen, &sscreen->ps_prologs, + &prolog_key, tm, debug, + si_compile_ps_prolog); + if (!shader->prolog) + return false; + } + + /* Get the epilog.
*/ /* Unlike the prolog, the epilog is requested unconditionally. */ + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.ps_epilog.colors_written = info->colors_written; + epilog_key.ps_epilog.writes_z = info->writes_z; + epilog_key.ps_epilog.writes_stencil = info->writes_stencil; + epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask; + epilog_key.ps_epilog.states = shader->key.ps.epilog; + + shader->epilog = + si_get_shader_part(sscreen, &sscreen->ps_epilogs, + &epilog_key, tm, debug, + si_compile_ps_epilog); + if (!shader->epilog) + return false; + + /* Enable POS_FIXED_PT if polygon stippling is enabled. */ + if (shader->key.ps.prolog.poly_stipple) { + shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); + assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); /* the bit being enabled must already be set in spi_ps_input_addr */ + } + + /* Set up the enable bits for per-sample shading if needed. */ + if (shader->key.ps.prolog.force_persample_interp) { /* center and centroid enables are swapped for the sample enable of the same kind */ + if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) { + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); + } + if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) { + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); + } + } + + /* POS_W_FLOAT requires that one of the perspective weights is enabled. */ + if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && + !(shader->config.spi_ps_input_ena & 0xf)) { /* NOTE(review): 0xf presumably covers the PERSP_* enable bits - confirm against the SPI_PS_INPUT_ENA layout */ + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* At least one pair of interpolation weights must be enabled.
*/ + if (!(shader->config.spi_ps_input_ena & 0x7f)) { /* NOTE(review): 0x7f presumably covers all PERSP_* and LINEAR_* enable bits - confirm against the SPI_PS_INPUT_ENA layout */ + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* The sample mask input is always enabled, because the API shader always + * passes it through to the epilog. Disable it here if it's unused. + */ + if (!shader->key.ps.epilog.poly_line_smoothing && + !shader->selector->info.reads_samplemask) + shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; + + return true; +} + int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, struct si_shader *shader, struct pipe_debug_callback *debug) { + struct si_shader *mainp = shader->selector->main_shader_part; int r; - /* Compile TGSI. */ - r = si_compile_tgsi_shader(sscreen, tm, shader, - sscreen->use_monolithic_shaders, debug); - if (r) - return r; + /* LS and ES are always compiled on demand. */ + if (!mainp || + (shader->selector->type == PIPE_SHADER_VERTEX && + (shader->key.vs.as_es || shader->key.vs.as_ls)) || + (shader->selector->type == PIPE_SHADER_TESS_EVAL && + shader->key.tes.as_es)) { + /* Monolithic shader (compiled as a whole, has many variants, + * may take a long time to compile). + */ + r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug); + if (r) + return r; + } else { + /* The shader consists of 2-3 parts: + * + * - the middle part is the user shader, it has 1 variant only + * and it was compiled during the creation of the shader + * selector + * - the prolog part is inserted at the beginning + * - the epilog part is inserted at the end + * + * The prolog and epilog have many (but simple) variants. + */ - if (!sscreen->use_monolithic_shaders) { + /* Copy the compiled TGSI shader data over.
*/ + shader->is_binary_shared = true; + shader->binary = mainp->binary; + shader->config = mainp->config; + shader->num_input_sgprs = mainp->num_input_sgprs; + shader->num_input_vgprs = mainp->num_input_vgprs; + shader->face_vgpr_index = mainp->face_vgpr_index; + memcpy(shader->vs_output_param_offset, + mainp->vs_output_param_offset, + sizeof(mainp->vs_output_param_offset)); + shader->uses_instanceid = mainp->uses_instanceid; + shader->nr_pos_exports = mainp->nr_pos_exports; + shader->nr_param_exports = mainp->nr_param_exports; + + /* Select prologs and/or epilogs. */ switch (shader->selector->type) { case PIPE_SHADER_VERTEX: if (!si_shader_select_vs_parts(sscreen, tm, shader, debug)) return -1; break; + case PIPE_SHADER_TESS_CTRL: + if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug)) + return -1; + break; case PIPE_SHADER_TESS_EVAL: if (!si_shader_select_tes_parts(sscreen, tm, shader, debug)) return -1; break; + case PIPE_SHADER_FRAGMENT: + if (!si_shader_select_ps_parts(sscreen, tm, shader, debug)) + return -1; + + /* Make sure we have at least as many VGPRs as there + * are allocated inputs. + */ + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, + shader->num_input_vgprs); + break; } /* Update SGPR and VGPR counts. */ @@ -5111,5 +5992,6 @@ void si_shader_destroy(struct si_shader *shader) r600_resource_reference(&shader->bo, NULL); - radeon_shader_binary_clean(&shader->binary); + if (!shader->is_binary_shared) + radeon_shader_binary_clean(&shader->binary); }