- for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
- returns[num_returns++] = ctx->i32; /* SGPRs */
- for (i = 0; i < 11; i++)
- returns[num_returns++] = ctx->f32; /* VGPRs */
- break;
-
- case SI_SHADER_MERGED_VERTEX_TESSCTRL:
- /* Merged stages have 8 system SGPRs at the beginning. */
- /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
- declare_per_stage_desc_pointers(ctx,
- ctx->type == PIPE_SHADER_TESS_CTRL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
-
- declare_global_desc_pointers(ctx);
- declare_per_stage_desc_pointers(ctx,
- ctx->type == PIPE_SHADER_VERTEX);
- declare_vs_specific_input_sgprs(ctx);
-
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
- declare_vb_descriptor_input_sgprs(ctx);
-
- /* VGPRs (first TCS, then VS) */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
-
- /* LS return values are inputs to the TCS main shader part. */
- for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
- returns[num_returns++] = ctx->i32; /* SGPRs */
- for (i = 0; i < 2; i++)
- returns[num_returns++] = ctx->f32; /* VGPRs */
- } else {
- /* TCS return values are inputs to the TCS epilog.
- *
- * param_tcs_offchip_offset, param_tcs_factor_offset,
- * param_tcs_offchip_layout, and param_rw_buffers
- * should be passed to the epilog.
- */
- for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
- returns[num_returns++] = ctx->i32; /* SGPRs */
- for (i = 0; i < 11; i++)
- returns[num_returns++] = ctx->f32; /* VGPRs */
- }
- break;
-
- case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
- /* Merged stages have 8 system SGPRs at the beginning. */
- /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
- declare_per_stage_desc_pointers(ctx,
- ctx->type == PIPE_SHADER_GEOMETRY);
-
- if (ctx->shader->key.as_ngg)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
- else
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
-
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
-
- declare_global_desc_pointers(ctx);
- if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
- declare_per_stage_desc_pointers(ctx,
- (ctx->type == PIPE_SHADER_VERTEX ||
- ctx->type == PIPE_SHADER_TESS_EVAL));
- }
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- if (vs_blit_property)
- declare_vs_blit_inputs(ctx, vs_blit_property);
- else
- declare_vs_specific_input_sgprs(ctx);
- } else {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
- /* Declare as many input SGPRs as the VS has. */
- }
-
- if (ctx->type == PIPE_SHADER_VERTEX)
- declare_vb_descriptor_input_sgprs(ctx);
-
- /* VGPRs (first GS, then VS/TES) */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
- } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
- declare_tes_input_vgprs(ctx);
- }
-
- if (ctx->shader->key.as_es &&
- (ctx->type == PIPE_SHADER_VERTEX ||
- ctx->type == PIPE_SHADER_TESS_EVAL)) {
- unsigned num_user_sgprs;
-
- if (ctx->type == PIPE_SHADER_VERTEX)
- num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
- else
- num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
-
- /* ES return values are inputs to GS. */
- for (i = 0; i < 8 + num_user_sgprs; i++)
- returns[num_returns++] = ctx->i32; /* SGPRs */
- for (i = 0; i < 5; i++)
- returns[num_returns++] = ctx->f32; /* VGPRs */
- }
- break;
-
- case PIPE_SHADER_TESS_EVAL:
- declare_global_desc_pointers(ctx);
- declare_per_stage_desc_pointers(ctx, true);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
-
- if (shader->key.as_es) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
- } else {
- declare_streamout_params(ctx, &shader->selector->so);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
- }
-
- /* VGPRs */
- declare_tes_input_vgprs(ctx);
- break;
-
- case PIPE_SHADER_GEOMETRY:
- declare_global_desc_pointers(ctx);
- declare_per_stage_desc_pointers(ctx, true);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);
-
- /* VGPRs */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
- break;
-
- case PIPE_SHADER_FRAGMENT:
- declare_global_desc_pointers(ctx);
- declare_per_stage_desc_pointers(ctx, true);
- add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL,
- SI_PARAM_ALPHA_REF);
- add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->args.prim_mask, SI_PARAM_PRIM_MASK);
-
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
- SI_PARAM_PERSP_SAMPLE);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
- &ctx->args.persp_center, SI_PARAM_PERSP_CENTER);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
- &ctx->args.persp_centroid, SI_PARAM_PERSP_CENTROID);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
- NULL, SI_PARAM_PERSP_PULL_MODEL);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
- &ctx->args.linear_sample, SI_PARAM_LINEAR_SAMPLE);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
- &ctx->args.linear_center, SI_PARAM_LINEAR_CENTER);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
- &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT,
- NULL, SI_PARAM_LINE_STIPPLE_TEX);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
- &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
- &ctx->args.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
- &ctx->args.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
- &ctx->args.frag_pos[3], SI_PARAM_POS_W_FLOAT);
- shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
- &ctx->args.front_face, SI_PARAM_FRONT_FACE);
- shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
- &ctx->args.ancillary, SI_PARAM_ANCILLARY);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
- &ctx->args.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
- add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
- &ctx->pos_fixed_pt, SI_PARAM_POS_FIXED_PT);
-
- /* Color inputs from the prolog. */
- if (shader->selector->info.colors_read) {
- unsigned num_color_elements =
- util_bitcount(shader->selector->info.colors_read);
-
- for (i = 0; i < num_color_elements; i++)
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
-
- num_prolog_vgprs += num_color_elements;
- }
-
- /* Outputs for the epilog. */
- num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
- num_returns =
- num_return_sgprs +
- util_bitcount(shader->selector->info.colors_written) * 4 +
- shader->selector->info.writes_z +
- shader->selector->info.writes_stencil +
- shader->selector->info.writes_samplemask +
- 1 /* SampleMaskIn */;
-
- num_returns = MAX2(num_returns,
- num_return_sgprs +
- PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
-
- for (i = 0; i < num_return_sgprs; i++)
- returns[i] = ctx->i32;
- for (; i < num_returns; i++)
- returns[i] = ctx->f32;
- break;
-
- case PIPE_SHADER_COMPUTE:
- declare_global_desc_pointers(ctx);
- declare_per_stage_desc_pointers(ctx, true);
- if (shader->selector->info.uses_grid_size)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT,
- &ctx->args.num_work_groups);
- if (shader->selector->info.uses_block_size &&
- shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
-
- unsigned cs_user_data_dwords =
- shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
- if (cs_user_data_dwords) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT,
- &ctx->cs_user_data);
- }
-
- /* Hardware SGPRs. */
- for (i = 0; i < 3; i++) {
- if (shader->selector->info.uses_block_id[i]) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->args.workgroup_ids[i]);
- }
- }
- if (shader->selector->info.uses_subgroup_info)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
-
- /* Hardware VGPRs. */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
- &ctx->args.local_invocation_ids);
- break;
- default:
- assert(0 && "unimplemented shader");
- return;
- }
-
- si_create_function(ctx, "main", returns, num_returns,
- si_get_max_workgroup_size(shader));
-
- /* Reserve register locations for VGPR inputs the PS prolog may need. */
- if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
- ac_llvm_add_target_dep_function_attr(ctx->main_fn,
- "InitialPSInputAddr",
- S_0286D0_PERSP_SAMPLE_ENA(1) |
- S_0286D0_PERSP_CENTER_ENA(1) |
- S_0286D0_PERSP_CENTROID_ENA(1) |
- S_0286D0_LINEAR_SAMPLE_ENA(1) |
- S_0286D0_LINEAR_CENTER_ENA(1) |
- S_0286D0_LINEAR_CENTROID_ENA(1) |
- S_0286D0_FRONT_FACE_ENA(1) |
- S_0286D0_ANCILLARY_ENA(1) |
- S_0286D0_POS_FIXED_PT_ENA(1));
- }
-
- shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
- shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
-
- assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
- shader->info.num_input_vgprs -= num_prolog_vgprs;
-
- if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) {
- if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
- /* The LSHS size is not known until draw time, so we append it
- * at the end of whatever LDS use there may be in the rest of
- * the shader (currently none, unless LLVM decides to do its
- * own LDS-based lowering).
- */
- ctx->ac.lds = LLVMAddGlobalInAddressSpace(
- ctx->ac.module, LLVMArrayType(ctx->i32, 0),
- "__lds_end", AC_ADDR_SPACE_LDS);
- LLVMSetAlignment(ctx->ac.lds, 256);
- } else {
- ac_declare_lds_as_pointer(&ctx->ac);
- }
- }
-
- /* Unlike radv, we override these arguments in the prolog, so to the
- * API shader they appear as normal arguments.
- */
- if (ctx->type == PIPE_SHADER_VERTEX) {
- ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
- ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
- } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
- ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
- ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
- }
-}
-
-/* Ensure that the esgs ring is declared.
- *
- * We declare it with 64KB alignment as a hint that the
- * pointer value will always be 0.
- */
-static void declare_esgs_ring(struct si_shader_context *ctx)
-{
- if (ctx->esgs_ring)
- return;
-
- assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
-
- ctx->esgs_ring = LLVMAddGlobalInAddressSpace(
- ctx->ac.module, LLVMArrayType(ctx->i32, 0),
- "esgs_ring",
- AC_ADDR_SPACE_LDS);
- LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
- LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
-}
-
-/**
- * Load ESGS and GSVS ring buffer resource descriptors and save the variables
- * for later use.
- */
-static void preload_ring_buffers(struct si_shader_context *ctx)
-{
- LLVMBuilderRef builder = ctx->ac.builder;
-
- LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-
- if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) {
- if (ctx->screen->info.chip_class <= GFX8) {
- unsigned ring =
- ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
- : SI_ES_RING_ESGS;
- LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
-
- ctx->esgs_ring =
- ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
- } else {
- if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
- /* Declare the ESGS ring as an explicit LDS symbol. */
- declare_esgs_ring(ctx);
- } else {
- ac_declare_lds_as_pointer(&ctx->ac);
- ctx->esgs_ring = ctx->ac.lds;
- }
- }
- }
-
- if (ctx->shader->is_gs_copy_shader) {
- LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
-
- ctx->gsvs_ring[0] =
- ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
- } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
- const struct si_shader_selector *sel = ctx->shader->selector;
- LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
- LLVMValueRef base_ring;
-
- base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-
- /* The conceptual layout of the GSVS ring is
- * v0c0 .. vLv0 v0c1 .. vLc1 ..
- * but the real memory layout is swizzled across
- * threads:
- * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
- * t16v0c0 ..
- * Override the buffer descriptor accordingly.
- */
- LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
- uint64_t stream_offset = 0;
-
- for (unsigned stream = 0; stream < 4; ++stream) {
- unsigned num_components;
- unsigned stride;
- unsigned num_records;
- LLVMValueRef ring, tmp;
-
- num_components = sel->info.num_stream_output_components[stream];
- if (!num_components)
- continue;
-
- stride = 4 * num_components * sel->gs_max_out_vertices;
-
- /* Limit on the stride field for <= GFX7. */
- assert(stride < (1 << 14));
-
- num_records = ctx->ac.wave_size;
-
- ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
- tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
- tmp = LLVMBuildAdd(builder, tmp,
- LLVMConstInt(ctx->i64,
- stream_offset, 0), "");
- stream_offset += stride * ctx->ac.wave_size;
-
- ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
- ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
- tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
- tmp = LLVMBuildOr(builder, tmp,
- LLVMConstInt(ctx->i32,
- S_008F04_STRIDE(stride) |
- S_008F04_SWIZZLE_ENABLE(1), 0), "");
- ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
- ring = LLVMBuildInsertElement(builder, ring,
- LLVMConstInt(ctx->i32, num_records, 0),
- LLVMConstInt(ctx->i32, 2, 0), "");
-
- uint32_t rsrc3 =
- S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
- S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
- S_008F0C_ADD_TID_ENABLE(1);
-
- if (ctx->ac.chip_class >= GFX10) {
- rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
- S_008F0C_RESOURCE_LEVEL(1);
- } else {
- rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
- S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
- }
-
- ring = LLVMBuildInsertElement(builder, ring,
- LLVMConstInt(ctx->i32, rsrc3, false),
- LLVMConstInt(ctx->i32, 3, 0), "");
-
- ctx->gsvs_ring[stream] = ring;
- }
- } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
- ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
- }
-}
-
-static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
- LLVMValueRef param_rw_buffers,
- struct ac_arg param_pos_fixed_pt)
-{
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef slot, desc, offset, row, bit, address[2];
-
- /* Use the fixed-point gl_FragCoord input.
- * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
- * per coordinate to get the repeating effect.
- */
- address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5);
- address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5);
-
- /* Load the buffer descriptor. */
- slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
- desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
-
- /* The stipple pattern is 32x32, each row has 32 bits. */
- offset = LLVMBuildMul(builder, address[1],
- LLVMConstInt(ctx->i32, 4, 0), "");
- row = buffer_load_const(ctx, desc, offset);
- row = ac_to_integer(&ctx->ac, row);
- bit = LLVMBuildLShr(builder, row, address[0], "");
- bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
- ac_build_kill_if_false(&ctx->ac, bit);
-}
-
-/* For the UMR disassembler. */
-#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
-#define DEBUGGER_NUM_MARKERS 5
-
-static bool si_shader_binary_open(struct si_screen *screen,
- struct si_shader *shader,
- struct ac_rtld_binary *rtld)
-{
- const struct si_shader_selector *sel = shader->selector;
- const char *part_elfs[5];
- size_t part_sizes[5];
- unsigned num_parts = 0;
-
-#define add_part(shader_or_part) \
- if (shader_or_part) { \
- part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
- part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
- num_parts++; \
- }
-
- add_part(shader->prolog);
- add_part(shader->previous_stage);
- add_part(shader->prolog2);
- add_part(shader);
- add_part(shader->epilog);
-
-#undef add_part
-
- struct ac_rtld_symbol lds_symbols[2];
- unsigned num_lds_symbols = 0;
-
- if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
- (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) {
- /* We add this symbol even on LLVM <= 8 to ensure that
- * shader->config.lds_size is set correctly below.
- */
- struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
- sym->name = "esgs_ring";
- sym->size = shader->gs_info.esgs_ring_size;
- sym->align = 64 * 1024;
- }
-
- if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) {
- struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
- sym->name = "ngg_emit";
- sym->size = shader->ngg.ngg_emit_size * 4;
- sym->align = 4;
- }
-
- bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){
- .info = &screen->info,
- .options = {
- .halt_at_entry = screen->options.halt_shaders,
- },
- .shader_type = tgsi_processor_to_shader_stage(sel->type),
- .wave_size = si_get_shader_wave_size(shader),
- .num_parts = num_parts,
- .elf_ptrs = part_elfs,
- .elf_sizes = part_sizes,
- .num_shared_lds_symbols = num_lds_symbols,
- .shared_lds_symbols = lds_symbols });
-
- if (rtld->lds_size > 0) {
- unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
- shader->config.lds_size =
- align(rtld->lds_size, alloc_granularity) / alloc_granularity;
- }
-
- return ok;
-}
-
-static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
-{
- struct ac_rtld_binary rtld;
- si_shader_binary_open(screen, shader, &rtld);
- return rtld.exec_size;
-}
-
-static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
-{
- uint64_t *scratch_va = data;
-
- if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
- *value = (uint32_t)*scratch_va;
- return true;
- }
- if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
- /* Enable scratch coalescing. */
- *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) |
- S_008F04_SWIZZLE_ENABLE(1);
- return true;
- }
-
- return false;
-}
-
-bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
- uint64_t scratch_va)
-{
- struct ac_rtld_binary binary;
- if (!si_shader_binary_open(sscreen, shader, &binary))
- return false;
-
- si_resource_reference(&shader->bo, NULL);
- shader->bo = si_aligned_buffer_create(&sscreen->b,
- sscreen->info.cpdma_prefetch_writes_memory ?
- 0 : SI_RESOURCE_FLAG_READ_ONLY,
- PIPE_USAGE_IMMUTABLE,
- align(binary.rx_size, SI_CPDMA_ALIGNMENT),
- 256);
- if (!shader->bo)
- return false;
-
- /* Upload. */
- struct ac_rtld_upload_info u = {};
- u.binary = &binary;
- u.get_external_symbol = si_get_external_symbol;
- u.cb_data = &scratch_va;
- u.rx_va = shader->bo->gpu_address;
- u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
- PIPE_TRANSFER_READ_WRITE |
- PIPE_TRANSFER_UNSYNCHRONIZED |
- RADEON_TRANSFER_TEMPORARY);
- if (!u.rx_ptr)
- return false;
-
- bool ok = ac_rtld_upload(&u);
-
- sscreen->ws->buffer_unmap(shader->bo->buf);
- ac_rtld_close(&binary);
-
- return ok;
-}
-
-static void si_shader_dump_disassembly(struct si_screen *screen,
- const struct si_shader_binary *binary,
- enum pipe_shader_type shader_type,
- unsigned wave_size,
- struct pipe_debug_callback *debug,
- const char *name, FILE *file)
-{
- struct ac_rtld_binary rtld_binary;
-
- if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
- .info = &screen->info,
- .shader_type = tgsi_processor_to_shader_stage(shader_type),
- .wave_size = wave_size,
- .num_parts = 1,
- .elf_ptrs = &binary->elf_buffer,
- .elf_sizes = &binary->elf_size }))
- return;
-
- const char *disasm;
- size_t nbytes;
-
- if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
- goto out;
-
- if (nbytes > INT_MAX)
- goto out;
-
- if (debug && debug->debug_message) {
- /* Very long debug messages are cut off, so send the
- * disassembly one line at a time. This causes more
- * overhead, but on the plus side it simplifies
- * parsing of resulting logs.
- */
- pipe_debug_message(debug, SHADER_INFO,
- "Shader Disassembly Begin");
-
- uint64_t line = 0;
- while (line < nbytes) {
- int count = nbytes - line;
- const char *nl = memchr(disasm + line, '\n', nbytes - line);
- if (nl)
- count = nl - (disasm + line);
-
- if (count) {
- pipe_debug_message(debug, SHADER_INFO,
- "%.*s", count, disasm + line);
- }
-
- line += count + 1;
- }
-
- pipe_debug_message(debug, SHADER_INFO,
- "Shader Disassembly End");
- }
-
- if (file) {
- fprintf(file, "Shader %s disassembly:\n", name);
- fprintf(file, "%*s", (int)nbytes, disasm);
- }
-
-out:
- ac_rtld_close(&rtld_binary);
-}
-
-static void si_calculate_max_simd_waves(struct si_shader *shader)
-{
- struct si_screen *sscreen = shader->selector->screen;
- struct ac_shader_config *conf = &shader->config;
- unsigned num_inputs = shader->selector->info.num_inputs;
- unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
- unsigned lds_per_wave = 0;
- unsigned max_simd_waves;
-
- max_simd_waves = sscreen->info.max_wave64_per_simd;
-
- /* Compute LDS usage for PS. */
- switch (shader->selector->type) {
- case PIPE_SHADER_FRAGMENT:
- /* The minimum usage per wave is (num_inputs * 48). The maximum
- * usage is (num_inputs * 48 * 16).
- * We can get anything in between and it varies between waves.
- *
- * The 48 bytes per input for a single primitive is equal to
- * 4 bytes/component * 4 components/input * 3 points.
- *
- * Other stages don't know the size at compile time or don't
- * allocate LDS per wave, but instead they do it per thread group.
- */
- lds_per_wave = conf->lds_size * lds_increment +
- align(num_inputs * 48, lds_increment);
- break;
- case PIPE_SHADER_COMPUTE:
- if (shader->selector) {
- unsigned max_workgroup_size =
- si_get_max_workgroup_size(shader);
- lds_per_wave = (conf->lds_size * lds_increment) /
- DIV_ROUND_UP(max_workgroup_size,
- sscreen->compute_wave_size);
- }
- break;
- default:;
- }
-
- /* Compute the per-SIMD wave counts. */
- if (conf->num_sgprs) {
- max_simd_waves =
- MIN2(max_simd_waves,
- sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
- }
-
- if (conf->num_vgprs) {
- /* Always print wave limits as Wave64, so that we can compare
- * Wave32 and Wave64 with shader-db fairly. */
- unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
- max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
- }
-
- /* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above
- * 16KB makes some SIMDs unoccupied).
- *
- * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
- */
- unsigned max_lds_size = sscreen->info.chip_class >= GFX10 ? 128*1024 : 64*1024;
- unsigned max_lds_per_simd = max_lds_size / 4;
- if (lds_per_wave)
- max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
-
- shader->info.max_simd_waves = max_simd_waves;
-}
-
-void si_shader_dump_stats_for_shader_db(struct si_screen *screen,
- struct si_shader *shader,
- struct pipe_debug_callback *debug)
-{
- const struct ac_shader_config *conf = &shader->config;
-
- if (screen->options.debug_disassembly)
- si_shader_dump_disassembly(screen, &shader->binary,
- shader->selector->type,
- si_get_shader_wave_size(shader),
- debug, "main", NULL);
-
- pipe_debug_message(debug, SHADER_INFO,
- "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
- "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
- "Spilled VGPRs: %d PrivMem VGPRs: %d",
- conf->num_sgprs, conf->num_vgprs,
- si_get_shader_binary_size(screen, shader),
- conf->lds_size, conf->scratch_bytes_per_wave,
- shader->info.max_simd_waves, conf->spilled_sgprs,
- conf->spilled_vgprs, shader->info.private_mem_vgprs);
-}
-
-static void si_shader_dump_stats(struct si_screen *sscreen,
- struct si_shader *shader,
- FILE *file,
- bool check_debug_option)
-{
- const struct ac_shader_config *conf = &shader->config;
-
- if (!check_debug_option ||
- si_can_dump_shader(sscreen, shader->selector->type)) {
- if (shader->selector->type == PIPE_SHADER_FRAGMENT) {
- fprintf(file, "*** SHADER CONFIG ***\n"
- "SPI_PS_INPUT_ADDR = 0x%04x\n"
- "SPI_PS_INPUT_ENA = 0x%04x\n",
- conf->spi_ps_input_addr, conf->spi_ps_input_ena);
- }
-
- fprintf(file, "*** SHADER STATS ***\n"
- "SGPRS: %d\n"
- "VGPRS: %d\n"
- "Spilled SGPRs: %d\n"
- "Spilled VGPRs: %d\n"
- "Private memory VGPRs: %d\n"
- "Code Size: %d bytes\n"
- "LDS: %d blocks\n"
- "Scratch: %d bytes per wave\n"
- "Max Waves: %d\n"
- "********************\n\n\n",
- conf->num_sgprs, conf->num_vgprs,
- conf->spilled_sgprs, conf->spilled_vgprs,
- shader->info.private_mem_vgprs,
- si_get_shader_binary_size(sscreen, shader),
- conf->lds_size, conf->scratch_bytes_per_wave,
- shader->info.max_simd_waves);
- }
-}
-
-const char *si_get_shader_name(const struct si_shader *shader)
-{
- switch (shader->selector->type) {
- case PIPE_SHADER_VERTEX:
- if (shader->key.as_es)
- return "Vertex Shader as ES";
- else if (shader->key.as_ls)
- return "Vertex Shader as LS";
- else if (shader->key.opt.vs_as_prim_discard_cs)
- return "Vertex Shader as Primitive Discard CS";
- else if (shader->key.as_ngg)
- return "Vertex Shader as ESGS";
- else
- return "Vertex Shader as VS";
- case PIPE_SHADER_TESS_CTRL:
- return "Tessellation Control Shader";
- case PIPE_SHADER_TESS_EVAL:
- if (shader->key.as_es)
- return "Tessellation Evaluation Shader as ES";
- else if (shader->key.as_ngg)
- return "Tessellation Evaluation Shader as ESGS";
- else
- return "Tessellation Evaluation Shader as VS";
- case PIPE_SHADER_GEOMETRY:
- if (shader->is_gs_copy_shader)
- return "GS Copy Shader as VS";
- else
- return "Geometry Shader";
- case PIPE_SHADER_FRAGMENT:
- return "Pixel Shader";
- case PIPE_SHADER_COMPUTE:
- return "Compute Shader";
- default:
- return "Unknown Shader";
- }
-}
-
-void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
- struct pipe_debug_callback *debug,
- FILE *file, bool check_debug_option)
-{
- enum pipe_shader_type shader_type = shader->selector->type;
-
- if (!check_debug_option ||
- si_can_dump_shader(sscreen, shader_type))
- si_dump_shader_key(shader, file);
-
- if (!check_debug_option && shader->binary.llvm_ir_string) {
- if (shader->previous_stage &&
- shader->previous_stage->binary.llvm_ir_string) {
- fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
- si_get_shader_name(shader));
- fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
- }
-
- fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
- si_get_shader_name(shader));
- fprintf(file, "%s\n", shader->binary.llvm_ir_string);
- }
-
- if (!check_debug_option ||
- (si_can_dump_shader(sscreen, shader_type) &&
- !(sscreen->debug_flags & DBG(NO_ASM)))) {
- unsigned wave_size = si_get_shader_wave_size(shader);
-
- fprintf(file, "\n%s:\n", si_get_shader_name(shader));
-
- if (shader->prolog)
- si_shader_dump_disassembly(sscreen, &shader->prolog->binary,
- shader_type, wave_size, debug, "prolog", file);
- if (shader->previous_stage)
- si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary,
- shader_type, wave_size, debug, "previous stage", file);
- if (shader->prolog2)
- si_shader_dump_disassembly(sscreen, &shader->prolog2->binary,
- shader_type, wave_size, debug, "prolog2", file);
-
- si_shader_dump_disassembly(sscreen, &shader->binary, shader_type,
- wave_size, debug, "main", file);
-
- if (shader->epilog)
- si_shader_dump_disassembly(sscreen, &shader->epilog->binary,
- shader_type, wave_size, debug, "epilog", file);
- fprintf(file, "\n");
- }
-
- si_shader_dump_stats(sscreen, shader, file, check_debug_option);
-}
-
-static int si_compile_llvm(struct si_screen *sscreen,
- struct si_shader_binary *binary,
- struct ac_shader_config *conf,
- struct ac_llvm_compiler *compiler,
- LLVMModuleRef mod,
- struct pipe_debug_callback *debug,
- enum pipe_shader_type shader_type,
- unsigned wave_size,
- const char *name,
- bool less_optimized)
-{
- unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
-
- if (si_can_dump_shader(sscreen, shader_type)) {
- fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
-
- if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
- fprintf(stderr, "%s LLVM IR:\n\n", name);
- ac_dump_module(mod);
- fprintf(stderr, "\n");
- }
- }
-
- if (sscreen->record_llvm_ir) {
- char *ir = LLVMPrintModuleToString(mod);
- binary->llvm_ir_string = strdup(ir);
- LLVMDisposeMessage(ir);
- }
-
- if (!si_replace_shader(count, binary)) {
- unsigned r = si_llvm_compile(mod, binary, compiler, debug,
- less_optimized, wave_size);
- if (r)
- return r;
- }
-
- struct ac_rtld_binary rtld;
- if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
- .info = &sscreen->info,
- .shader_type = tgsi_processor_to_shader_stage(shader_type),
- .wave_size = wave_size,
- .num_parts = 1,
- .elf_ptrs = &binary->elf_buffer,
- .elf_sizes = &binary->elf_size }))
- return -1;
-
- bool ok = ac_rtld_read_config(&rtld, conf);
- ac_rtld_close(&rtld);
- if (!ok)
- return -1;
-
- /* Enable 64-bit and 16-bit denormals, because there is no performance
- * cost.
- *
- * If denormals are enabled, all floating-point output modifiers are
- * ignored.
- *
- * Don't enable denormals for 32-bit floats, because:
- * - Floating-point output modifiers would be ignored by the hw.
- * - Some opcodes don't support denormals, such as v_mad_f32. We would
- * have to stop using those.
- * - GFX6 & GFX7 would be very slow.
- */
- conf->float_mode |= V_00B028_FP_64_DENORMS;
-
- return 0;
-}
-
-static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
-{
- if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
- LLVMBuildRetVoid(ctx->ac.builder);
- else
- LLVMBuildRet(ctx->ac.builder, ret);
-}
-
-/* Generate code for the hardware VS shader stage to go with a geometry shader */
-struct si_shader *
-si_generate_gs_copy_shader(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader_selector *gs_selector,
- struct pipe_debug_callback *debug)
-{
- struct si_shader_context ctx;
- struct si_shader *shader;
- LLVMBuilderRef builder;
- struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
- struct tgsi_shader_info *gsinfo = &gs_selector->info;
- int i;
-
-
- shader = CALLOC_STRUCT(si_shader);
- if (!shader)
- return NULL;
-
- /* We can leave the fence as permanently signaled because the GS copy
- * shader only becomes visible globally after it has been compiled. */
- util_queue_fence_init(&shader->ready);
-
- shader->selector = gs_selector;
- shader->is_gs_copy_shader = true;
-
- si_llvm_context_init(&ctx, sscreen, compiler,
- si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false),
- 64);
- ctx.shader = shader;
- ctx.type = PIPE_SHADER_VERTEX;
-
- builder = ctx.ac.builder;
-
- create_function(&ctx);
- preload_ring_buffers(&ctx);
-
- LLVMValueRef voffset =
- LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
- LLVMConstInt(ctx.i32, 4, 0), "");
-
- /* Fetch the vertex stream ID.*/
- LLVMValueRef stream_id;
-
- if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
- stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
- else
- stream_id = ctx.i32_0;
-
- /* Fill in output information. */
- for (i = 0; i < gsinfo->num_outputs; ++i) {
- outputs[i].semantic_name = gsinfo->output_semantic_name[i];
- outputs[i].semantic_index = gsinfo->output_semantic_index[i];
-
- for (int chan = 0; chan < 4; chan++) {
- outputs[i].vertex_stream[chan] =
- (gsinfo->output_streams[i] >> (2 * chan)) & 3;
- }
- }
-
- LLVMBasicBlockRef end_bb;
- LLVMValueRef switch_inst;
-
- end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
- switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
-
- for (int stream = 0; stream < 4; stream++) {
- LLVMBasicBlockRef bb;
- unsigned offset;
-
- if (!gsinfo->num_stream_output_components[stream])
- continue;
-
- if (stream > 0 && !gs_selector->so.num_outputs)
- continue;
-
- bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
- LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
- LLVMPositionBuilderAtEnd(builder, bb);
-
- /* Fetch vertex data from GSVS ring */
- offset = 0;
- for (i = 0; i < gsinfo->num_outputs; ++i) {
- for (unsigned chan = 0; chan < 4; chan++) {
- if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
- outputs[i].vertex_stream[chan] != stream) {
- outputs[i].values[chan] = LLVMGetUndef(ctx.f32);
- continue;
- }
-
- LLVMValueRef soffset = LLVMConstInt(ctx.i32,
- offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
- offset++;
-
- outputs[i].values[chan] =
- ac_build_buffer_load(&ctx.ac,
- ctx.gsvs_ring[0], 1,
- ctx.i32_0, voffset,
- soffset, 0, ac_glc | ac_slc,
- true, false);
- }
- }
-
- /* Streamout and exports. */
- if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
- si_llvm_emit_streamout(&ctx, outputs,
- gsinfo->num_outputs,
- stream);
- }
-
- if (stream == 0)
- si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
-
- LLVMBuildBr(builder, end_bb);
- }
-
- LLVMPositionBuilderAtEnd(builder, end_bb);
-
- LLVMBuildRetVoid(ctx.ac.builder);
-
- ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
- si_llvm_optimize_module(&ctx);
-
- bool ok = false;
- if (si_compile_llvm(sscreen, &ctx.shader->binary,
- &ctx.shader->config, ctx.compiler,
- ctx.ac.module,
- debug, PIPE_SHADER_GEOMETRY, ctx.ac.wave_size,
- "GS Copy Shader", false) == 0) {
- if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
- fprintf(stderr, "GS Copy Shader:\n");
- si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
-
- if (!ctx.shader->config.scratch_bytes_per_wave)
- ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
- else
- ok = true;
- }
-
- si_llvm_dispose(&ctx);
-
- if (!ok) {
- FREE(shader);
- shader = NULL;
- } else {
- si_fix_resource_usage(sscreen, shader);
- }
- return shader;
-}
-
-static void si_dump_shader_key_vs(const struct si_shader_key *key,
- const struct si_vs_prolog_bits *prolog,
- const char *prefix, FILE *f)
-{
- fprintf(f, " %s.instance_divisor_is_one = %u\n",
- prefix, prolog->instance_divisor_is_one);
- fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
- prefix, prolog->instance_divisor_is_fetched);
- fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n",
- prefix, prolog->unpack_instance_id_from_vertex_id);
- fprintf(f, " %s.ls_vgpr_fix = %u\n",
- prefix, prolog->ls_vgpr_fix);
-
- fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
- fprintf(f, " mono.vs.fix_fetch = {");
- for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
- union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
- if (i)
- fprintf(f, ", ");
- if (!fix.bits)
- fprintf(f, "0");
- else
- fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
- fix.u.num_channels_m1, fix.u.format);
- }
- fprintf(f, "}\n");
-}
-
-static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
-{
- const struct si_shader_key *key = &shader->key;
- enum pipe_shader_type shader_type = shader->selector->type;
-
- fprintf(f, "SHADER KEY\n");
-
- switch (shader_type) {
- case PIPE_SHADER_VERTEX:
- si_dump_shader_key_vs(key, &key->part.vs.prolog,
- "part.vs.prolog", f);
- fprintf(f, " as_es = %u\n", key->as_es);
- fprintf(f, " as_ls = %u\n", key->as_ls);
- fprintf(f, " as_ngg = %u\n", key->as_ngg);
- fprintf(f, " mono.u.vs_export_prim_id = %u\n",
- key->mono.u.vs_export_prim_id);
- fprintf(f, " opt.vs_as_prim_discard_cs = %u\n",
- key->opt.vs_as_prim_discard_cs);
- fprintf(f, " opt.cs_prim_type = %s\n",
- tgsi_primitive_names[key->opt.cs_prim_type]);
- fprintf(f, " opt.cs_indexed = %u\n",
- key->opt.cs_indexed);
- fprintf(f, " opt.cs_instancing = %u\n",
- key->opt.cs_instancing);
- fprintf(f, " opt.cs_primitive_restart = %u\n",
- key->opt.cs_primitive_restart);
- fprintf(f, " opt.cs_provoking_vertex_first = %u\n",
- key->opt.cs_provoking_vertex_first);
- fprintf(f, " opt.cs_need_correct_orientation = %u\n",
- key->opt.cs_need_correct_orientation);
- fprintf(f, " opt.cs_cull_front = %u\n",
- key->opt.cs_cull_front);
- fprintf(f, " opt.cs_cull_back = %u\n",
- key->opt.cs_cull_back);
- fprintf(f, " opt.cs_cull_z = %u\n",
- key->opt.cs_cull_z);
- fprintf(f, " opt.cs_halfz_clip_space = %u\n",
- key->opt.cs_halfz_clip_space);
- break;
-
- case PIPE_SHADER_TESS_CTRL:
- if (shader->selector->screen->info.chip_class >= GFX9) {
- si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
- "part.tcs.ls_prolog", f);
- }
- fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
- fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
- break;
-
- case PIPE_SHADER_TESS_EVAL:
- fprintf(f, " as_es = %u\n", key->as_es);
- fprintf(f, " as_ngg = %u\n", key->as_ngg);
- fprintf(f, " mono.u.vs_export_prim_id = %u\n",
- key->mono.u.vs_export_prim_id);
- break;
-
- case PIPE_SHADER_GEOMETRY:
- if (shader->is_gs_copy_shader)
- break;
-
- if (shader->selector->screen->info.chip_class >= GFX9 &&
- key->part.gs.es->type == PIPE_SHADER_VERTEX) {
- si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
- "part.gs.vs_prolog", f);
- }
- fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
- fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
- fprintf(f, " as_ngg = %u\n", key->as_ngg);
- break;
-
- case PIPE_SHADER_COMPUTE:
- break;
-
- case PIPE_SHADER_FRAGMENT:
- fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
- fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
- fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
- fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
- fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
- fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
- fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
- fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
- fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
- fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter);
- fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
- fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
- fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
- fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
- fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
- fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
- fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
- fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
- fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center);
- fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
- fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
- fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
- break;
-
- default:
- assert(0);
- }
-
- if ((shader_type == PIPE_SHADER_GEOMETRY ||
- shader_type == PIPE_SHADER_TESS_EVAL ||
- shader_type == PIPE_SHADER_VERTEX) &&
- !key->as_es && !key->as_ls) {
- fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
- fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
- }
-}
-
-static void si_optimize_vs_outputs(struct si_shader_context *ctx)
-{
- struct si_shader *shader = ctx->shader;
- struct tgsi_shader_info *info = &shader->selector->info;
-
- if ((ctx->type != PIPE_SHADER_VERTEX &&
- ctx->type != PIPE_SHADER_TESS_EVAL) ||
- shader->key.as_ls ||
- shader->key.as_es)
- return;
-
- ac_optimize_vs_outputs(&ctx->ac,
- ctx->main_fn,
- shader->info.vs_output_param_offset,
- info->num_outputs,
- &shader->info.nr_param_exports);
-}
-
-static void si_init_exec_from_input(struct si_shader_context *ctx,
- struct ac_arg param, unsigned bitoffset)
-{
- LLVMValueRef args[] = {
- ac_get_arg(&ctx->ac, param),
- LLVMConstInt(ctx->i32, bitoffset, 0),
- };
- ac_build_intrinsic(&ctx->ac,
- "llvm.amdgcn.init.exec.from.input",
- ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
-}
-
-static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
- const struct si_vs_prolog_bits *key)
-{
- /* VGPR initialization fixup for Vega10 and Raven is always done in the
- * VS prolog. */
- return sel->vs_needs_prolog ||
- key->ls_vgpr_fix ||
- key->unpack_instance_id_from_vertex_id;
-}
-
-LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
-{
- /* Return true if the current thread should execute an ES thread. */
- return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
- ac_get_thread_id(&ctx->ac),
- si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
-}
-
-LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
-{
- /* Return true if the current thread should execute a GS thread. */
- return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
- ac_get_thread_id(&ctx->ac),
- si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
-}
-
-static void si_llvm_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible)
-{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- LLVMBuilderRef builder = ctx->ac.builder;
-
- if (ctx->shader->selector->force_correct_derivs_after_kill) {
- /* Kill immediately while maintaining WQM. */
- ac_build_kill_if_false(&ctx->ac,
- ac_build_wqm_vote(&ctx->ac, visible));
-
- LLVMValueRef mask = LLVMBuildLoad(builder, ctx->postponed_kill, "");
- mask = LLVMBuildAnd(builder, mask, visible, "");
- LLVMBuildStore(builder, mask, ctx->postponed_kill);
- return;
- }
-
- ac_build_kill_if_false(&ctx->ac, visible);
-}
-
-static bool si_compile_tgsi_main(struct si_shader_context *ctx,
- struct nir_shader *nir, bool free_nir)
-{
- struct si_shader *shader = ctx->shader;
- struct si_shader_selector *sel = shader->selector;
-
- // TODO clean all this up!
- switch (ctx->type) {
- case PIPE_SHADER_VERTEX:
- if (shader->key.as_ls)
- ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
- else if (shader->key.as_es)
- ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
- else if (shader->key.opt.vs_as_prim_discard_cs)
- ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
- else if (shader->key.as_ngg)
- ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
- else
- ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
- ctx->abi.load_base_vertex = get_base_vertex;
- break;
- case PIPE_SHADER_TESS_CTRL:
- ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
- ctx->abi.load_tess_level = si_load_tess_level;
- ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
- ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
- ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
- break;
- case PIPE_SHADER_TESS_EVAL:
- ctx->abi.load_tess_varyings = si_nir_load_input_tes;
- ctx->abi.load_tess_coord = si_load_tess_coord;
- ctx->abi.load_tess_level = si_load_tess_level;
- ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
- if (shader->key.as_es)
- ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
- else if (shader->key.as_ngg)
- ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
- else
- ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
- break;
- case PIPE_SHADER_GEOMETRY:
- ctx->abi.load_inputs = si_nir_load_input_gs;
- ctx->abi.emit_vertex = si_llvm_emit_vertex;
- ctx->abi.emit_primitive = si_llvm_emit_primitive;
- ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
- break;
- case PIPE_SHADER_FRAGMENT:
- ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
- ctx->abi.load_sample_position = load_sample_position;
- ctx->abi.load_sample_mask_in = load_sample_mask_in;
- ctx->abi.emit_fbfetch = si_nir_emit_fbfetch;
- ctx->abi.emit_kill = si_llvm_emit_kill;
- break;
- case PIPE_SHADER_COMPUTE:
- ctx->abi.load_local_group_size = get_block_size;
- break;
- default:
- assert(!"Unsupported shader type");
- return false;
- }
-
- ctx->abi.load_ubo = load_ubo;
- ctx->abi.load_ssbo = load_ssbo;
-
- create_function(ctx);
- preload_ring_buffers(ctx);
-
- if (ctx->type == PIPE_SHADER_TESS_CTRL &&
- sel->tcs_info.tessfactors_are_def_in_all_invocs) {
- for (unsigned i = 0; i < 6; i++) {
- ctx->invoc0_tess_factors[i] =
- ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
- }
- }
-
- if (ctx->type == PIPE_SHADER_GEOMETRY) {
- for (unsigned i = 0; i < 4; i++) {
- ctx->gs_next_vertex[i] =
- ac_build_alloca(&ctx->ac, ctx->i32, "");
- }
- if (shader->key.as_ngg) {
- for (unsigned i = 0; i < 4; ++i) {
- ctx->gs_curprim_verts[i] =
- ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
- ctx->gs_generated_prims[i] =
- ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
- }
-
- unsigned scratch_size = 8;
- if (sel->so.num_outputs)
- scratch_size = 44;
-
- LLVMTypeRef ai32 = LLVMArrayType(ctx->i32, scratch_size);
- ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
- ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
- LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
- LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
-
- ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
- LLVMArrayType(ctx->i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
- LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
- LLVMSetAlignment(ctx->gs_ngg_emit, 4);
- }
- }
-
- if (ctx->type != PIPE_SHADER_GEOMETRY &&
- (shader->key.as_ngg && !shader->key.as_es)) {
- /* Unconditionally declare scratch space base for streamout and
- * vertex compaction. Whether space is actually allocated is
- * determined during linking / PM4 creation.
- *
- * Add an extra dword per vertex to ensure an odd stride, which
- * avoids bank conflicts for SoA accesses.
- */
- if (!gfx10_is_ngg_passthrough(shader))
- declare_esgs_ring(ctx);
-
- /* This is really only needed when streamout and / or vertex
- * compaction is enabled.
- */
- if (sel->so.num_outputs && !ctx->gs_ngg_scratch) {
- LLVMTypeRef asi32 = LLVMArrayType(ctx->i32, 8);
- ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
- asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
- LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
- LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
- }
- }
-
- /* For GFX9 merged shaders:
- * - Set EXEC for the first shader. If the prolog is present, set
- * EXEC there instead.
- * - Add a barrier before the second shader.
- * - In the second shader, reset EXEC to ~0 and wrap the main part in
- * an if-statement. This is required for correctness in geometry
- * shaders, to ensure that empty GS waves do not send GS_EMIT and
- * GS_CUT messages.
- *
- * For monolithic merged shaders, the first shader is wrapped in an
- * if-block together with its prolog in si_build_wrapper_function.
- *
- * NGG vertex and tess eval shaders running as the last
- * vertex/geometry stage handle execution explicitly using
- * if-statements.
- */
- if (ctx->screen->info.chip_class >= GFX9) {
- if (!shader->is_monolithic &&
- sel->info.num_instructions > 1 && /* not empty shader */
- (shader->key.as_es || shader->key.as_ls) &&
- (ctx->type == PIPE_SHADER_TESS_EVAL ||
- (ctx->type == PIPE_SHADER_VERTEX &&
- !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
- si_init_exec_from_input(ctx,
- ctx->merged_wave_info, 0);
- } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
- ctx->type == PIPE_SHADER_GEOMETRY ||
- (shader->key.as_ngg && !shader->key.as_es)) {
- LLVMValueRef thread_enabled;
- bool nested_barrier;
-
- if (!shader->is_monolithic ||
- (ctx->type == PIPE_SHADER_TESS_EVAL &&
- (shader->key.as_ngg && !shader->key.as_es)))
- ac_init_exec_full_mask(&ctx->ac);
-
- if (ctx->type == PIPE_SHADER_TESS_CTRL ||
- ctx->type == PIPE_SHADER_GEOMETRY) {
- if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
- gfx10_ngg_gs_emit_prologue(ctx);
- nested_barrier = false;
- } else {
- nested_barrier = true;
- }
-
- thread_enabled = si_is_gs_thread(ctx);
- } else {
- thread_enabled = si_is_es_thread(ctx);
- nested_barrier = false;
- }
-
- ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
- ctx->merged_wrap_if_label = 11500;
- ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
-
- if (nested_barrier) {
- /* Execute a barrier before the second shader in
- * a merged shader.
- *
- * Execute the barrier inside the conditional block,
- * so that empty waves can jump directly to s_endpgm,
- * which will also signal the barrier.
- *
- * This is possible in gfx9, because an empty wave
- * for the second shader does not participate in
- * the epilogue. With NGG, empty waves may still
- * be required to export data (e.g. GS output vertices),
- * so we cannot let them exit early.
- *
- * If the shader is TCS and the TCS epilog is present
- * and contains a barrier, it will wait there and then
- * reach s_endpgm.
- */
- si_llvm_emit_barrier(ctx);
- }
- }
- }
-
- if (sel->force_correct_derivs_after_kill) {
- ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
- /* true = don't kill. */
- LLVMBuildStore(ctx->ac.builder, ctx->i1true,
- ctx->postponed_kill);
- }
-
- bool success = si_nir_build_llvm(ctx, nir);
- if (free_nir)
- ralloc_free(nir);
- if (!success) {
- fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
- return false;
- }
-
- si_llvm_build_ret(ctx, ctx->return_value);
- return true;
-}
-
-/**
- * Compute the VS prolog key, which contains all the information needed to
- * build the VS prolog function, and set shader->info bits where needed.
- *
- * \param info Shader info of the vertex shader.
- * \param num_input_sgprs Number of input SGPRs for the vertex shader.
- * \param prolog_key Key of the VS prolog
- * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
- * \param key Output shader part key.
- */
-static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
- unsigned num_input_sgprs,
- const struct si_vs_prolog_bits *prolog_key,
- struct si_shader *shader_out,
- union si_shader_part_key *key)
-{
- memset(key, 0, sizeof(*key));
- key->vs_prolog.states = *prolog_key;
- key->vs_prolog.num_input_sgprs = num_input_sgprs;
- key->vs_prolog.num_inputs = info->num_inputs;
- key->vs_prolog.as_ls = shader_out->key.as_ls;
- key->vs_prolog.as_es = shader_out->key.as_es;
- key->vs_prolog.as_ngg = shader_out->key.as_ngg;
-
- if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
- key->vs_prolog.as_ls = 1;
- key->vs_prolog.num_merged_next_stage_vgprs = 2;
- } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
- key->vs_prolog.as_es = 1;
- key->vs_prolog.num_merged_next_stage_vgprs = 5;
- } else if (shader_out->key.as_ngg) {
- key->vs_prolog.num_merged_next_stage_vgprs = 5;
- }
-
- /* Enable loading the InstanceID VGPR. */
- uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
-
- if ((key->vs_prolog.states.instance_divisor_is_one |
- key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
- shader_out->info.uses_instanceid = true;
-}
-
-/**
- * Compute the PS prolog key, which contains all the information needed to
- * build the PS prolog function, and set related bits in shader->config.
- */
-static void si_get_ps_prolog_key(struct si_shader *shader,
- union si_shader_part_key *key,
- bool separate_prolog)
-{
- struct tgsi_shader_info *info = &shader->selector->info;
-
- memset(key, 0, sizeof(*key));
- key->ps_prolog.states = shader->key.part.ps.prolog;
- key->ps_prolog.colors_read = info->colors_read;
- key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
- key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
- key->ps_prolog.wqm = info->uses_derivatives &&
- (key->ps_prolog.colors_read ||
- key->ps_prolog.states.force_persp_sample_interp ||
- key->ps_prolog.states.force_linear_sample_interp ||
- key->ps_prolog.states.force_persp_center_interp ||
- key->ps_prolog.states.force_linear_center_interp ||
- key->ps_prolog.states.bc_optimize_for_persp ||
- key->ps_prolog.states.bc_optimize_for_linear);
- key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
-
- if (info->colors_read) {
- unsigned *color = shader->selector->color_attr_index;
-
- if (shader->key.part.ps.prolog.color_two_side) {
- /* BCOLORs are stored after the last input. */
- key->ps_prolog.num_interp_inputs = info->num_inputs;
- key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
- if (separate_prolog)
- shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
- }
-
- for (unsigned i = 0; i < 2; i++) {
- unsigned interp = info->input_interpolate[color[i]];
- unsigned location = info->input_interpolate_loc[color[i]];
-
- if (!(info->colors_read & (0xf << i*4)))
- continue;
-
- key->ps_prolog.color_attr_index[i] = color[i];
-
- if (shader->key.part.ps.prolog.flatshade_colors &&
- interp == TGSI_INTERPOLATE_COLOR)
- interp = TGSI_INTERPOLATE_CONSTANT;
-
- switch (interp) {
- case TGSI_INTERPOLATE_CONSTANT:
- key->ps_prolog.color_interp_vgpr_index[i] = -1;
- break;
- case TGSI_INTERPOLATE_PERSPECTIVE:
- case TGSI_INTERPOLATE_COLOR:
- /* Force the interpolation location for colors here. */
- if (shader->key.part.ps.prolog.force_persp_sample_interp)
- location = TGSI_INTERPOLATE_LOC_SAMPLE;
- if (shader->key.part.ps.prolog.force_persp_center_interp)
- location = TGSI_INTERPOLATE_LOC_CENTER;
-
- switch (location) {
- case TGSI_INTERPOLATE_LOC_SAMPLE:
- key->ps_prolog.color_interp_vgpr_index[i] = 0;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_PERSP_SAMPLE_ENA(1);
- }
- break;
- case TGSI_INTERPOLATE_LOC_CENTER:
- key->ps_prolog.color_interp_vgpr_index[i] = 2;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_PERSP_CENTER_ENA(1);
- }
- break;
- case TGSI_INTERPOLATE_LOC_CENTROID:
- key->ps_prolog.color_interp_vgpr_index[i] = 4;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_PERSP_CENTROID_ENA(1);
- }
- break;
- default:
- assert(0);
- }
- break;
- case TGSI_INTERPOLATE_LINEAR:
- /* Force the interpolation location for colors here. */
- if (shader->key.part.ps.prolog.force_linear_sample_interp)
- location = TGSI_INTERPOLATE_LOC_SAMPLE;
- if (shader->key.part.ps.prolog.force_linear_center_interp)
- location = TGSI_INTERPOLATE_LOC_CENTER;
-
- /* The VGPR assignment for non-monolithic shaders
- * works because InitialPSInputAddr is set on the
- * main shader and PERSP_PULL_MODEL is never used.
- */
- switch (location) {
- case TGSI_INTERPOLATE_LOC_SAMPLE:
- key->ps_prolog.color_interp_vgpr_index[i] =
- separate_prolog ? 6 : 9;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_LINEAR_SAMPLE_ENA(1);
- }
- break;
- case TGSI_INTERPOLATE_LOC_CENTER:
- key->ps_prolog.color_interp_vgpr_index[i] =
- separate_prolog ? 8 : 11;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_LINEAR_CENTER_ENA(1);
- }
- break;
- case TGSI_INTERPOLATE_LOC_CENTROID:
- key->ps_prolog.color_interp_vgpr_index[i] =
- separate_prolog ? 10 : 13;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_LINEAR_CENTROID_ENA(1);
- }
- break;
- default:
- assert(0);
- }
- break;
- default:
- assert(0);
- }
- }
- }
-}
-
-/**
- * Check whether a PS prolog is required based on the key.
- */
-static bool si_need_ps_prolog(const union si_shader_part_key *key)
-{
- return key->ps_prolog.colors_read ||
- key->ps_prolog.states.force_persp_sample_interp ||
- key->ps_prolog.states.force_linear_sample_interp ||
- key->ps_prolog.states.force_persp_center_interp ||
- key->ps_prolog.states.force_linear_center_interp ||
- key->ps_prolog.states.bc_optimize_for_persp ||
- key->ps_prolog.states.bc_optimize_for_linear ||
- key->ps_prolog.states.poly_stipple ||
- key->ps_prolog.states.samplemask_log_ps_iter;
-}
-
-/**
- * Compute the PS epilog key, which contains all the information needed to
- * build the PS epilog function.
- */
-static void si_get_ps_epilog_key(struct si_shader *shader,
- union si_shader_part_key *key)
-{
- struct tgsi_shader_info *info = &shader->selector->info;
- memset(key, 0, sizeof(*key));
- key->ps_epilog.colors_written = info->colors_written;
- key->ps_epilog.writes_z = info->writes_z;
- key->ps_epilog.writes_stencil = info->writes_stencil;
- key->ps_epilog.writes_samplemask = info->writes_samplemask;
- key->ps_epilog.states = shader->key.part.ps.epilog;
-}
-
-/**
- * Build the GS prolog function. Rotate the input vertices for triangle strips
- * with adjacency.
- */
-static void si_build_gs_prolog_function(struct si_shader_context *ctx,
- union si_shader_part_key *key)
-{
- unsigned num_sgprs, num_vgprs;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMTypeRef returns[AC_MAX_ARGS];
- LLVMValueRef func, ret;
-
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- if (ctx->screen->info.chip_class >= GFX9) {
- if (key->gs_prolog.states.gfx9_prev_is_vs)
- num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
- else
- num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
- num_vgprs = 5; /* ES inputs are not needed by GS */
- } else {
- num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
- num_vgprs = 8;
- }