X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_nir_to_llvm.c;h=e0503908ee3e43fedc4781d142daf85d6ff8166d;hb=3e8bda66aee72360420b3f64bebf5f6deb36a7d7;hp=84ab586caacf41b8112c25c6a072142e5b25a174;hpb=4c31f3dcc021a4e317f35f29442742320b86cd20;p=mesa.git diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 84ab586caac..e0503908ee3 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -48,14 +48,15 @@ struct radv_shader_context { struct ac_llvm_context ac; const struct radv_nir_compiler_options *options; - struct radv_shader_variant_info *shader_info; + struct radv_shader_info *shader_info; + const struct nir_shader *shader; struct ac_shader_abi abi; unsigned max_workgroup_size; LLVMContextRef context; LLVMValueRef main_function; - LLVMValueRef descriptor_sets[RADV_UD_MAX_SETS]; + LLVMValueRef descriptor_sets[MAX_SETS]; LLVMValueRef ring_offsets; LLVMValueRef vertex_buffers; @@ -70,6 +71,13 @@ struct radv_shader_context { LLVMValueRef tes_u; LLVMValueRef tes_v; + /* HW GS */ + /* On gfx10: + * - bits 0..10: ordered_wave_id + * - bits 12..20: number of vertices in group + * - bits 22..30: number of primitives in group + */ + LLVMValueRef gs_tg_info; LLVMValueRef gs2vs_offset; LLVMValueRef gs_wave_id; LLVMValueRef gs_vtx_offset[6]; @@ -79,9 +87,6 @@ struct radv_shader_context { LLVMValueRef hs_ring_tess_offchip; LLVMValueRef hs_ring_tess_factor; - LLVMValueRef persp_sample, persp_center, persp_centroid; - LLVMValueRef linear_sample, linear_center, linear_centroid; - /* Streamout */ LLVMValueRef streamout_buffers; LLVMValueRef streamout_write_idx; @@ -91,24 +96,29 @@ struct radv_shader_context { gl_shader_stage stage; LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4]; - uint64_t float16_shaded_mask; - uint64_t input_mask; uint64_t output_mask; bool is_gs_copy_shader; LLVMValueRef gs_next_vertex[4]; - unsigned gs_max_out_vertices; + LLVMValueRef gs_curprim_verts[4]; + LLVMValueRef gs_generated_prims[4]; + LLVMValueRef gs_ngg_emit; + LLVMValueRef gs_ngg_scratch; - unsigned tes_primitive_mode; - - uint32_t tcs_patch_outputs_read; - uint64_t tcs_outputs_read; - uint32_t tcs_vertices_per_patch; uint32_t tcs_num_inputs; uint32_t tcs_num_patches; uint32_t max_gsvs_emit_size; uint32_t gsvs_vertex_size; + + LLVMValueRef vertexptr; /* GFX10 only */ +}; + +struct radv_shader_output_values { + LLVMValueRef values[4]; + unsigned slot_name; + unsigned slot_index; + unsigned usage_mask; }; enum radeon_llvm_calling_convention { @@ -126,98 +136,6 @@ radv_shader_context_from_abi(struct ac_shader_abi *abi) return container_of(abi, ctx, abi); } -struct ac_build_if_state -{ - struct radv_shader_context *ctx; - LLVMValueRef condition; - LLVMBasicBlockRef entry_block; - LLVMBasicBlockRef true_block; - LLVMBasicBlockRef false_block; - LLVMBasicBlockRef merge_block; -}; - -static LLVMBasicBlockRef -ac_build_insert_new_block(struct radv_shader_context *ctx, const char *name) -{ - LLVMBasicBlockRef current_block; - LLVMBasicBlockRef next_block; - LLVMBasicBlockRef new_block; - - /* get current basic block */ - current_block = LLVMGetInsertBlock(ctx->ac.builder); - - /* chqeck if there's another block after this one */ - next_block = LLVMGetNextBasicBlock(current_block); - if (next_block) { - /* insert the new block before the next block */ - new_block = LLVMInsertBasicBlockInContext(ctx->context, next_block, name); - } - else { - /* append new block after current block */ - LLVMValueRef function = 
LLVMGetBasicBlockParent(current_block); - new_block = LLVMAppendBasicBlockInContext(ctx->context, function, name); - } - return new_block; -} - -static void -ac_nir_build_if(struct ac_build_if_state *ifthen, - struct radv_shader_context *ctx, - LLVMValueRef condition) -{ - LLVMBasicBlockRef block = LLVMGetInsertBlock(ctx->ac.builder); - - memset(ifthen, 0, sizeof *ifthen); - ifthen->ctx = ctx; - ifthen->condition = condition; - ifthen->entry_block = block; - - /* create endif/merge basic block for the phi functions */ - ifthen->merge_block = ac_build_insert_new_block(ctx, "endif-block"); - - /* create/insert true_block before merge_block */ - ifthen->true_block = - LLVMInsertBasicBlockInContext(ctx->context, - ifthen->merge_block, - "if-true-block"); - - /* successive code goes into the true block */ - LLVMPositionBuilderAtEnd(ctx->ac.builder, ifthen->true_block); -} - -/** - * End a conditional. - */ -static void -ac_nir_build_endif(struct ac_build_if_state *ifthen) -{ - LLVMBuilderRef builder = ifthen->ctx->ac.builder; - - /* Insert branch to the merge block from current block */ - LLVMBuildBr(builder, ifthen->merge_block); - - /* - * Now patch in the various branch instructions. - */ - - /* Insert the conditional branch instruction at the end of entry_block */ - LLVMPositionBuilderAtEnd(builder, ifthen->entry_block); - if (ifthen->false_block) { - /* we have an else clause */ - LLVMBuildCondBr(builder, ifthen->condition, - ifthen->true_block, ifthen->false_block); - } - else { - /* no else clause */ - LLVMBuildCondBr(builder, ifthen->condition, - ifthen->true_block, ifthen->merge_block); - } - - /* Resume building code at end of the ifthen->merge_block */ - LLVMPositionBuilderAtEnd(builder, ifthen->merge_block); -} - - static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx) { switch (ctx->stage) { @@ -235,13 +153,13 @@ static unsigned get_tcs_num_patches(struct radv_shader_context *ctx) { unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices; - unsigned num_tcs_output_cp = ctx->tcs_vertices_per_patch; + unsigned num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out; uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); + uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->tcs.patch_outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; - uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; + uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; unsigned num_patches; unsigned hardware_lds_size; @@ -274,7 +192,7 @@ get_tcs_num_patches(struct radv_shader_context *ctx) /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. 
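	 * Each patch occupies MAX2(num_tcs_input_cp, num_tcs_output_cp)
	 * threads, so one wave holds at most wave_size / MAX2(...) patches.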
*/ if (ctx->options->chip_class == GFX6) { - unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp); + unsigned one_wave = ctx->options->wave_size / MAX2(num_tcs_input_cp, num_tcs_output_cp); num_patches = MIN2(num_patches, one_wave); } return num_patches; @@ -293,9 +211,9 @@ calculate_tess_lds_size(struct radv_shader_context *ctx) unsigned num_patches; unsigned lds_size; - num_tcs_output_cp = ctx->tcs_vertices_per_patch; - num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); - num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written); + num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out; + num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); + num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->tcs.patch_outputs_written); input_vertex_size = ctx->tcs_num_inputs * 16; output_vertex_size = num_tcs_outputs * 16; @@ -346,10 +264,10 @@ get_tcs_in_patch_stride(struct radv_shader_context *ctx) static LLVMValueRef get_tcs_out_patch_stride(struct radv_shader_context *ctx) { - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); + uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->tcs.patch_outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; - uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; + uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; output_patch_size /= 4; return LLVMConstInt(ctx->ac.i32, output_patch_size, false); @@ -358,7 +276,7 @@ get_tcs_out_patch_stride(struct radv_shader_context *ctx) static LLVMValueRef get_tcs_out_vertex_stride(struct radv_shader_context *ctx) { - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; output_vertex_size /= 4; return LLVMConstInt(ctx->ac.i32, output_vertex_size, false); @@ -386,9 +304,9 @@ get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx) uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; uint32_t output_patch0_offset = input_patch_size; - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; - uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; + uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; unsigned num_patches = ctx->tcs_num_patches; output_patch0_offset *= num_patches; @@ -593,17 +511,17 @@ static bool needs_view_index_sgpr(struct radv_shader_context *ctx, { switch (stage) { case MESA_SHADER_VERTEX: - if (ctx->shader_info->info.needs_multiview_view_index || - (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index)) + if (ctx->shader_info->needs_multiview_view_index || + (!ctx->options->key.vs_common_out.as_es && !ctx->options->key.vs_common_out.as_ls && 
ctx->options->key.has_multiview_view_index)) return true; break; case MESA_SHADER_TESS_EVAL: - if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.tes.as_es && ctx->options->key.has_multiview_view_index)) + if (ctx->shader_info->needs_multiview_view_index || (!ctx->options->key.vs_common_out.as_es && ctx->options->key.has_multiview_view_index)) return true; break; case MESA_SHADER_GEOMETRY: case MESA_SHADER_TESS_CTRL: - if (ctx->shader_info->info.needs_multiview_view_index) + if (ctx->shader_info->needs_multiview_view_index) return true; break; default: @@ -617,9 +535,9 @@ count_vs_user_sgprs(struct radv_shader_context *ctx) { uint8_t count = 0; - if (ctx->shader_info->info.vs.has_vertex_buffers) + if (ctx->shader_info->vs.has_vertex_buffers) count++; - count += ctx->shader_info->info.vs.needs_draw_id ? 3 : 2; + count += ctx->shader_info->vs.needs_draw_id ? 3 : 2; return count; } @@ -630,42 +548,42 @@ static void allocate_inline_push_consts(struct radv_shader_context *ctx, uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs; /* Only supported if shaders use push constants. */ - if (ctx->shader_info->info.min_push_constant_used == UINT8_MAX) + if (ctx->shader_info->min_push_constant_used == UINT8_MAX) return; /* Only supported if shaders don't have indirect push constants. */ - if (ctx->shader_info->info.has_indirect_push_constants) + if (ctx->shader_info->has_indirect_push_constants) return; /* Only supported for 32-bit push constants. */ - if (!ctx->shader_info->info.has_only_32bit_push_constants) + if (!ctx->shader_info->has_only_32bit_push_constants) return; uint8_t num_push_consts = - (ctx->shader_info->info.max_push_constant_used - - ctx->shader_info->info.min_push_constant_used) / 4; + (ctx->shader_info->max_push_constant_used - + ctx->shader_info->min_push_constant_used) / 4; /* Check if the number of user SGPRs is large enough. */ if (num_push_consts < remaining_sgprs) { - ctx->shader_info->info.num_inline_push_consts = num_push_consts; + ctx->shader_info->num_inline_push_consts = num_push_consts; } else { - ctx->shader_info->info.num_inline_push_consts = remaining_sgprs; + ctx->shader_info->num_inline_push_consts = remaining_sgprs; } /* Clamp to the maximum number of allowed inlined push constants. */ - if (ctx->shader_info->info.num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS) - ctx->shader_info->info.num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS; + if (ctx->shader_info->num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS) + ctx->shader_info->num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS; - if (ctx->shader_info->info.num_inline_push_consts == num_push_consts && - !ctx->shader_info->info.loads_dynamic_offsets) { + if (ctx->shader_info->num_inline_push_consts == num_push_consts && + !ctx->shader_info->loads_dynamic_offsets) { /* Disable the default push constants path if all constants are * inlined and if shaders don't use dynamic descriptors. 
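		 * Dynamic descriptors are uploaded through the same pointer
		 * as the push constants, so that pointer must be kept when
		 * loads_dynamic_offsets is set.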
*/ - ctx->shader_info->info.loads_push_constants = false; + ctx->shader_info->loads_push_constants = false; } - ctx->shader_info->info.base_inline_push_consts = - ctx->shader_info->info.min_push_constant_used / 4; + ctx->shader_info->base_inline_push_consts = + ctx->shader_info->min_push_constant_used / 4; } static void allocate_user_sgprs(struct radv_shader_context *ctx, @@ -688,7 +606,7 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx, user_sgpr_info->need_ring_offsets = true; if (stage == MESA_SHADER_FRAGMENT && - ctx->shader_info->info.ps.needs_sample_positions) + ctx->shader_info->ps.needs_sample_positions) user_sgpr_info->need_ring_offsets = true; /* 2 user sgprs will nearly always be allocated for scratch/rings */ @@ -698,11 +616,11 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx, switch (stage) { case MESA_SHADER_COMPUTE: - if (ctx->shader_info->info.cs.uses_grid_size) + if (ctx->shader_info->cs.uses_grid_size) user_sgpr_count += 3; break; case MESA_SHADER_FRAGMENT: - user_sgpr_count += ctx->shader_info->info.ps.needs_sample_positions; + user_sgpr_count += ctx->shader_info->ps.needs_sample_positions; break; case MESA_SHADER_VERTEX: if (!ctx->is_gs_copy_shader) @@ -730,7 +648,7 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx, if (needs_view_index) user_sgpr_count++; - if (ctx->shader_info->info.loads_push_constants) + if (ctx->shader_info->loads_push_constants) user_sgpr_count++; if (ctx->streamout_buffers) @@ -739,7 +657,7 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx, uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16; uint32_t remaining_sgprs = available_sgprs - user_sgpr_count; uint32_t num_desc_set = - util_bitcount(ctx->shader_info->info.desc_set_used_mask); + util_bitcount(ctx->shader_info->desc_set_used_mask); if (remaining_sgprs < num_desc_set) { user_sgpr_info->indirect_all_descriptor_sets = true; @@ -761,7 +679,7 @@ declare_global_input_sgprs(struct radv_shader_context *ctx, /* 1 for each descriptor set */ if (!user_sgpr_info->indirect_all_descriptor_sets) { - uint32_t mask = ctx->shader_info->info.desc_set_used_mask; + uint32_t mask = ctx->shader_info->desc_set_used_mask; while (mask) { int i = u_bit_scan(&mask); @@ -773,19 +691,19 @@ declare_global_input_sgprs(struct radv_shader_context *ctx, desc_sets); } - if (ctx->shader_info->info.loads_push_constants) { + if (ctx->shader_info->loads_push_constants) { /* 1 for push constants and dynamic descriptors */ add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants); } - for (unsigned i = 0; i < ctx->shader_info->info.num_inline_push_consts; i++) { + for (unsigned i = 0; i < ctx->shader_info->num_inline_push_consts; i++) { add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.inline_push_consts[i]); } - ctx->abi.num_inline_push_consts = ctx->shader_info->info.num_inline_push_consts; - ctx->abi.base_inline_push_consts = ctx->shader_info->info.base_inline_push_consts; + ctx->abi.num_inline_push_consts = ctx->shader_info->num_inline_push_consts; + ctx->abi.base_inline_push_consts = ctx->shader_info->base_inline_push_consts; - if (ctx->shader_info->info.so.num_outputs) { + if (ctx->shader_info->so.num_outputs) { add_arg(args, ARG_SGPR, ac_array_in_const32_addr_space(ctx->ac.v4i32), &ctx->streamout_buffers); @@ -802,14 +720,14 @@ declare_vs_specific_input_sgprs(struct radv_shader_context *ctx, if (!ctx->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX || (has_previous_stage && previous_stage == 
MESA_SHADER_VERTEX))) { - if (ctx->shader_info->info.vs.has_vertex_buffers) { + if (ctx->shader_info->vs.has_vertex_buffers) { add_arg(args, ARG_SGPR, ac_array_in_const32_addr_space(ctx->ac.v4i32), &ctx->vertex_buffers); } add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex); add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance); - if (ctx->shader_info->info.vs.needs_draw_id) { + if (ctx->shader_info->vs.needs_draw_id) { add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.draw_id); } } @@ -820,14 +738,32 @@ declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args) { add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.vertex_id); if (!ctx->is_gs_copy_shader) { - if (ctx->options->key.vs.as_ls) { + if (ctx->options->key.vs_common_out.as_ls) { add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); + if (ctx->ac.chip_class >= GFX10) { + add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); + } else { + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); + add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ + } } else { - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id); + if (ctx->ac.chip_class >= GFX10) { + if (ctx->options->key.vs_common_out.as_ngg) { + add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ + add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); + } else { + add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id); + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); + } + } else { + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); + add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id); + add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ + } } - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ } } @@ -838,7 +774,7 @@ declare_streamout_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage, int i; /* Streamout SGPRs. */ - if (ctx->shader_info->info.so.num_outputs) { + if (ctx->shader_info->so.num_outputs) { assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL); @@ -854,7 +790,7 @@ declare_streamout_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage, /* A streamout buffer offset is loaded if the stride is non-zero. 
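	 * One offset SGPR is reserved per enabled buffer below; buffers
	 * with a zero stride get none.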
*/ for (i = 0; i < 4; i++) { - if (!ctx->shader_info->info.so.strides[i]) + if (!ctx->shader_info->so.strides[i]) continue; add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_offset[i]); @@ -875,7 +811,7 @@ set_global_input_locs(struct radv_shader_context *ctx, const struct user_sgpr_info *user_sgpr_info, LLVMValueRef desc_sets, uint8_t *user_sgpr_idx) { - uint32_t mask = ctx->shader_info->info.desc_set_used_mask; + uint32_t mask = ctx->shader_info->desc_set_used_mask; if (!user_sgpr_info->indirect_all_descriptor_sets) { while (mask) { @@ -899,13 +835,13 @@ set_global_input_locs(struct radv_shader_context *ctx, ctx->shader_info->need_indirect_descriptor_sets = true; } - if (ctx->shader_info->info.loads_push_constants) { + if (ctx->shader_info->loads_push_constants) { set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx); } - if (ctx->shader_info->info.num_inline_push_consts) { + if (ctx->shader_info->num_inline_push_consts) { set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, - ctx->shader_info->info.num_inline_push_consts); + ctx->shader_info->num_inline_push_consts); } if (ctx->streamout_buffers) { @@ -923,13 +859,13 @@ set_vs_specific_input_locs(struct radv_shader_context *ctx, if (!ctx->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { - if (ctx->shader_info->info.vs.has_vertex_buffers) { + if (ctx->shader_info->vs.has_vertex_buffers) { set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS, user_sgpr_idx); } unsigned vs_num = 2; - if (ctx->shader_info->info.vs.needs_draw_id) + if (ctx->shader_info->vs.needs_draw_id) vs_num++; set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, @@ -966,6 +902,12 @@ static void set_llvm_calling_convention(LLVMValueRef func, LLVMSetFunctionCallConv(func, calling_conv); } +/* Returns whether the stage is a stage that can be directly before the GS */ +static bool is_pre_gs_stage(gl_shader_stage stage) +{ + return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL; +} + static void create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage, @@ -976,6 +918,16 @@ static void create_function(struct radv_shader_context *ctx, struct arg_info args = {}; LLVMValueRef desc_sets; bool needs_view_index = needs_view_index_sgpr(ctx, stage); + + if (ctx->ac.chip_class >= GFX10) { + if (is_pre_gs_stage(stage) && ctx->options->key.vs_common_out.as_ngg) { + /* On GFX10, VS is merged into GS for NGG. 
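+			 * The same path covers TES: any pre-GS stage is
+			 * compiled with the GEOMETRY argument layout and runs
+			 * as the ES half of the merged shader.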
*/ + previous_stage = stage; + stage = MESA_SHADER_GEOMETRY; + has_previous_stage = true; + } + } + allocate_user_sgprs(ctx, stage, has_previous_stage, previous_stage, needs_view_index, &user_sgpr_info); @@ -989,20 +941,20 @@ static void create_function(struct radv_shader_context *ctx, declare_global_input_sgprs(ctx, &user_sgpr_info, &args, &desc_sets); - if (ctx->shader_info->info.cs.uses_grid_size) { + if (ctx->shader_info->cs.uses_grid_size) { add_arg(&args, ARG_SGPR, ctx->ac.v3i32, &ctx->abi.num_work_groups); } for (int i = 0; i < 3; i++) { ctx->abi.workgroup_ids[i] = NULL; - if (ctx->shader_info->info.cs.uses_block_id[i]) { + if (ctx->shader_info->cs.uses_block_id[i]) { add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.workgroup_ids[i]); } } - if (ctx->shader_info->info.cs.uses_local_invocation_idx) + if (ctx->shader_info->cs.uses_local_invocation_idx) add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.tg_size); add_arg(&args, ARG_VGPR, ctx->ac.v3i32, &ctx->abi.local_invocation_ids); @@ -1017,10 +969,10 @@ static void create_function(struct radv_shader_context *ctx, if (needs_view_index) add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.view_index); - if (ctx->options->key.vs.as_es) { + if (ctx->options->key.vs_common_out.as_es) { add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->es2gs_offset); - } else if (ctx->options->key.vs.as_ls) { + } else if (ctx->options->key.vs_common_out.as_ls) { /* no extra parameters */ } else { declare_streamout_sgprs(ctx, stage, &args); @@ -1083,7 +1035,7 @@ static void create_function(struct radv_shader_context *ctx, add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.view_index); - if (ctx->options->key.tes.as_es) { + if (ctx->options->key.vs_common_out.as_es) { add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); add_arg(&args, ARG_SGPR, ctx->ac.i32, @@ -1098,8 +1050,14 @@ static void create_function(struct radv_shader_context *ctx, case MESA_SHADER_GEOMETRY: if (has_previous_stage) { // First 6 system regs - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->gs2vs_offset); + if (ctx->options->key.vs_common_out.as_ngg) { + add_arg(&args, ARG_SGPR, ctx->ac.i32, + &ctx->gs_tg_info); + } else { + add_arg(&args, ARG_SGPR, ctx->ac.i32, + &ctx->gs2vs_offset); + } + add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->merged_wave_info); add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); @@ -1171,13 +1129,13 @@ static void create_function(struct radv_shader_context *ctx, &desc_sets); add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.prim_mask); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_sample); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_center); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_centroid); + add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.persp_sample); + add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.persp_center); + add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.persp_centroid); add_arg(&args, ARG_VGPR, ctx->ac.v3i32, NULL); /* persp pull model */ - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_sample); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_center); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_centroid); + add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.linear_sample); + add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.linear_center); + add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.linear_centroid); add_arg(&args, ARG_VGPR, ctx->ac.f32, NULL); /* line stipple tex */ add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[0]); add_arg(&args, ARG_VGPR, 
ctx->ac.f32, &ctx->abi.frag_pos[1]); @@ -1231,7 +1189,7 @@ static void create_function(struct radv_shader_context *ctx, switch (stage) { case MESA_SHADER_COMPUTE: - if (ctx->shader_info->info.cs.uses_grid_size) { + if (ctx->shader_info->cs.uses_grid_size) { set_loc_shader(ctx, AC_UD_CS_GRID_SIZE, &user_sgpr_idx, 3); } @@ -1270,7 +1228,7 @@ static void create_function(struct radv_shader_context *ctx, } if (stage == MESA_SHADER_TESS_CTRL || - (stage == MESA_SHADER_VERTEX && ctx->options->key.vs.as_ls) || + (stage == MESA_SHADER_VERTEX && ctx->options->key.vs_common_out.as_ls) || /* GFX9 has the ESGS ring buffer in LDS. */ (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) { ac_declare_lds_as_pointer(&ctx->ac); @@ -1315,9 +1273,16 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (ctx->ac.chip_class >= GFX10) { + desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(3) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } LLVMValueRef desc_components[4] = { LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""), @@ -1357,12 +1322,12 @@ static LLVMValueRef get_non_vertex_index_offset(struct radv_shader_context *ctx) uint32_t num_patches = ctx->tcs_num_patches; uint32_t num_tcs_outputs; if (ctx->stage == MESA_SHADER_TESS_CTRL) - num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); + num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); else num_tcs_outputs = ctx->options->key.tes.tcs_num_outputs; uint32_t output_vertex_size = num_tcs_outputs * 16; - uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; + uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; return LLVMConstInt(ctx->ac.i32, pervertex_output_patch_size * num_patches, false); } @@ -1372,7 +1337,7 @@ static LLVMValueRef calc_param_stride(struct radv_shader_context *ctx, { LLVMValueRef param_stride; if (vertex_index) - param_stride = LLVMConstInt(ctx->ac.i32, ctx->tcs_vertices_per_patch * ctx->tcs_num_patches, false); + param_stride = LLVMConstInt(ctx->ac.i32, ctx->shader->info.tess.tcs_vertices_out * ctx->tcs_num_patches, false); else param_stride = LLVMConstInt(ctx->ac.i32, ctx->tcs_num_patches, false); return param_stride; @@ -1385,7 +1350,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct radv_shader_context *ctx, LLVMValueRef base_addr; LLVMValueRef param_stride, constant16; LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - LLVMValueRef vertices_per_patch = LLVMConstInt(ctx->ac.i32, ctx->tcs_vertices_per_patch, false); + LLVMValueRef vertices_per_patch = LLVMConstInt(ctx->ac.i32, ctx->shader->info.tess.tcs_vertices_out, false); constant16 = LLVMConstInt(ctx->ac.i32, 16, false); param_stride = calc_param_stride(ctx, vertex_index); if (vertex_index) { @@ -1532,10 +1497,10 @@ store_tcs_output(struct ac_shader_abi *abi, bool store_lds = true; if (is_patch) { - if (!(ctx->tcs_patch_outputs_read & (1U << (location - VARYING_SLOT_PATCH0)))) + if (!(ctx->shader->info.patch_outputs_read & (1U << 
(location - VARYING_SLOT_PATCH0)))) store_lds = false; } else { - if (!(ctx->tcs_outputs_read & (1ULL << location))) + if (!(ctx->shader->info.outputs_read & (1ULL << location))) store_lds = false; } @@ -1694,36 +1659,6 @@ static void radv_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible) ac_build_kill_if_false(&ctx->ac, visible); } -static LLVMValueRef lookup_interp_param(struct ac_shader_abi *abi, - enum glsl_interp_mode interp, unsigned location) -{ - struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - - switch (interp) { - case INTERP_MODE_FLAT: - default: - return NULL; - case INTERP_MODE_SMOOTH: - case INTERP_MODE_NONE: - if (location == INTERP_CENTER) - return ctx->persp_center; - else if (location == INTERP_CENTROID) - return ctx->persp_centroid; - else if (location == INTERP_SAMPLE) - return ctx->persp_sample; - break; - case INTERP_MODE_NOPERSPECTIVE: - if (location == INTERP_CENTER) - return ctx->linear_center; - else if (location == INTERP_CENTROID) - return ctx->linear_centroid; - else if (location == INTERP_SAMPLE) - return ctx->linear_sample; - break; - } - return NULL; -} - static uint32_t radv_get_sample_pos_offset(uint32_t num_samples) { @@ -1774,7 +1709,7 @@ static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); uint8_t log2_ps_iter_samples; - if (ctx->shader_info->info.ps.force_persample) { + if (ctx->shader_info->ps.force_persample) { log2_ps_iter_samples = util_logbase2(ctx->options->key.fs.num_samples); } else { @@ -1802,6 +1737,10 @@ static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) } +static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, + unsigned stream, + LLVMValueRef *addrs); + static void visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs) { @@ -1810,6 +1749,11 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr unsigned offset = 0; struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); + if (ctx->options->key.vs_common_out.as_ngg) { + gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); + return; + } + /* Write vertex attribute values to GSVS ring */ gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], @@ -1821,14 +1765,14 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr * effects other than emitting vertices. 
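	 * The kill below predicates the GSVS ring stores on
	 * gs_next_vertex < vertices_out.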
*/ can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, - LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), ""); + LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); ac_build_kill_if_false(&ctx->ac, can_emit); for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->info.gs.output_usage_mask[i]; + ctx->shader_info->gs.output_usage_mask[i]; uint8_t output_stream = - ctx->shader_info->info.gs.output_streams[i]; + ctx->shader_info->gs.output_streams[i]; LLVMValueRef *out_ptr = &addrs[i * 4]; int length = util_last_bit(output_usage_mask); @@ -1844,7 +1788,7 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr out_ptr[j], ""); LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, offset * - ctx->gs_max_out_vertices, false); + ctx->shader->info.gs.vertices_out, false); offset++; @@ -1875,6 +1819,12 @@ static void visit_end_primitive(struct ac_shader_abi *abi, unsigned stream) { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); + + if (ctx->options->key.vs_common_out.as_ngg) { + LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); + return; + } + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id); } @@ -1890,7 +1840,7 @@ load_tess_coord(struct ac_shader_abi *abi) ctx->ac.f32_0, }; - if (ctx->tes_primitive_mode == GL_TRIANGLES) + if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), ""); @@ -2166,7 +2116,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx, LLVMValueRef buffer_index; unsigned attrib_count = glsl_count_attribute_slots(variable->type, true); uint8_t input_usage_mask = - ctx->shader_info->info.vs.input_usage_mask[variable->data.location]; + ctx->shader_info->vs.input_usage_mask[variable->data.location]; unsigned num_input_channels = util_last_bit(input_usage_mask); variable->data.driver_location = variable->data.location * 4; @@ -2273,131 +2223,6 @@ handle_vs_input_decl(struct radv_shader_context *ctx, } } -static void interp_fs_input(struct radv_shader_context *ctx, - unsigned attr, - LLVMValueRef interp_param, - LLVMValueRef prim_mask, - bool float16, - LLVMValueRef result[4]) -{ - LLVMValueRef attr_number; - unsigned chan; - LLVMValueRef i, j; - bool interp = !LLVMIsUndef(interp_param); - - attr_number = LLVMConstInt(ctx->ac.i32, attr, false); - - /* fs.constant returns the param from the middle vertex, so it's not - * really useful for flat shading. It's meant to be used for custom - * interpolation (but the intrinsic can't fetch from the other two - * vertices). - * - * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state - * to do the right thing. The only reason we use fs.constant is that - * fs.interp cannot be used on integers, because they can be equal - * to NaN. 
- */ - if (interp) { - interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, - ctx->ac.v2f32, ""); - - i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, - ctx->ac.i32_0, ""); - j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, - ctx->ac.i32_1, ""); - } - - for (chan = 0; chan < 4; chan++) { - LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); - - if (interp && float16) { - result[chan] = ac_build_fs_interp_f16(&ctx->ac, - llvm_chan, - attr_number, - prim_mask, i, j); - } else if (interp) { - result[chan] = ac_build_fs_interp(&ctx->ac, - llvm_chan, - attr_number, - prim_mask, i, j); - } else { - result[chan] = ac_build_fs_interp_mov(&ctx->ac, - LLVMConstInt(ctx->ac.i32, 2, false), - llvm_chan, - attr_number, - prim_mask); - result[chan] = LLVMBuildBitCast(ctx->ac.builder, result[chan], ctx->ac.i32, ""); - result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], float16 ? ctx->ac.i16 : ctx->ac.i32, ""); - } - } -} - -static void mark_16bit_fs_input(struct radv_shader_context *ctx, - const struct glsl_type *type, - int location) -{ - if (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)) { - unsigned attrib_count = glsl_count_attribute_slots(type, false); - if (glsl_type_is_16bit(type)) { - ctx->float16_shaded_mask |= ((1ull << attrib_count) - 1) << location; - } - } else if (glsl_type_is_array(type)) { - unsigned stride = glsl_count_attribute_slots(glsl_get_array_element(type), false); - for (unsigned i = 0; i < glsl_get_length(type); ++i) { - mark_16bit_fs_input(ctx, glsl_get_array_element(type), location + i * stride); - } - } else { - assert(glsl_type_is_struct_or_ifc(type)); - for (unsigned i = 0; i < glsl_get_length(type); i++) { - mark_16bit_fs_input(ctx, glsl_get_struct_field(type, i), location); - location += glsl_count_attribute_slots(glsl_get_struct_field(type, i), false); - } - } -} - -static void -handle_fs_input_decl(struct radv_shader_context *ctx, - struct nir_variable *variable) -{ - int idx = variable->data.location; - unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); - LLVMValueRef interp = NULL; - uint64_t mask; - - variable->data.driver_location = idx * 4; - - - if (variable->data.compact) { - unsigned component_count = variable->data.location_frac + - glsl_get_length(variable->type); - attrib_count = (component_count + 3) / 4; - } else - mark_16bit_fs_input(ctx, variable->type, idx); - - mask = ((1ull << attrib_count) - 1) << variable->data.location; - - if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT || - glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT16 || - glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_STRUCT) { - unsigned interp_type; - if (variable->data.sample) - interp_type = INTERP_SAMPLE; - else if (variable->data.centroid) - interp_type = INTERP_CENTROID; - else - interp_type = INTERP_CENTER; - - interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type); - } - if (interp == NULL) - interp = LLVMGetUndef(ctx->ac.i32); - - for (unsigned i = 0; i < attrib_count; ++i) - ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp; - - ctx->input_mask |= mask; -} - static void handle_vs_inputs(struct radv_shader_context *ctx, struct nir_shader *nir) { @@ -2424,73 +2249,9 @@ prepare_interp_optimize(struct radv_shader_context *ctx, if (uses_center && uses_centroid) { LLVMValueRef sel = LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ctx->abi.prim_mask, 
ctx->ac.i32_0, ""); - ctx->persp_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->persp_center, ctx->persp_centroid, ""); - ctx->linear_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->linear_center, ctx->linear_centroid, ""); - } -} - -static void -handle_fs_inputs(struct radv_shader_context *ctx, - struct nir_shader *nir) -{ - prepare_interp_optimize(ctx, nir); - - nir_foreach_variable(variable, &nir->inputs) - handle_fs_input_decl(ctx, variable); - - unsigned index = 0; - - if (ctx->shader_info->info.ps.uses_input_attachments || - ctx->shader_info->info.needs_multiview_view_index) { - ctx->input_mask |= 1ull << VARYING_SLOT_LAYER; - ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)] = LLVMGetUndef(ctx->ac.i32); - } - - for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) { - LLVMValueRef interp_param; - LLVMValueRef *inputs = ctx->inputs +ac_llvm_reg_index_soa(i, 0); - - if (!(ctx->input_mask & (1ull << i))) - continue; - - if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC || - i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) { - interp_param = *inputs; - bool float16 = (ctx->float16_shaded_mask >> i) & 1; - interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, float16, - inputs); - - if (LLVMIsUndef(interp_param)) - ctx->shader_info->fs.flat_shaded_mask |= 1u << index; - if (float16) - ctx->shader_info->fs.float16_shaded_mask |= 1u << index; - if (i >= VARYING_SLOT_VAR0) - ctx->abi.fs_input_attr_indices[i - VARYING_SLOT_VAR0] = index; - ++index; - } else if (i == VARYING_SLOT_CLIP_DIST0) { - int length = ctx->shader_info->info.ps.num_input_clips_culls; - - for (unsigned j = 0; j < length; j += 4) { - inputs = ctx->inputs + ac_llvm_reg_index_soa(i, j); - - interp_param = *inputs; - interp_fs_input(ctx, index, interp_param, - ctx->abi.prim_mask, false, inputs); - ++index; - } - } else if (i == VARYING_SLOT_POS) { - for(int i = 0; i < 3; ++i) - inputs[i] = ctx->abi.frag_pos[i]; - - inputs[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, - ctx->abi.frag_pos[3]); - } + ctx->abi.persp_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->abi.persp_center, ctx->abi.persp_centroid, ""); + ctx->abi.linear_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->abi.linear_center, ctx->abi.linear_centroid, ""); } - ctx->shader_info->fs.num_interp = index; - ctx->shader_info->fs.input_mask = ctx->input_mask >> VARYING_SLOT_VAR0; - - if (ctx->shader_info->info.needs_multiview_view_index) - ctx->abi.view_index = ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; } static void @@ -2516,22 +2277,6 @@ scan_shader_output_decl(struct radv_shader_context *ctx, } mask_attribs = ((1ull << attrib_count) - 1) << idx; - if (stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_EVAL || - stage == MESA_SHADER_GEOMETRY) { - if (idx == VARYING_SLOT_CLIP_DIST0) { - if (stage == MESA_SHADER_VERTEX) { - ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1; - ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1; - ctx->shader_info->vs.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size; - } - if (stage == MESA_SHADER_TESS_EVAL) { - ctx->shader_info->tes.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1; - ctx->shader_info->tes.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1; - ctx->shader_info->tes.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size; - } - } - } ctx->output_mask |= mask_attribs; } @@ -2716,9 +2461,7 
@@ radv_export_param(struct radv_shader_context *ctx, unsigned index, static LLVMValueRef radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan) { - LLVMValueRef output = - ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)]; - + LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)]; return LLVMBuildLoad(ctx->ac.builder, output, ""); } @@ -2726,10 +2469,10 @@ static void radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers, LLVMValueRef const *so_write_offsets, - const struct radv_stream_output *output) + const struct radv_stream_output *output, + struct radv_shader_output_values *shader_out) { unsigned num_comps = util_bitcount(output->component_mask); - unsigned loc = output->location; unsigned buf = output->buffer; unsigned offset = output->offset; unsigned start; @@ -2744,8 +2487,7 @@ radv_emit_stream_output(struct radv_shader_context *ctx, /* Load the output as int. */ for (int i = 0; i < num_comps; i++) { - out[i] = ac_to_integer(&ctx->ac, - radv_load_output(ctx, loc, start + i)); + out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]); } /* Pack the output. */ @@ -2776,7 +2518,6 @@ radv_emit_stream_output(struct radv_shader_context *ctx, static void radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) { - struct ac_build_if_state if_ctx; int i; /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ @@ -2796,7 +2537,7 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) * out-of-bounds buffer access. The hw tells us via the SGPR * (so_vtx_count) which threads are allowed to emit streamout data. */ - ac_nir_build_if(&if_ctx, ctx, can_emit); + ac_build_ifcc(&ctx->ac, can_emit, 6501); { /* The buffer offset is computed as follows: * ByteOffset = streamout_offset[buffer_id]*4 + @@ -2817,7 +2558,7 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) LLVMValueRef buf_ptr = ctx->streamout_buffers; for (i = 0; i < 4; i++) { - uint16_t stride = ctx->shader_info->info.so.strides[i]; + uint16_t stride = ctx->shader_info->so.strides[i]; if (!stride) continue; @@ -2841,114 +2582,113 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) } /* Write streamout data. 
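	 * Each output belonging to this stream is loaded into a
	 * radv_shader_output_values struct and written out through
	 * radv_emit_stream_output.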
*/ - for (i = 0; i < ctx->shader_info->info.so.num_outputs; i++) { + for (i = 0; i < ctx->shader_info->so.num_outputs; i++) { + struct radv_shader_output_values shader_out = {}; struct radv_stream_output *output = - &ctx->shader_info->info.so.outputs[i]; + &ctx->shader_info->so.outputs[i]; if (stream != output->stream) continue; - radv_emit_stream_output(ctx, so_buffers, - so_write_offset, output); + for (int j = 0; j < 4; j++) { + shader_out.values[j] = + radv_load_output(ctx, output->location, j); + } + + radv_emit_stream_output(ctx, so_buffers,so_write_offset, + output, &shader_out); } } - ac_nir_build_endif(&if_ctx); + ac_build_endif(&ctx->ac, 6501); } static void -handle_vs_outputs_post(struct radv_shader_context *ctx, - bool export_prim_id, bool export_layer_id, - bool export_clip_dists, - struct radv_vs_output_info *outinfo) +radv_build_param_exports(struct radv_shader_context *ctx, + struct radv_shader_output_values *outputs, + unsigned noutput, + struct radv_vs_output_info *outinfo, + bool export_clip_dists) { - uint32_t param_count = 0; - unsigned target; - unsigned pos_idx, num_pos_exports = 0; - struct ac_export_args args, pos_args[4] = {}; - LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_index_value = NULL; - int i; - - if (ctx->options->key.has_multiview_view_index) { - LLVMValueRef* tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; - if(!*tmp_out) { - for(unsigned i = 0; i < 4; ++i) - ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] = - ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); - } + unsigned param_count = 0; - LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, ctx->abi.view_index), *tmp_out); - ctx->output_mask |= 1ull << VARYING_SLOT_LAYER; - } + for (unsigned i = 0; i < noutput; i++) { + unsigned slot_name = outputs[i].slot_name; + unsigned usage_mask = outputs[i].usage_mask; - memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, - sizeof(outinfo->vs_output_param_offset)); + if (slot_name != VARYING_SLOT_LAYER && + slot_name != VARYING_SLOT_PRIMITIVE_ID && + slot_name != VARYING_SLOT_CLIP_DIST0 && + slot_name != VARYING_SLOT_CLIP_DIST1 && + slot_name < VARYING_SLOT_VAR0) + continue; - for(unsigned location = VARYING_SLOT_CLIP_DIST0; location <= VARYING_SLOT_CLIP_DIST1; ++location) { - if (ctx->output_mask & (1ull << location)) { - unsigned output_usage_mask, length; - LLVMValueRef slots[4]; - unsigned j; - - if (ctx->stage == MESA_SHADER_VERTEX && - !ctx->is_gs_copy_shader) { - output_usage_mask = - ctx->shader_info->info.vs.output_usage_mask[location]; - } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - output_usage_mask = - ctx->shader_info->info.tes.output_usage_mask[location]; - } else { - assert(ctx->is_gs_copy_shader); - output_usage_mask = - ctx->shader_info->info.gs.output_usage_mask[location]; - } + if ((slot_name == VARYING_SLOT_CLIP_DIST0 || + slot_name == VARYING_SLOT_CLIP_DIST1) && !export_clip_dists) + continue; - length = util_last_bit(output_usage_mask); + radv_export_param(ctx, param_count, outputs[i].values, usage_mask); - for (j = 0; j < length; j++) - slots[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, location, j)); + assert(i < ARRAY_SIZE(outinfo->vs_output_param_offset)); + outinfo->vs_output_param_offset[slot_name] = param_count++; + } - for (i = length; i < 4; i++) - slots[i] = LLVMGetUndef(ctx->ac.f32); + outinfo->param_exports = param_count; +} - target = V_008DFC_SQ_EXP_POS + 2 + (location - VARYING_SLOT_CLIP_DIST0); - si_llvm_init_export_args(ctx, &slots[0], 0xf, 
target, &args); - memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS], - &args, sizeof(args)); +/* Generate export instructions for hardware VS shader stage or NGG GS stage + * (position and parameter data only). + */ +static void +radv_llvm_export_vs(struct radv_shader_context *ctx, + struct radv_shader_output_values *outputs, + unsigned noutput, + struct radv_vs_output_info *outinfo, + bool export_clip_dists) +{ + LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL; + struct ac_export_args pos_args[4] = {}; + unsigned pos_idx, index; + int i; - if (export_clip_dists) { - /* Export the clip/cull distances values to the next stage. */ - radv_export_param(ctx, param_count, &slots[0], 0xf); - outinfo->vs_output_param_offset[location] = param_count++; - } + /* Build position exports */ + for (i = 0; i < noutput; i++) { + switch (outputs[i].slot_name) { + case VARYING_SLOT_POS: + si_llvm_init_export_args(ctx, outputs[i].values, 0xf, + V_008DFC_SQ_EXP_POS, &pos_args[0]); + break; + case VARYING_SLOT_PSIZ: + psize_value = outputs[i].values[0]; + break; + case VARYING_SLOT_LAYER: + layer_value = outputs[i].values[0]; + break; + case VARYING_SLOT_VIEWPORT: + viewport_value = outputs[i].values[0]; + break; + case VARYING_SLOT_CLIP_DIST0: + case VARYING_SLOT_CLIP_DIST1: + index = 2 + outputs[i].slot_index; + si_llvm_init_export_args(ctx, outputs[i].values, 0xf, + V_008DFC_SQ_EXP_POS + index, + &pos_args[index]); + break; + default: + break; } } - LLVMValueRef pos_values[4] = {ctx->ac.f32_0, ctx->ac.f32_0, ctx->ac.f32_0, ctx->ac.f32_1}; - if (ctx->output_mask & (1ull << VARYING_SLOT_POS)) { - for (unsigned j = 0; j < 4; j++) - pos_values[j] = radv_load_output(ctx, VARYING_SLOT_POS, j); - } - si_llvm_init_export_args(ctx, pos_values, 0xf, V_008DFC_SQ_EXP_POS, &pos_args[0]); - - if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) { - outinfo->writes_pointsize = true; - psize_value = radv_load_output(ctx, VARYING_SLOT_PSIZ, 0); - } - - if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) { - outinfo->writes_layer = true; - layer_value = radv_load_output(ctx, VARYING_SLOT_LAYER, 0); - } - - if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) { - outinfo->writes_viewport_index = true; - viewport_index_value = radv_load_output(ctx, VARYING_SLOT_VIEWPORT, 0); - } - - if (ctx->shader_info->info.so.num_outputs && - !ctx->is_gs_copy_shader) { - /* The GS copy shader emission already emits streamout. */ - radv_emit_streamout(ctx, 0); + /* We need to add the position output manually if it's missing. */ + if (!pos_args[0].out[0]) { + pos_args[0].enabled_channels = 0xf; /* writemask */ + pos_args[0].valid_mask = 0; /* EXEC mask */ + pos_args[0].done = 0; /* last export? */ + pos_args[0].target = V_008DFC_SQ_EXP_POS; + pos_args[0].compr = 0; /* COMPR flag */ + pos_args[0].out[0] = ctx->ac.f32_0; /* X */ + pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ + pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ + pos_args[0].out[3] = ctx->ac.f32_1; /* W */ } if (outinfo->writes_pointsize || @@ -2974,7 +2714,7 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, /* GFX9 has the layer in out.z[10:0] and the viewport * index in out.z[19:16]. 
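			 * I.e. out.z = layer | (viewport_index << 16), exported
			 * as a single dword.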
*/ - LLVMValueRef v = viewport_index_value; + LLVMValueRef v = viewport_value; v = ac_to_integer(&ctx->ac, v); v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, false), @@ -2985,14 +2725,15 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, pos_args[1].out[2] = ac_to_float(&ctx->ac, v); pos_args[1].enabled_channels |= 1 << 2; } else { - pos_args[1].out[3] = viewport_index_value; + pos_args[1].out[3] = viewport_value; pos_args[1].enabled_channels |= 1 << 3; } } } + for (i = 0; i < 4; i++) { if (pos_args[i].out[0]) - num_pos_exports++; + outinfo->pos_exports++; } /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. @@ -3010,71 +2751,95 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, /* Specify the target we are exporting */ pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; - if (pos_idx == num_pos_exports) + + if (pos_idx == outinfo->pos_exports) + /* Specify that this is the last export */ pos_args[i].done = 1; + ac_build_export(&ctx->ac, &pos_args[i]); } + /* Build parameter exports */ + radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists); +} + +static void +handle_vs_outputs_post(struct radv_shader_context *ctx, + bool export_prim_id, + bool export_clip_dists, + struct radv_vs_output_info *outinfo) +{ + struct radv_shader_output_values *outputs; + unsigned noutput = 0; + + if (ctx->options->key.has_multiview_view_index) { + LLVMValueRef* tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; + if(!*tmp_out) { + for(unsigned i = 0; i < 4; ++i) + ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] = + ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); + } + + LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, ctx->abi.view_index), *tmp_out); + ctx->output_mask |= 1ull << VARYING_SLOT_LAYER; + } + + memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, + sizeof(outinfo->vs_output_param_offset)); + outinfo->pos_exports = 0; + + if (ctx->shader_info->so.num_outputs && + !ctx->is_gs_copy_shader) { + /* The GS copy shader emission already emits streamout. */ + radv_emit_streamout(ctx, 0); + } + + /* Allocate a temporary array for the output values. 
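+	 * One entry is needed per bit set in output_mask, plus an extra
+	 * slot for the optional PrimitiveID export.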
*/ + unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_prim_id; + outputs = malloc(num_outputs * sizeof(outputs[0])); + for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - LLVMValueRef values[4]; if (!(ctx->output_mask & (1ull << i))) continue; - if (i != VARYING_SLOT_LAYER && - i != VARYING_SLOT_PRIMITIVE_ID && - i < VARYING_SLOT_VAR0) - continue; - - for (unsigned j = 0; j < 4; j++) - values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j)); - - unsigned output_usage_mask; + outputs[noutput].slot_name = i; + outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; if (ctx->stage == MESA_SHADER_VERTEX && !ctx->is_gs_copy_shader) { - output_usage_mask = - ctx->shader_info->info.vs.output_usage_mask[i]; + outputs[noutput].usage_mask = + ctx->shader_info->vs.output_usage_mask[i]; } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - output_usage_mask = - ctx->shader_info->info.tes.output_usage_mask[i]; + outputs[noutput].usage_mask = + ctx->shader_info->tes.output_usage_mask[i]; } else { assert(ctx->is_gs_copy_shader); - output_usage_mask = - ctx->shader_info->info.gs.output_usage_mask[i]; + outputs[noutput].usage_mask = + ctx->shader_info->gs.output_usage_mask[i]; } - radv_export_param(ctx, param_count, values, output_usage_mask); + for (unsigned j = 0; j < 4; j++) { + outputs[noutput].values[j] = + ac_to_float(&ctx->ac, radv_load_output(ctx, i, j)); + } - outinfo->vs_output_param_offset[i] = param_count++; + noutput++; } + /* Export PrimitiveID. */ if (export_prim_id) { - LLVMValueRef values[4]; - - values[0] = ctx->vs_prim_id; + outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID; + outputs[noutput].slot_index = 0; + outputs[noutput].usage_mask = 0x1; + outputs[noutput].values[0] = ctx->vs_prim_id; for (unsigned j = 1; j < 4; j++) - values[j] = ctx->ac.f32_0; - - radv_export_param(ctx, param_count, values, 0x1); - - outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count++; - outinfo->export_prim_id = true; + outputs[noutput].values[j] = ctx->ac.f32_0; + noutput++; } - if (export_layer_id && layer_value) { - LLVMValueRef values[4]; - - values[0] = layer_value; - for (unsigned j = 1; j < 4; j++) - values[j] = ctx->ac.f32_0; - - radv_export_param(ctx, param_count, values, 0x1); - - outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = param_count++; - } + radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists); - outinfo->pos_exports = num_pos_exports; - outinfo->param_exports = param_count; + free(outputs); } static void @@ -3104,7 +2869,8 @@ handle_es_outputs_post(struct radv_shader_context *ctx, LLVMValueRef wave_idx = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4); vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, LLVMBuildMul(ctx->ac.builder, wave_idx, - LLVMConstInt(ctx->ac.i32, 64, false), ""), ""); + LLVMConstInt(ctx->ac.i32, + ctx->ac.wave_size, false), ""), ""); lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), ""); } @@ -3120,11 +2886,11 @@ handle_es_outputs_post(struct radv_shader_context *ctx, if (ctx->stage == MESA_SHADER_VERTEX) { output_usage_mask = - ctx->shader_info->info.vs.output_usage_mask[i]; + ctx->shader_info->vs.output_usage_mask[i]; } else { assert(ctx->stage == MESA_SHADER_TESS_EVAL); output_usage_mask = - ctx->shader_info->info.tes.output_usage_mask[i]; + ctx->shader_info->tes.output_usage_mask[i]; } param_index = shader_io_get_unique_index(i); @@ -3166,7 +2932,7 @@ static void handle_ls_outputs_post(struct radv_shader_context *ctx) { 
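	/* LS outputs are written to LDS for the HS stage to read; each
	 * vertex gets num_tcs_inputs * 4 dwords starting at
	 * vertex_id * vertex_dw_stride. */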
LLVMValueRef vertex_id = ctx->rel_auto_id; - uint32_t num_tcs_inputs = util_last_bit64(ctx->shader_info->info.vs.ls_outputs_written); + uint32_t num_tcs_inputs = util_last_bit64(ctx->shader_info->vs.ls_outputs_written); LLVMValueRef vertex_dw_stride = LLVMConstInt(ctx->ac.i32, num_tcs_inputs * 4, false); LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, ""); @@ -3191,15 +2957,690 @@ handle_ls_outputs_post(struct radv_shader_context *ctx) } } -static void -write_tess_factors(struct radv_shader_context *ctx) +static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx) { - unsigned stride, outer_comps, inner_comps; - struct ac_build_if_state if_ctx, inner_if_ctx; - LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 8, 5); - LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8); - unsigned tess_inner_index = 0, tess_outer_index; - LLVMValueRef lds_base, lds_inner = NULL, lds_outer, byteoffset, buffer; + return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4); +} + +static LLVMValueRef get_tgsize(struct radv_shader_context *ctx) +{ + return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 28, 4); +} + +static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""); + return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), ""); +} + +static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx) +{ + return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, + LLVMConstInt(ctx->ac.i32, 12, false), + LLVMConstInt(ctx->ac.i32, 9, false), + false); +} + +static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx) +{ + return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, + LLVMConstInt(ctx->ac.i32, 22, false), + LLVMConstInt(ctx->ac.i32, 9, false), + false); +} + +static LLVMValueRef +ngg_gs_get_vertex_storage(struct radv_shader_context *ctx) +{ + unsigned num_outputs = util_bitcount64(ctx->output_mask); + + LLVMTypeRef elements[2] = { + LLVMArrayType(ctx->ac.i32, 4 * num_outputs), + LLVMArrayType(ctx->ac.i8, 4), + }; + LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false); + type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS); + return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, ""); +} + +/** + * Return a pointer to the LDS storage reserved for the N'th vertex, where N + * is in emit order; that is: + * - during the epilogue, N is the threadidx (relative to the entire threadgroup) + * - during vertex emit, i.e. while the API GS shader invocation is running, + * N = threadidx * gs_max_out_vertices + emitidx + * + * Goals of the LDS memory layout: + * 1. Eliminate bank conflicts on write for geometry shaders that have all emits + * in uniform control flow + * 2. Eliminate bank conflicts on read for export if, additionally, there is no + * culling + * 3. Agnostic to the number of waves (since we don't know it before compiling) + * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.) + * 5. Avoid wasting memory. + * + * We use an AoS layout due to point 4 (this also helps point 3). In an AoS + * layout, elimination of bank conflicts requires that each vertex occupy an + * odd number of dwords. 
We use the additional dword to store the output stream + * index as well as a flag to indicate whether this vertex ends a primitive + * for rasterization. + * + * Swizzling is required to satisfy points 1 and 2 simultaneously. + * + * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx). + * Indices are swizzled in groups of 32, which ensures point 1 without + * disturbing point 2. + * + * \return an LDS pointer to type {[N x i32], [4 x i8]} + */ +static LLVMValueRef +ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx); + + /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */ + unsigned write_stride_2exp = ffs(ctx->shader->info.gs.vertices_out) - 1; + if (write_stride_2exp) { + LLVMValueRef row = + LLVMBuildLShr(builder, vertexidx, + LLVMConstInt(ctx->ac.i32, 5, false), ""); + LLVMValueRef swizzle = + LLVMBuildAnd(builder, row, + LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, + false), ""); + vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, ""); + } + + return ac_build_gep0(&ctx->ac, storage, vertexidx); +} + +static LLVMValueRef +ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, + LLVMValueRef emitidx) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + + tmp = LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false); + tmp = LLVMBuildMul(builder, tmp, gsthread, ""); + const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, ""); + return ngg_gs_vertex_ptr(ctx, vertexidx); +} + +/* Send GS Alloc Req message from the first wave of the group to SPI. + * Message payload is: + * - bits 0..10: vertices in group + * - bits 12..22: primitives in group + */ +static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx, + LLVMValueRef vtx_cnt, + LLVMValueRef prim_cnt) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + + tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5020); + + tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),""); + tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp); + + ac_build_endif(&ctx->ac, 5020); +} + +struct ngg_prim { + unsigned num_vertices; + LLVMValueRef isnull; + LLVMValueRef index[3]; + LLVMValueRef edgeflag[3]; +}; + +static void build_export_prim(struct radv_shader_context *ctx, + const struct ngg_prim *prim) +{ + LLVMBuilderRef builder = ctx->ac.builder; + struct ac_export_args args; + LLVMValueRef tmp; + + tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, ""); + args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), ""); + + for (unsigned i = 0; i < prim->num_vertices; ++i) { + tmp = LLVMBuildShl(builder, prim->index[i], + LLVMConstInt(ctx->ac.i32, 10 * i, false), ""); + args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, ""); + tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, ""); + tmp = LLVMBuildShl(builder, tmp, + LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), ""); + args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, ""); + } + + args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, ""); + args.out[1] = LLVMGetUndef(ctx->ac.f32); + args.out[2] = LLVMGetUndef(ctx->ac.f32); + args.out[3] = LLVMGetUndef(ctx->ac.f32); + + args.target = V_008DFC_SQ_EXP_PRIM; + args.enabled_channels = 1; + 
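/* [Editorial sketch, not part of this patch] Plain-C models of the NGG bit
 * layouts used above; all function names are hypothetical. tg_vtx_cnt and
 * tg_prim_cnt mirror ngg_get_vtx_cnt/ngg_get_prim_cnt (ac_build_bfe at
 * offsets 12 and 22, width 9), gs_alloc_req_payload mirrors the message
 * packing in build_sendmsg_gs_alloc_req, and swizzle_vertex_index mirrors
 * the bank-conflict swizzle in ngg_gs_vertex_ptr. */
#include <stdint.h>
#include <strings.h> /* ffs() */

static uint32_t tg_vtx_cnt(uint32_t gs_tg_info)
{
        return (gs_tg_info >> 12) & 0x1ff; /* bits 12..20 */
}

static uint32_t tg_prim_cnt(uint32_t gs_tg_info)
{
        return (gs_tg_info >> 22) & 0x1ff; /* bits 22..30 */
}

static uint32_t gs_alloc_req_payload(uint32_t vtx_cnt, uint32_t prim_cnt)
{
        return (prim_cnt << 12) | vtx_cnt; /* vertices 0..10, prims 12..22 */
}

static uint32_t swizzle_vertex_index(uint32_t vertexidx, unsigned vertices_out)
{
        /* vertices_out = 2^(write_stride_2exp) * some odd number, and must
         * be >= 1, as the GS shader info guarantees. */
        unsigned write_stride_2exp = ffs(vertices_out) - 1;
        if (write_stride_2exp) {
                uint32_t row = vertexidx >> 5; /* swizzle in groups of 32 */
                uint32_t swizzle = row & ((1u << write_stride_2exp) - 1);
                vertexidx ^= swizzle;
        }
        return vertexidx;
}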
args.done = true; + args.valid_mask = false; + args.compr = false; + + ac_build_export(&ctx->ac, &args); +} + +static void +handle_ngg_outputs_post(struct radv_shader_context *ctx) +{ + LLVMBuilderRef builder = ctx->ac.builder; + unsigned num_vertices = 3; + LLVMValueRef tmp; + + assert((ctx->stage == MESA_SHADER_VERTEX || + ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->is_gs_copy_shader); + + LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8); + LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 0, 8); + LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), prims_in_wave, ""); + LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), vtx_in_wave, ""); + LLVMValueRef vtxindex[] = { + ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 0, 16), + ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 16, 16), + ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16), + }; + + /* TODO: streamout */ + + /* Copy Primitive IDs from GS threads to the LDS address corresponding + * to the ES thread of the provoking vertex. + */ + if (ctx->stage == MESA_SHADER_VERTEX && + ctx->options->key.vs_common_out.export_prim_id) { + /* TODO: streamout */ + + ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); + /* Extract the PROVOKING_VTX_INDEX field. */ + LLVMValueRef provoking_vtx_in_prim = + LLVMConstInt(ctx->ac.i32, 0, false); + + /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */ + LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3); + LLVMValueRef provoking_vtx_index = + LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, ""); + + LLVMBuildStore(builder, ctx->abi.gs_prim_id, + ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index)); + ac_build_endif(&ctx->ac, 5400); + } + + /* TODO: primitive culling */ + + build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); + + /* TODO: streamout queries */ + /* Export primitive data to the index buffer. Format is: + * - bits 0..8: index 0 + * - bit 9: edge flag 0 + * - bits 10..18: index 1 + * - bit 19: edge flag 1 + * - bits 20..28: index 2 + * - bit 29: edge flag 2 + * - bit 31: null primitive (skip) + * + * For the first version, we will always build up all three indices + * independent of the primitive type. The additional garbage data + * shouldn't hurt. + * + * TODO: culling depends on the primitive type, so can have some + * interaction here. + */ + ac_build_ifcc(&ctx->ac, is_gs_thread, 6001); + { + struct ngg_prim prim = {}; + + prim.num_vertices = num_vertices; + prim.isnull = ctx->ac.i1false; + memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); + + for (unsigned i = 0; i < num_vertices; ++i) { + tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id, + LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); + prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + } + + build_export_prim(ctx, &prim); + } + ac_build_endif(&ctx->ac, 6001); + + /* Export per-vertex data (positions and parameters). */ + ac_build_ifcc(&ctx->ac, is_es_thread, 6002); + { + struct radv_vs_output_info *outinfo = + ctx->stage == MESA_SHADER_TESS_EVAL ? &ctx->shader_info->tes.outinfo : &ctx->shader_info->vs.outinfo; + + /* Exporting the primitive ID is handled below. 
*/ + /* TODO: use the new VS export path */ + handle_vs_outputs_post(ctx, false, + ctx->options->key.vs_common_out.export_clip_dists, + outinfo); + + if (ctx->options->key.vs_common_out.export_prim_id) { + unsigned param_count = outinfo->param_exports; + LLVMValueRef values[4]; + + if (ctx->stage == MESA_SHADER_VERTEX) { + /* Wait for GS stores to finish. */ + ac_build_s_barrier(&ctx->ac); + + tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, + get_thread_id_in_tg(ctx)); + values[0] = LLVMBuildLoad(builder, tmp, ""); + } else { + assert(ctx->stage == MESA_SHADER_TESS_EVAL); + values[0] = ctx->abi.tes_patch_id; + } + + values[0] = ac_to_float(&ctx->ac, values[0]); + for (unsigned j = 1; j < 4; j++) + values[j] = ctx->ac.f32_0; + + radv_export_param(ctx, param_count, values, 0x1); + + outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count++; + outinfo->param_exports = param_count; + } + } + ac_build_endif(&ctx->ac, 6002); +} + +static void gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx) +{ + /* Zero out the part of LDS scratch that is used to accumulate the + * per-stream generated primitive count. + */ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef scratchptr = ctx->gs_ngg_scratch; + LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMBasicBlockRef merge_block; + LLVMValueRef cond; + + LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder)); + LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, ""); + merge_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, ""); + + cond = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), ""); + LLVMBuildCondBr(ctx->ac.builder, cond, then_block, merge_block); + LLVMPositionBuilderAtEnd(ctx->ac.builder, then_block); + + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid); + LLVMBuildStore(builder, ctx->ac.i32_0, ptr); + + LLVMBuildBr(ctx->ac.builder, merge_block); + LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block); + + ac_build_s_barrier(&ctx->ac); +} + +static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false); + LLVMValueRef tmp; + + /* Zero out remaining (non-emitted) primitive flags. + * + * Note: Alternatively, we could pass the relevant gs_next_vertex to + * the emit threads via LDS. This is likely worse in the expected + * typical case where each GS thread emits the full set of + * vertices. 
+ */ + for (unsigned stream = 0; stream < 4; ++stream) { + unsigned num_components; + + num_components = + ctx->shader_info->gs.num_stream_output_components[stream]; + if (!num_components) + continue; + + const LLVMValueRef gsthread = get_thread_id_in_tg(ctx); + + ac_build_bgnloop(&ctx->ac, 5100); + + const LLVMValueRef vertexidx = + LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); + tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx, + LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5101); + ac_build_break(&ctx->ac); + ac_build_endif(&ctx->ac, 5101); + + tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); + + tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_1, /* second entry of struct */ + LLVMConstInt(ctx->ac.i32, stream, false), + }; + tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); + LLVMBuildStore(builder, i8_0, tmp); + + ac_build_endloop(&ctx->ac, 5100); + } +} + +static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) +{ + const unsigned verts_per_prim = si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp, tmp2; + + ac_build_s_barrier(&ctx->ac); + + const LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); + + /* TODO: streamout */ + + /* TODO: culling */ + + /* Determine vertex liveness. */ + LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive"); + + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + ac_build_ifcc(&ctx->ac, tmp, 5120); + { + for (unsigned i = 0; i < verts_per_prim; ++i) { + const LLVMValueRef primidx = + LLVMBuildAdd(builder, tid, + LLVMConstInt(ctx->ac.i32, i, false), ""); + + if (i > 0) { + tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, ""); + ac_build_ifcc(&ctx->ac, tmp, 5121 + i); + } + + /* Load primitive liveness */ + tmp = ngg_gs_vertex_ptr(ctx, primidx); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implicit C-style array */ + ctx->ac.i32_1, /* second value of struct */ + ctx->ac.i32_0, /* stream 0 */ + }; + tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); + tmp = LLVMBuildLoad(builder, tmp, ""); + const LLVMValueRef primlive = + LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + + tmp = LLVMBuildLoad(builder, vertliveptr, ""); + tmp = LLVMBuildOr(builder, tmp, primlive, ""), + LLVMBuildStore(builder, tmp, vertliveptr); + + if (i > 0) + ac_build_endif(&ctx->ac, 5121 + i); + } + } + ac_build_endif(&ctx->ac, 5120); + + /* Inclusive scan addition across the current wave. */ + LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, ""); + struct ac_wg_scan vertlive_scan = {}; + vertlive_scan.op = nir_op_iadd; + vertlive_scan.enable_reduce = true; + vertlive_scan.enable_exclusive = true; + vertlive_scan.src = vertlive; + vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0); + vertlive_scan.waveidx = get_wave_id_in_tg(ctx); + vertlive_scan.numwaves = get_tgsize(ctx); + vertlive_scan.maxwaves = 8; + + ac_build_wg_scan(&ctx->ac, &vertlive_scan); + + /* Skip all exports (including index exports) when possible. At least on + * early gfx10 revisions this is also to avoid hangs. 
+ */ + LLVMValueRef have_exports = + LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, ""); + num_emit_threads = + LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, ""); + + /* Allocate export space. Send this message as early as possible, to + * hide the latency of the SQ <-> SPI roundtrip. + * + * Note: We could consider compacting primitives for export as well. + * PA processes 1 non-null prim / clock, but it fetches 4 DW of + * prim data per clock and skips null primitives at no additional + * cost. So compacting primitives can only be beneficial when + * there are 4 or more contiguous null primitives in the export + * (in the common case of single-dword prim exports). + */ + build_sendmsg_gs_alloc_req(ctx, vertlive_scan.result_reduce, num_emit_threads); + + /* Setup the reverse vertex compaction permutation. We re-use stream 1 + * of the primitive liveness flags, relying on the fact that each + * threadgroup can have at most 256 threads. */ + ac_build_ifcc(&ctx->ac, vertlive, 5130); + { + tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implicit C-style array */ + ctx->ac.i32_1, /* second value of struct */ + ctx->ac.i32_1, /* stream 1 */ + }; + tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); + tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, ""); + LLVMBuildStore(builder, tmp2, tmp); + } + ac_build_endif(&ctx->ac, 5130); + + ac_build_s_barrier(&ctx->ac); + + /* Export primitive data */ + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + ac_build_ifcc(&ctx->ac, tmp, 5140); + { + struct ngg_prim prim = {}; + prim.num_vertices = verts_per_prim; + + tmp = ngg_gs_vertex_ptr(ctx, tid); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implicit C-style array */ + ctx->ac.i32_1, /* second value of struct */ + ctx->ac.i32_0, /* primflag */ + }; + tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); + tmp = LLVMBuildLoad(builder, tmp, ""); + prim.isnull = LLVMBuildICmp(builder, LLVMIntEQ, tmp, + LLVMConstInt(ctx->ac.i8, 0, false), ""); + + for (unsigned i = 0; i < verts_per_prim; ++i) { + prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, + LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); + prim.edgeflag[i] = ctx->ac.i1false; + } + + build_export_prim(ctx, &prim); + } + ac_build_endif(&ctx->ac, 5140); + + /* Export position and parameter data */ + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, ""); + ac_build_ifcc(&ctx->ac, tmp, 5145); + { + struct radv_vs_output_info *outinfo = &ctx->shader_info->vs.outinfo; + bool export_view_index = ctx->options->key.has_multiview_view_index; + struct radv_shader_output_values *outputs; + unsigned noutput = 0; + + /* Allocate a temporary array for the output values. 
*/ + unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index; + outputs = calloc(num_outputs, sizeof(outputs[0])); + + memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, + sizeof(outinfo->vs_output_param_offset)); + outinfo->pos_exports = 0; + + tmp = ngg_gs_vertex_ptr(ctx, tid); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implicit C-style array */ + ctx->ac.i32_1, /* second value of struct */ + ctx->ac.i32_1, /* stream 1: source data index */ + }; + tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); + tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); + const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); + + unsigned out_idx = 0; + gep_idx[1] = ctx->ac.i32_0; + for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { + if (!(ctx->output_mask & (1ull << i))) + continue; + + outputs[noutput].slot_name = i; + outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; + + outputs[noutput].usage_mask = ctx->shader_info->gs.output_usage_mask[i]; + int length = util_last_bit(outputs[noutput].usage_mask); + + for (unsigned j = 0; j < length; j++, out_idx++) { + gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false); + tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); + tmp = LLVMBuildLoad(builder, tmp, ""); + + LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]); + if (ac_get_type_size(type) == 2) { + tmp = ac_to_integer(&ctx->ac, tmp); + tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i16, ""); + } + + outputs[noutput].values[j] = ac_to_float(&ctx->ac, tmp); + } + + for (unsigned j = length; j < 4; j++) + outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32); + + noutput++; + } + + /* Export ViewIndex. */ + if (export_view_index) { + outputs[noutput].slot_name = VARYING_SLOT_LAYER; + outputs[noutput].slot_index = 0; + outputs[noutput].usage_mask = 0x1; + outputs[noutput].values[0] = ac_to_float(&ctx->ac, ctx->abi.view_index); + for (unsigned j = 1; j < 4; j++) + outputs[noutput].values[j] = ctx->ac.f32_0; + noutput++; + } + + radv_llvm_export_vs(ctx, outputs, noutput, outinfo, + ctx->options->key.vs_common_out.export_clip_dists); + FREE(outputs); + } + ac_build_endif(&ctx->ac, 5145); +} + +static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, + unsigned stream, + LLVMValueRef *addrs) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + const LLVMValueRef vertexidx = + LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); + + /* If this thread has already emitted the declared maximum number of + * vertices, skip the write: excessive vertex emissions are not + * supposed to have any effect. 
+ */ + const LLVMValueRef can_emit = + LLVMBuildICmp(builder, LLVMIntULT, vertexidx, + LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); + ac_build_kill_if_false(&ctx->ac, can_emit); + + tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); + tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); + LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); + + const LLVMValueRef vertexptr = + ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx); + unsigned out_idx = 0; + for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { + unsigned output_usage_mask = + ctx->shader_info->gs.output_usage_mask[i]; + uint8_t output_stream = + ctx->shader_info->gs.output_streams[i]; + LLVMValueRef *out_ptr = &addrs[i * 4]; + int length = util_last_bit(output_usage_mask); + + if (!(ctx->output_mask & (1ull << i)) || + output_stream != stream) + continue; + + for (unsigned j = 0; j < length; j++, out_idx++) { + if (!(output_usage_mask & (1 << j))) + continue; + + LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, + out_ptr[j], ""); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_0, /* first entry of struct */ + LLVMConstInt(ctx->ac.i32, out_idx, false), + }; + LLVMValueRef ptr = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); + + out_val = ac_to_integer(&ctx->ac, out_val); + out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, ""); + + LLVMBuildStore(builder, out_val, ptr); + } + } + assert(out_idx * 4 <= ctx->gsvs_vertex_size); + + /* Determine and store whether this vertex completed a primitive. */ + const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); + + tmp = LLVMConstInt(ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) - 1, false); + const LLVMValueRef iscompleteprim = + LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); + + tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]); + + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_1, /* second struct entry */ + LLVMConstInt(ctx->ac.i32, stream, false), + }; + const LLVMValueRef primflagptr = + LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); + + tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); + LLVMBuildStore(builder, tmp, primflagptr); + + tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); + tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); + LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]); +} + +static void +write_tess_factors(struct radv_shader_context *ctx) +{ + unsigned stride, outer_comps, inner_comps; + LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 8, 5); + LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8); + unsigned tess_inner_index = 0, tess_outer_index; + LLVMValueRef lds_base, lds_inner = NULL, lds_outer, byteoffset, buffer; LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; int i; ac_emit_barrier(&ctx->ac, ctx->stage); @@ -3224,9 +3665,9 @@ write_tess_factors(struct radv_shader_context *ctx) return; } - ac_nir_build_if(&if_ctx, ctx, + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - invocation_id, ctx->ac.i32_0, "")); + invocation_id, ctx->ac.i32_0, ""), 6503); lds_base = get_tcs_out_current_patch_data_offset(ctx); @@ -3281,9 +3722,9 @@ write_tess_factors(struct 
radv_shader_context *ctx) unsigned tf_offset = 0; if (ctx->options->chip_class <= GFX8) { - ac_nir_build_if(&inner_if_ctx, ctx, + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - rel_patch_id, ctx->ac.i32_0, "")); + rel_patch_id, ctx->ac.i32_0, ""), 6504); /* Store the dynamic HS control word. */ ac_build_buffer_store_dword(&ctx->ac, buffer, @@ -3292,7 +3733,7 @@ write_tess_factors(struct radv_shader_context *ctx) 0, ac_glc, false); tf_offset += 4; - ac_nir_build_endif(&inner_if_ctx); + ac_build_endif(&ctx->ac, 6504); } /* Store the tessellation factors. */ @@ -3332,7 +3773,8 @@ write_tess_factors(struct radv_shader_context *ctx) ctx->oc_lds, 0, ac_glc, false); } } - ac_nir_build_endif(&if_ctx); + + ac_build_endif(&ctx->ac, 6503); } static void @@ -3395,15 +3837,15 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) } /* Process depth, stencil, samplemask. */ - if (ctx->shader_info->info.ps.writes_z) { + if (ctx->shader_info->ps.writes_z) { depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0)); } - if (ctx->shader_info->info.ps.writes_stencil) { + if (ctx->shader_info->ps.writes_stencil) { stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0)); } - if (ctx->shader_info->info.ps.writes_sample_mask) { + if (ctx->shader_info->ps.writes_sample_mask) { samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0)); } @@ -3412,9 +3854,9 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) * exported. */ if (index > 0 && - !ctx->shader_info->info.ps.writes_z && - !ctx->shader_info->info.ps.writes_stencil && - !ctx->shader_info->info.ps.writes_sample_mask) { + !ctx->shader_info->ps.writes_z && + !ctx->shader_info->ps.writes_stencil && + !ctx->shader_info->ps.writes_sample_mask) { unsigned last = index - 1; color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */ @@ -3434,6 +3876,14 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) static void emit_gs_epilogue(struct radv_shader_context *ctx) { + if (ctx->options->key.vs_common_out.as_ngg) { + gfx10_ngg_gs_emit_epilogue_1(ctx); + return; + } + + if (ctx->ac.chip_class >= GFX10) + LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, ""); + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id); } @@ -3445,14 +3895,15 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, switch (ctx->stage) { case MESA_SHADER_VERTEX: - if (ctx->options->key.vs.as_ls) + if (ctx->options->key.vs_common_out.as_ls) handle_ls_outputs_post(ctx); - else if (ctx->options->key.vs.as_es) + else if (ctx->options->key.vs_common_out.as_es) handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info); + else if (ctx->options->key.vs_common_out.as_ngg) + break; /* handled outside of the shader body */ else - handle_vs_outputs_post(ctx, ctx->options->key.vs.export_prim_id, - ctx->options->key.vs.export_layer_id, - ctx->options->key.vs.export_clip_dists, + handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id, + ctx->options->key.vs_common_out.export_clip_dists, &ctx->shader_info->vs.outinfo); break; case MESA_SHADER_FRAGMENT: @@ -3465,12 +3916,13 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, handle_tcs_outputs_post(ctx); break; case MESA_SHADER_TESS_EVAL: - if (ctx->options->key.tes.as_es) + if (ctx->options->key.vs_common_out.as_es) handle_es_outputs_post(ctx, &ctx->shader_info->tes.es_info); + else if (ctx->options->key.vs_common_out.as_ngg) 
+ break; /* handled outside of the shader body */ else - handle_vs_outputs_post(ctx, ctx->options->key.tes.export_prim_id, - ctx->options->key.tes.export_layer_id, - ctx->options->key.tes.export_clip_dists, + handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id, + ctx->options->key.vs_common_out.export_clip_dists, &ctx->shader_info->tes.outinfo); break; default: @@ -3500,13 +3952,13 @@ ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx) case MESA_SHADER_GEOMETRY: return; case MESA_SHADER_VERTEX: - if (ctx->options->key.vs.as_ls || - ctx->options->key.vs.as_es) + if (ctx->options->key.vs_common_out.as_ls || + ctx->options->key.vs_common_out.as_es) return; outinfo = &ctx->shader_info->vs.outinfo; break; case MESA_SHADER_TESS_EVAL: - if (ctx->options->key.vs.as_es) + if (ctx->options->key.vs_common_out.as_es) return; outinfo = &ctx->shader_info->tes.outinfo; break; @@ -3526,7 +3978,7 @@ ac_setup_rings(struct radv_shader_context *ctx) { if (ctx->options->chip_class <= GFX8 && (ctx->stage == MESA_SHADER_GEOMETRY || - ctx->options->key.vs.as_es || ctx->options->key.tes.as_es)) { + ctx->options->key.vs_common_out.as_es)) { unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS; LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false); @@ -3554,7 +4006,7 @@ ac_setup_rings(struct radv_shader_context *ctx) */ LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2); uint64_t stream_offset = 0; - unsigned num_records = 64; + unsigned num_records = ctx->ac.wave_size; LLVMValueRef base_ring; base_ring = @@ -3567,12 +4019,12 @@ ac_setup_rings(struct radv_shader_context *ctx) LLVMValueRef ring, tmp; num_components = - ctx->shader_info->info.gs.num_stream_output_components[stream]; + ctx->shader_info->gs.num_stream_output_components[stream]; if (!num_components) continue; - stride = 4 * num_components * ctx->gs_max_out_vertices; + stride = 4 * num_components * ctx->shader->info.gs.vertices_out; /* Limit on the stride field for <= GFX7. */ assert(stride < (1 << 14)); @@ -3587,7 +4039,7 @@ ac_setup_rings(struct radv_shader_context *ctx) ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_0, ""); - stream_offset += stride * 64; + stream_offset += stride * ctx->ac.wave_size; ring = LLVMBuildBitCast(ctx->ac.builder, ring, ctx->ac.v4i32, ""); @@ -3618,23 +4070,11 @@ ac_setup_rings(struct radv_shader_context *ctx) unsigned radv_nir_get_max_workgroup_size(enum chip_class chip_class, + gl_shader_stage stage, const struct nir_shader *nir) { - switch (nir->info.stage) { - case MESA_SHADER_TESS_CTRL: - return chip_class >= GFX7 ? 128 : 64; - case MESA_SHADER_GEOMETRY: - return chip_class >= GFX9 ? 128 : 64; - case MESA_SHADER_COMPUTE: - break; - default: - return 0; - } - - unsigned max_workgroup_size = nir->info.cs.local_size[0] * - nir->info.cs.local_size[1] * - nir->info.cs.local_size[2]; - return max_workgroup_size; + const unsigned backup_sizes[] = {chip_class >= GFX9 ? 128 : 64, 1, 1}; + return radv_get_max_workgroup_size(chip_class, stage, nir ? nir->info.cs.local_size : backup_sizes); } /* Fixup the HW not emitting the TCS regs if there are no HS threads. */ @@ -3658,12 +4098,31 @@ static void prepare_gs_input_vgprs(struct radv_shader_context *ctx) ctx->gs_wave_id = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 16, 8); } +/* Ensure that the esgs ring is declared. + * + * We declare it with 64KB alignment as a hint that the + * pointer value will always be 0. 
+ */ +static void declare_esgs_ring(struct radv_shader_context *ctx) +{ + if (ctx->esgs_ring) + return; + + assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); + + ctx->esgs_ring = LLVMAddGlobalInAddressSpace( + ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), + "esgs_ring", + AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); + LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); +} static LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders, int shader_count, - struct radv_shader_variant_info *shader_info, + struct radv_shader_info *shader_info, const struct radv_nir_compiler_options *options) { struct radv_shader_context ctx = {0}; @@ -3671,24 +4130,20 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.options = options; ctx.shader_info = shader_info; - ac_llvm_context_init(&ctx.ac, options->chip_class, options->family); - ctx.context = ctx.ac.context; - ctx.ac.module = ac_create_module(ac_llvm->tm, ctx.context); - enum ac_float_mode float_mode = options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH : AC_FLOAT_MODE_DEFAULT; - ctx.ac.builder = ac_create_builder(ctx.context, float_mode); - - memset(shader_info, 0, sizeof(*shader_info)); + ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, + options->family, float_mode, options->wave_size, 64); + ctx.context = ctx.ac.context; - radv_nir_shader_info_init(&shader_info->info); + radv_nir_shader_info_init(shader_info); for(int i = 0; i < shader_count; ++i) - radv_nir_shader_info_pass(shaders[i], options, &shader_info->info); + radv_nir_shader_info_pass(shaders[i], options, shader_info); - for (i = 0; i < RADV_UD_MAX_SETS; i++) + for (i = 0; i < MAX_SETS; i++) shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1; for (i = 0; i < AC_UD_MAX_UD; i++) shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1; @@ -3697,7 +4152,15 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, for (int i = 0; i < shader_count; ++i) { ctx.max_workgroup_size = MAX2(ctx.max_workgroup_size, radv_nir_get_max_workgroup_size(ctx.options->chip_class, - shaders[i])); + shaders[i]->info.stage, + shaders[i])); + } + + if (ctx.ac.chip_class >= GFX10) { + if (is_pre_gs_stage(shaders[0]->info.stage) && + options->key.vs_common_out.as_ngg) { + ctx.max_workgroup_size = 128; + } } create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2, @@ -3711,24 +4174,19 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.abi.load_sampler_desc = radv_get_sampler_desc; ctx.abi.load_resource = radv_load_resource; ctx.abi.clamp_shadow_reference = false; - ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x800; + ctx.abi.robust_buffer_access = options->robust_buffer_access; - /* Because the new raw/struct atomic intrinsics are buggy with LLVM 8, - * we fallback to the old intrinsics for atomic buffer image operations - * and thus we need to apply the indexing workaround... 
- */ - ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900; - - if (shader_count >= 2) + bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && ctx.options->key.vs_common_out.as_ngg; + if (shader_count >= 2 || is_ngg) ac_init_exec_full_mask(&ctx.ac); - if ((ctx.ac.family == CHIP_VEGA10 || - ctx.ac.family == CHIP_RAVEN) && + if (options->has_ls_vgpr_init_bug && shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL) ac_nir_fixup_ls_hs_input_vgprs(&ctx); for(int i = 0; i < shader_count; ++i) { ctx.stage = shaders[i]->info.stage; + ctx.shader = shaders[i]; ctx.output_mask = 0; if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) { @@ -3736,40 +4194,90 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.gs_next_vertex[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); } - ctx.gs_max_out_vertices = shaders[i]->info.gs.vertices_out; + if (ctx.options->key.vs_common_out.as_ngg) { + for (unsigned i = 0; i < 4; ++i) { + ctx.gs_curprim_verts[i] = + ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); + ctx.gs_generated_prims[i] = + ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); + } + + /* TODO: streamout */ + + LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8); + ctx.gs_ngg_scratch = + LLVMAddGlobalInAddressSpace(ctx.ac.module, + ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32)); + LLVMSetAlignment(ctx.gs_ngg_scratch, 4); + + ctx.gs_ngg_emit = LLVMBuildIntToPtr(ctx.ac.builder, ctx.ac.i32_0, + LLVMPointerType(LLVMArrayType(ctx.ac.i32, 0), AC_ADDR_SPACE_LDS), + "ngg_emit"); + } + ctx.abi.load_inputs = load_gs_input; ctx.abi.emit_primitive = visit_end_primitive; } else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { - ctx.tcs_outputs_read = shaders[i]->info.outputs_read; - ctx.tcs_patch_outputs_read = shaders[i]->info.patch_outputs_read; ctx.abi.load_tess_varyings = load_tcs_varyings; ctx.abi.load_patch_vertices_in = load_patch_vertices_in; ctx.abi.store_tcs_outputs = store_tcs_output; - ctx.tcs_vertices_per_patch = shaders[i]->info.tess.tcs_vertices_out; if (shader_count == 1) ctx.tcs_num_inputs = ctx.options->key.tcs.num_inputs; else - ctx.tcs_num_inputs = util_last_bit64(shader_info->info.vs.ls_outputs_written); + ctx.tcs_num_inputs = util_last_bit64(shader_info->vs.ls_outputs_written); ctx.tcs_num_patches = get_tcs_num_patches(&ctx); } else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) { - ctx.tes_primitive_mode = shaders[i]->info.tess.primitive_mode; ctx.abi.load_tess_varyings = load_tes_input; ctx.abi.load_tess_coord = load_tess_coord; ctx.abi.load_patch_vertices_in = load_patch_vertices_in; - ctx.tcs_vertices_per_patch = shaders[i]->info.tess.tcs_vertices_out; ctx.tcs_num_patches = ctx.options->key.tes.num_patches; } else if (shaders[i]->info.stage == MESA_SHADER_VERTEX) { ctx.abi.load_base_vertex = radv_load_base_vertex; } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) { - shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard; - ctx.abi.lookup_interp_param = lookup_interp_param; ctx.abi.load_sample_position = load_sample_position; ctx.abi.load_sample_mask_in = load_sample_mask_in; ctx.abi.emit_kill = radv_emit_kill; } - if (i) + if (shaders[i]->info.stage == MESA_SHADER_VERTEX && + ctx.options->key.vs_common_out.as_ngg && + ctx.options->key.vs_common_out.export_prim_id) { + declare_esgs_ring(&ctx); + } + + bool nested_barrier = false; + + if (i) { + if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY && + ctx.options->key.vs_common_out.as_ngg) { + 
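/* [Editorial sketch, not part of this patch] A scalar model (hypothetical
 * name) of the per-stream bookkeeping these gs_curprim_verts and
 * gs_generated_prims allocas feed in gfx10_ngg_gs_emit_vertex below: a
 * vertex completes a primitive once the running count for the current
 * primitive reaches verts_per_prim - 1, and the completion bit is what is
 * stored in the vertex's primflag byte. */
#include <stdbool.h>
#include <stdint.h>

static bool model_emit_vertex(uint32_t *curprim_verts,
                              uint32_t *generated_prims,
                              uint32_t verts_per_prim)
{
        bool iscompleteprim = *curprim_verts >= verts_per_prim - 1;
        (*curprim_verts)++;
        *generated_prims += iscompleteprim;
        return iscompleteprim; /* this vertex's primflag */
}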
gfx10_ngg_gs_emit_prologue(&ctx); + nested_barrier = false; + } else { + nested_barrier = true; + } + } + + if (nested_barrier) { + /* Execute a barrier before the second shader in + * a merged shader. + * + * Execute the barrier inside the conditional block, + * so that empty waves can jump directly to s_endpgm, + * which will also signal the barrier. + * + * This is possible in gfx9, because an empty wave + * for the second shader does not participate in + * the epilogue. With NGG, empty waves may still + * be required to export data (e.g. GS output vertices), + * so we cannot let them exit early. + * + * If the shader is TCS and the TCS epilog is present + * and contains a barrier, it will wait there and then + * reach s_endpgm. + */ ac_emit_barrier(&ctx.ac, ctx.stage); + } nir_foreach_variable(variable, &shaders[i]->outputs) scan_shader_output_decl(&ctx, variable, shaders[i], shaders[i]->info.stage); @@ -3785,7 +4293,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ac_setup_rings(&ctx); LLVMBasicBlockRef merge_block; - if (shader_count >= 2) { + if (shader_count >= 2 || is_ngg) { LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, ""); merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, ""); @@ -3800,7 +4308,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, } if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) - handle_fs_inputs(&ctx, shaders[i]); + prepare_interp_optimize(&ctx, shaders[i]); else if(shaders[i]->info.stage == MESA_SHADER_VERTEX) handle_vs_inputs(&ctx, shaders[i]); else if(shader_count >= 2 && shaders[i]->info.stage == MESA_SHADER_GEOMETRY) @@ -3808,11 +4316,22 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i]); - if (shader_count >= 2) { + if (shader_count >= 2 || is_ngg) { LLVMBuildBr(ctx.ac.builder, merge_block); LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block); } + /* This needs to be outside the if wrapping the shader body, as sometimes + * the HW generates waves with 0 es/vs threads. 
*/ + if (is_pre_gs_stage(shaders[i]->info.stage) && + ctx.options->key.vs_common_out.as_ngg && + i == shader_count - 1) { + handle_ngg_outputs_post(&ctx); + } else if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY && + ctx.options->key.vs_common_out.as_ngg) { + gfx10_ngg_gs_emit_epilogue_2(&ctx); + } + if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) { shader_info->gs.gsvs_vertex_size = ctx.gsvs_vertex_size; shader_info->gs.max_gsvs_emit_size = ctx.max_gsvs_emit_size; @@ -3824,8 +4343,13 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, LLVMBuildRetVoid(ctx.ac.builder); - if (options->dump_preoptir) + if (options->dump_preoptir) { + fprintf(stderr, "%s LLVM IR:\n\n", + radv_get_shader_name(shader_info, + shaders[shader_count - 1]->info.stage)); ac_dump_module(ctx.ac.module); + fprintf(stderr, "\n"); + } ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options); @@ -3877,15 +4401,19 @@ static unsigned radv_llvm_compile(LLVMModuleRef M, static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, LLVMModuleRef llvm_module, struct radv_shader_binary **rbinary, - struct radv_shader_variant_info *shader_info, gl_shader_stage stage, + const char *name, const struct radv_nir_compiler_options *options) { char *elf_buffer = NULL; size_t elf_size = 0; char *llvm_ir_string = NULL; - if (options->dump_shader) + + if (options->dump_shader) { + fprintf(stderr, "%s LLVM IR:\n\n", name); ac_dump_module(llvm_module); + fprintf(stderr, "\n"); + } if (options->record_llvm_ir) { char *llvm_ir = LLVMPrintModuleToString(llvm_module); @@ -3920,48 +4448,10 @@ static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, free(elf_buffer); } -static void -ac_fill_shader_info(struct radv_shader_variant_info *shader_info, struct nir_shader *nir, const struct radv_nir_compiler_options *options) -{ - switch (nir->info.stage) { - case MESA_SHADER_COMPUTE: - for (int i = 0; i < 3; ++i) - shader_info->cs.block_size[i] = nir->info.cs.local_size[i]; - break; - case MESA_SHADER_FRAGMENT: - shader_info->fs.early_fragment_test = nir->info.fs.early_fragment_tests; - break; - case MESA_SHADER_GEOMETRY: - shader_info->gs.vertices_in = nir->info.gs.vertices_in; - shader_info->gs.vertices_out = nir->info.gs.vertices_out; - shader_info->gs.output_prim = nir->info.gs.output_primitive; - shader_info->gs.invocations = nir->info.gs.invocations; - break; - case MESA_SHADER_TESS_EVAL: - shader_info->tes.primitive_mode = nir->info.tess.primitive_mode; - shader_info->tes.spacing = nir->info.tess.spacing; - shader_info->tes.ccw = nir->info.tess.ccw; - shader_info->tes.point_mode = nir->info.tess.point_mode; - shader_info->tes.as_es = options->key.tes.as_es; - shader_info->tes.export_prim_id = options->key.tes.export_prim_id; - break; - case MESA_SHADER_TESS_CTRL: - shader_info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out; - break; - case MESA_SHADER_VERTEX: - shader_info->vs.as_es = options->key.vs.as_es; - shader_info->vs.as_ls = options->key.vs.as_ls; - shader_info->vs.export_prim_id = options->key.vs.export_prim_id; - break; - default: - break; - } -} - void radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary, - struct radv_shader_variant_info *shader_info, + struct radv_shader_info *shader_info, struct nir_shader *const *nir, int nir_count, const struct radv_nir_compiler_options *options) @@ -3972,19 +4462,20 @@ radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, 
shader_info, options); - ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, shader_info, - nir[nir_count - 1]->info.stage, options); - - for (int i = 0; i < nir_count; ++i) - ac_fill_shader_info(shader_info, nir[i], options); + ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, + nir[nir_count - 1]->info.stage, + radv_get_shader_name(shader_info, + nir[nir_count - 1]->info.stage), + options); /* Determine the ES type (VS or TES) for the GS on GFX9. */ - if (options->chip_class == GFX9) { + if (options->chip_class >= GFX9) { if (nir_count == 2 && nir[1]->info.stage == MESA_SHADER_GEOMETRY) { shader_info->gs.es_type = nir[0]->info.stage; } } + shader_info->wave_size = options->wave_size; } static void @@ -3996,7 +4487,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) LLVMValueRef stream_id; /* Fetch the vertex stream ID. */ - if (ctx->shader_info->info.so.num_outputs) { + if (ctx->shader_info->so.num_outputs) { stream_id = ac_unpack_param(&ctx->ac, ctx->streamout_config, 24, 2); } else { @@ -4012,14 +4503,14 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) for (unsigned stream = 0; stream < 4; stream++) { unsigned num_components = - ctx->shader_info->info.gs.num_stream_output_components[stream]; + ctx->shader_info->gs.num_stream_output_components[stream]; LLVMBasicBlockRef bb; unsigned offset; if (!num_components) continue; - if (stream > 0 && !ctx->shader_info->info.so.num_outputs) + if (stream > 0 && !ctx->shader_info->so.num_outputs) continue; bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out"); @@ -4029,9 +4520,9 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) offset = 0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->info.gs.output_usage_mask[i]; + ctx->shader_info->gs.output_usage_mask[i]; unsigned output_stream = - ctx->shader_info->info.gs.output_streams[i]; + ctx->shader_info->gs.output_streams[i]; int length = util_last_bit(output_usage_mask); if (!(ctx->output_mask & (1ull << i)) || @@ -4046,7 +4537,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) soffset = LLVMConstInt(ctx->ac.i32, offset * - ctx->gs_max_out_vertices * 16 * 4, false); + ctx->shader->info.gs.vertices_out * 16 * 4, false); offset++; @@ -4067,11 +4558,11 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) } } - if (ctx->shader_info->info.so.num_outputs) + if (ctx->shader_info->so.num_outputs) radv_emit_streamout(ctx, stream); if (stream == 0) { - handle_vs_outputs_post(ctx, false, false, true, + handle_vs_outputs_post(ctx, false, true, &ctx->shader_info->vs.outinfo); } @@ -4085,31 +4576,29 @@ void radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader, struct radv_shader_binary **rbinary, - struct radv_shader_variant_info *shader_info, + struct radv_shader_info *shader_info, const struct radv_nir_compiler_options *options) { struct radv_shader_context ctx = {0}; ctx.options = options; ctx.shader_info = shader_info; - ac_llvm_context_init(&ctx.ac, options->chip_class, options->family); - ctx.context = ctx.ac.context; - ctx.ac.module = ac_create_module(ac_llvm->tm, ctx.context); - - ctx.is_gs_copy_shader = true; - enum ac_float_mode float_mode = options->unsafe_math ? 
AC_FLOAT_MODE_UNSAFE_FP_MATH : AC_FLOAT_MODE_DEFAULT; - ctx.ac.builder = ac_create_builder(ctx.context, float_mode); + ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, + options->family, float_mode, 64, 64); + ctx.context = ctx.ac.context; + + ctx.is_gs_copy_shader = true; ctx.stage = MESA_SHADER_VERTEX; + ctx.shader = geom_shader; - radv_nir_shader_info_pass(geom_shader, options, &shader_info->info); + radv_nir_shader_info_pass(geom_shader, options, shader_info); create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX); - ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out; ac_setup_rings(&ctx); nir_foreach_variable(variable, &geom_shader->outputs) { @@ -4124,8 +4613,8 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options); - ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, shader_info, - MESA_SHADER_VERTEX, options); + ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, + MESA_SHADER_VERTEX, "GS Copy Shader", options); (*rbinary)->is_gs_copy_shader = true; }
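/* [Editorial sketch, not part of this patch] A scalar model (hypothetical
 * names) of the vertex compaction performed in gfx10_ngg_gs_emit_epilogue_2
 * above: an exclusive prefix sum over the per-thread liveness flags assigns
 * each live vertex its compacted slot (the reverse permutation stored in the
 * stream-1 flag bytes), and the total becomes the vertex count sent in
 * GS_ALLOC_REQ. pack_prim_export mirrors the dword layout written by
 * build_export_prim: bits 0..8/10..18/20..28 hold the indices, bits
 * 9/19/29 the edge flags, and bit 31 marks a null primitive. */
#include <stdbool.h>
#include <stdint.h>

static uint32_t compact_vertices(const bool *vertlive, uint32_t num_threads,
                                 uint32_t *reverse_permutation)
{
        uint32_t exclusive = 0;
        for (uint32_t tid = 0; tid < num_threads; ++tid) {
                if (vertlive[tid])
                        reverse_permutation[exclusive] = tid;
                exclusive += vertlive[tid];
        }
        return exclusive; /* == vertlive_scan.result_reduce */
}

static uint32_t pack_prim_export(const uint32_t index[3],
                                 const bool edgeflag[3],
                                 unsigned num_vertices, bool isnull)
{
        uint32_t dw = (uint32_t)isnull << 31;
        for (unsigned i = 0; i < num_vertices; ++i) {
                dw |= index[i] << (10 * i);
                dw |= (uint32_t)edgeflag[i] << (10 * i + 9);
        }
        return dw;
}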