X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_nir_to_llvm.c;h=126251193b19a6e5298158e3ccca5eed8779229e;hb=e19d1ee2d1f0567512c831d02fafb625bbbddbd8;hp=9574330a4da129c923f3a38693c73703e45c65bc;hpb=a99d2d5564fbb817d8a3fc4c5f6a551c8659b453;p=mesa.git diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 9574330a4da..126251193b1 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -108,8 +108,6 @@ struct radv_shader_context { uint32_t tcs_num_inputs; uint32_t tcs_num_patches; - uint32_t max_gsvs_emit_size; - uint32_t gsvs_vertex_size; LLVMValueRef vertexptr; /* GFX10 only */ }; @@ -651,7 +649,7 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx, if (ctx->shader_info->loads_push_constants) user_sgpr_count++; - if (ctx->streamout_buffers) + if (ctx->shader_info->so.num_outputs) user_sgpr_count++; uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16; @@ -773,6 +771,9 @@ declare_streamout_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage, { int i; + if (ctx->options->use_ngg_streamout) + return; + /* Streamout SGPRs. */ if (ctx->shader_info->so.num_outputs) { assert(stage == MESA_SHADER_VERTEX || @@ -1600,6 +1601,18 @@ load_tes_input(struct ac_shader_abi *abi, return result; } +static LLVMValueRef +radv_emit_fetch_64bit(struct radv_shader_context *ctx, + LLVMTypeRef type, LLVMValueRef a, LLVMValueRef b) +{ + LLVMValueRef values[2] = { + ac_to_integer(&ctx->ac, a), + ac_to_integer(&ctx->ac, b), + }; + LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); + return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); +} + static LLVMValueRef load_gs_input(struct ac_shader_abi *abi, unsigned location, @@ -1628,6 +1641,14 @@ load_gs_input(struct ac_shader_abi *abi, dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), ""); value[i] = ac_lds_load(&ctx->ac, dw_addr); + + if (ac_get_type_size(type) == 8) { + dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, + LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index + 1, 0), ""); + LLVMValueRef tmp = ac_lds_load(&ctx->ac, dw_addr); + + value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp); + } } else { LLVMValueRef soffset = LLVMConstInt(ctx->ac.i32, @@ -1639,6 +1660,21 @@ load_gs_input(struct ac_shader_abi *abi, ctx->ac.i32_0, vtx_offset, soffset, 0, ac_glc, true, false); + + if (ac_get_type_size(type) == 8) { + soffset = LLVMConstInt(ctx->ac.i32, + (param * 4 + i + const_index + 1) * 256, + false); + + LLVMValueRef tmp = + ac_build_buffer_load(&ctx->ac, + ctx->esgs_ring, 1, + ctx->ac.i32_0, + vtx_offset, soffset, + 0, ac_glc, true, false); + + value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp); + } } if (ac_get_type_size(type) == 2) { @@ -1760,13 +1796,17 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr ""); /* If this thread has already emitted the declared maximum number of - * vertices, kill it: excessive vertex emissions are not supposed to - * have any effect, and GS threads have no externally observable - * effects other than emitting vertices. + * vertices, don't emit any more: excessive vertex emissions are not + * supposed to have any effect. 
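+	 *
+	 * Threads of a GS that writes memory have observable side effects
+	 * besides their vertices and cannot simply be killed; for those the
+	 * emit is wrapped in a divergent branch instead (see use_kill below).
+	 * Roughly, as an illustrative sketch of the generated control flow:
+	 *
+	 *    if (gs_next_vertex < gs.vertices_out) {
+	 *        ...store the vertex outputs and send GS_OP_EMIT...
+	 *    }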
*/ can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); - ac_build_kill_if_false(&ctx->ac, can_emit); + + bool use_kill = !ctx->shader_info->gs.writes_memory; + if (use_kill) + ac_build_kill_if_false(&ctx->ac, can_emit); + else + ac_build_ifcc(&ctx->ac, can_emit, 6505); for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = @@ -1813,6 +1853,9 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id); + + if (!use_kill) + ac_build_endif(&ctx->ac, 6505); } static void @@ -2788,19 +2831,8 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, sizeof(outinfo->vs_output_param_offset)); outinfo->pos_exports = 0; - if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) { - outinfo->writes_pointsize = true; - } - - if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) { - outinfo->writes_layer = true; - } - - if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) { - outinfo->writes_viewport_index = true; - } - - if (ctx->shader_info->so.num_outputs && + if (!ctx->options->use_ngg_streamout && + ctx->shader_info->so.num_outputs && !ctx->is_gs_copy_shader) { /* The GS copy shader emission already emits streamout. */ radv_emit_streamout(ctx, 0); @@ -2840,8 +2872,6 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, /* Export PrimitiveID. */ if (export_prim_id) { - outinfo->export_prim_id = true; - outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID; outputs[noutput].slot_index = 0; outputs[noutput].usage_mask = 0x1; @@ -2861,22 +2891,8 @@ handle_es_outputs_post(struct radv_shader_context *ctx, struct radv_es_output_info *outinfo) { int j; - uint64_t max_output_written = 0; LLVMValueRef lds_base = NULL; - for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - int param_index; - - if (!(ctx->output_mask & (1ull << i))) - continue; - - param_index = shader_io_get_unique_index(i); - - max_output_written = MAX2(param_index, max_output_written); - } - - outinfo->esgs_itemsize = (max_output_written + 1) * 16; - if (ctx->ac.chip_class >= GFX9) { unsigned itemsize_dw = outinfo->esgs_itemsize / 4; LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); @@ -3006,11 +3022,22 @@ static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx) false); } +static LLVMValueRef ngg_get_ordered_id(struct radv_shader_context *ctx) +{ + return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, + ctx->ac.i32_0, + LLVMConstInt(ctx->ac.i32, 11, false), + false); +} + static LLVMValueRef ngg_gs_get_vertex_storage(struct radv_shader_context *ctx) { unsigned num_outputs = util_bitcount64(ctx->output_mask); + if (ctx->options->key.has_multiview_view_index) + num_outputs++; + LLVMTypeRef elements[2] = { LLVMArrayType(ctx->ac.i32, 4 * num_outputs), LLVMArrayType(ctx->ac.i8, 4), @@ -3148,11 +3175,511 @@ static void build_export_prim(struct radv_shader_context *ctx, ac_build_export(&ctx->ac, &args); } +static struct radv_stream_output * +radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location) +{ + for (unsigned i = 0; i < so->num_outputs; ++i) { + if (so->outputs[i].location == location) + return &so->outputs[i]; + } + + return NULL; +} + +static void build_streamout_vertex(struct radv_shader_context *ctx, + LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw, + unsigned stream, LLVMValueRef offset_vtx, + LLVMValueRef vertexptr) +{ + struct radv_streamout_info *so 
= &ctx->shader_info->so; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef offset[4] = {}; + LLVMValueRef tmp; + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (!wg_offset_dw[buffer]) + continue; + + tmp = LLVMBuildMul(builder, offset_vtx, + LLVMConstInt(ctx->ac.i32, so->strides[buffer], false), ""); + tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, ""); + offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), ""); + } + + if (ctx->stage == MESA_SHADER_GEOMETRY) { + struct radv_shader_output_values outputs[AC_LLVM_MAX_OUTPUTS]; + unsigned noutput = 0; + unsigned out_idx = 0; + + for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { + unsigned output_usage_mask = + ctx->shader_info->gs.output_usage_mask[i]; + uint8_t output_stream = + output_stream = ctx->shader_info->gs.output_streams[i]; + + if (!(ctx->output_mask & (1ull << i)) || + output_stream != stream) + continue; + + outputs[noutput].slot_name = i; + outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; + outputs[noutput].usage_mask = output_usage_mask; + + int length = util_last_bit(output_usage_mask); + + for (unsigned j = 0; j < length; j++, out_idx++) { + if (!(output_usage_mask & (1 << j))) + continue; + + tmp = ac_build_gep0(&ctx->ac, vertexptr, + LLVMConstInt(ctx->ac.i32, out_idx, false)); + outputs[noutput].values[j] = LLVMBuildLoad(builder, tmp, ""); + } + + for (unsigned j = length; j < 4; j++) + outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32); + + noutput++; + } + + for (unsigned i = 0; i < noutput; i++) { + struct radv_stream_output *output = + radv_get_stream_output_by_loc(so, outputs[i].slot_name); + + if (!output || + output->stream != stream) + continue; + + struct radv_shader_output_values out = {}; + + for (unsigned j = 0; j < 4; j++) { + out.values[j] = outputs[i].values[j]; + } + + radv_emit_stream_output(ctx, so_buffer, offset, output, &out); + } + } else { + for (unsigned i = 0; i < so->num_outputs; ++i) { + struct radv_stream_output *output = + &ctx->shader_info->so.outputs[i]; + + if (stream != output->stream) + continue; + + struct radv_shader_output_values out = {}; + + for (unsigned comp = 0; comp < 4; comp++) { + if (!(output->component_mask & (1 << comp))) + continue; + + tmp = ac_build_gep0(&ctx->ac, vertexptr, + LLVMConstInt(ctx->ac.i32, 4 * i + comp, false)); + out.values[comp] = LLVMBuildLoad(builder, tmp, ""); + } + + radv_emit_stream_output(ctx, so_buffer, offset, output, &out); + } + } +} + +struct ngg_streamout { + LLVMValueRef num_vertices; + + /* per-thread data */ + LLVMValueRef prim_enable[4]; /* i1 per stream */ + LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */ + + /* Output */ + LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */ +}; + +/** + * Build streamout logic. + * + * Implies a barrier. + * + * Writes number of emitted primitives to gs_ngg_scratch[4:7]. + * + * Clobbers gs_ngg_scratch[8:]. 
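+ *
+ * In outline (matching the code below):
+ *  1. Wave 0 advances the per-buffer offsets in GDS via
+ *     llvm.amdgcn.ds.ordered.add, which serializes workgroups in launch
+ *     order; the pre-add value becomes this workgroup's start offset.
+ *  2. The emit count of each stream is clamped to the space left in the
+ *     bound buffers; on overflow the GDS offset is corrected back with a
+ *     plain atomic sub.
+ *  3. Offsets and emit counts are published to gs_ngg_scratch, and after
+ *     a barrier every wave reads them back via readlane.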
+ */ +static void build_streamout(struct radv_shader_context *ctx, + struct ngg_streamout *nggso) +{ + struct radv_streamout_info *so = &ctx->shader_info->so; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef buf_ptr = ctx->streamout_buffers; + LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMValueRef cond, tmp, tmp2; + LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false); + LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false); + LLVMValueRef so_buffer[4] = {}; + unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + + (nggso->vertices[2] ? 1 : 0); + LLVMValueRef prim_stride_dw[4] = {}; + LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32); + int stream_for_buffer[4] = { -1, -1, -1, -1 }; + unsigned bufmask_for_stream[4] = {}; + bool isgs = ctx->stage == MESA_SHADER_GEOMETRY; + unsigned scratch_emit_base = isgs ? 4 : 0; + LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0; + unsigned scratch_offset_base = isgs ? 8 : 4; + LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4; + + ac_llvm_add_target_dep_function_attr(ctx->main_function, + "amdgpu-gds-size", 256); + + /* Determine the mapping of streamout buffers to vertex streams. */ + for (unsigned i = 0; i < so->num_outputs; ++i) { + unsigned buf = so->outputs[i].buffer; + unsigned stream = so->outputs[i].stream; + assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream); + stream_for_buffer[buf] = stream; + bufmask_for_stream[stream] |= 1 << buf; + } + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == -1) + continue; + + assert(so->strides[buffer]); + + LLVMValueRef stride_for_buffer = + LLVMConstInt(ctx->ac.i32, so->strides[buffer], false); + prim_stride_dw[buffer] = + LLVMBuildMul(builder, stride_for_buffer, + nggso->num_vertices, ""); + prim_stride_dw_vgpr = ac_build_writelane( + &ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer], + LLVMConstInt(ctx->ac.i32, buffer, false)); + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, buffer, false); + so_buffer[buffer] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, + offset); + } + + cond = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, cond, 5200); + { + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); + + /* Advance the streamout offsets in GDS. */ + LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); + ac_build_ifcc(&ctx->ac, cond, 5210); + { + /* Fetch the number of generated primitives and store + * it in GDS for later use. 
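+ * For a GS the per-stream counts were accumulated into
+ * gs_ngg_scratch[0:3] by gfx10_ngg_gs_emit_epilogue_1(); for VS/TES
+ * every primitive belongs to stream 0, so lane 0 simply carries
+ * ngg_get_prim_cnt().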
+ */ + if (isgs) { + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid); + tmp = LLVMBuildLoad(builder, tmp, ""); + } else { + tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, + ngg_get_prim_cnt(ctx), ctx->ac.i32_0); + } + LLVMBuildStore(builder, tmp, generated_by_stream_vgpr); + + unsigned swizzle[4]; + int unused_stream = -1; + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->shader_info->gs.num_stream_output_components[stream]) { + unused_stream = stream; + break; + } + } + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] >= 0) { + swizzle[buffer] = stream_for_buffer[buffer]; + } else { + assert(unused_stream >= 0); + swizzle[buffer] = unused_stream; + } + } + + tmp = ac_build_quad_swizzle(&ctx->ac, tmp, + swizzle[0], swizzle[1], swizzle[2], swizzle[3]); + tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); + + LLVMValueRef args[] = { + LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""), + tmp, + ctx->ac.i32_0, // ordering + ctx->ac.i32_0, // scope + ctx->ac.i1false, // isVolatile + LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index + ctx->ac.i1true, // wave release + ctx->ac.i1true, // wave done + }; + + tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", + ctx->ac.i32, args, ARRAY_SIZE(args), 0); + + /* Keep offsets in a VGPR for quick retrieval via readlane by + * the first wave for bounds checking, and also store in LDS + * for retrieval by all waves later. */ + LLVMBuildStore(builder, tmp, offsets_vgpr); + + tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), + scratch_offset_basev, ""); + tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2); + LLVMBuildStore(builder, tmp, tmp2); + } + ac_build_endif(&ctx->ac, 5210); + + /* Determine the max emit per buffer. This is done via the SALU, in part + * because LLVM can't generate divide-by-multiply if we try to do this + * via VALU with one lane per buffer. + */ + LLVMValueRef max_emit[4] = {}; + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == -1) + continue; + + /* Compute the streamout buffer size in DWORD. */ + LLVMValueRef bufsize_dw = + LLVMBuildLShr(builder, + LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), + i32_2, ""); + + /* Load the streamout buffer offset from GDS. */ + tmp = LLVMBuildLoad(builder, offsets_vgpr, ""); + LLVMValueRef offset_dw = + ac_build_readlane(&ctx->ac, tmp, + LLVMConstInt(ctx->ac.i32, buffer, false)); + + /* Compute the remaining size to emit. */ + LLVMValueRef remaining_dw = + LLVMBuildSub(builder, bufsize_dw, offset_dw, ""); + tmp = LLVMBuildUDiv(builder, remaining_dw, + prim_stride_dw[buffer], ""); + + cond = LLVMBuildICmp(builder, LLVMIntULT, + bufsize_dw, offset_dw, ""); + max_emit[buffer] = LLVMBuildSelect(builder, cond, + ctx->ac.i32_0, tmp, ""); + } + + /* Determine the number of emitted primitives per stream and fixup the + * GDS counter if necessary. + * + * This is complicated by the fact that a single stream can emit to + * multiple buffers (but luckily not vice versa). + */ + LLVMValueRef emit_vgpr = ctx->ac.i32_0; + + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->shader_info->gs.num_stream_output_components[stream]) + continue; + + /* Load the number of generated primitives from GDS and + * determine that number for the given stream. 
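+ * Each of lanes 0-3 was given the generated count for streams 0-3 above
+ * (before the quad swizzle), so lane N of generated_by_stream_vgpr holds
+ * the count for stream N and a readlane extracts it as a scalar.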
+ */ + tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, ""); + LLVMValueRef generated = + ac_build_readlane(&ctx->ac, tmp, + LLVMConstInt(ctx->ac.i32, stream, false)); + + + /* Compute the number of emitted primitives. */ + LLVMValueRef emit = generated; + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == stream) + emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]); + } + + /* Store the number of emitted primitives for that + * stream. + */ + emit_vgpr = ac_build_writelane(&ctx->ac, emit_vgpr, emit, + LLVMConstInt(ctx->ac.i32, stream, false)); + + /* Fixup the offset using a plain GDS atomic if we overflowed. */ + cond = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, ""); + ac_build_ifcc(&ctx->ac, cond, 5221); /* scalar branch */ + tmp = LLVMBuildLShr(builder, + LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false), + ac_get_thread_id(&ctx->ac), ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5222); + { + tmp = LLVMBuildSub(builder, generated, emit, ""); + tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); + tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, ""); + LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp, + LLVMAtomicOrderingMonotonic, false); + } + ac_build_endif(&ctx->ac, 5222); + ac_build_endif(&ctx->ac, 5221); + } + + /* Store the number of emitted primitives to LDS for later use. */ + cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); + ac_build_ifcc(&ctx->ac, cond, 5225); + { + tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), + scratch_emit_basev, ""); + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp); + LLVMBuildStore(builder, emit_vgpr, tmp); + } + ac_build_endif(&ctx->ac, 5225); + } + ac_build_endif(&ctx->ac, 5200); + + /* Determine the workgroup-relative per-thread / primitive offset into + * the streamout buffers */ + struct ac_wg_scan primemit_scan[4] = {}; + + if (isgs) { + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->shader_info->gs.num_stream_output_components[stream]) + continue; + + primemit_scan[stream].enable_exclusive = true; + primemit_scan[stream].op = nir_op_iadd; + primemit_scan[stream].src = nggso->prim_enable[stream]; + primemit_scan[stream].scratch = + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, + LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false)); + primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx); + primemit_scan[stream].numwaves = get_tgsize(ctx); + primemit_scan[stream].maxwaves = 8; + ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]); + } + } + + ac_build_s_barrier(&ctx->ac); + + /* Fetch the per-buffer offsets and per-stream emit counts in all waves. 
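+ * Scratch dword layout (see scratch_emit_base / scratch_offset_base
+ * above): for a GS, [0:3] holds the generated-primitive counts per
+ * stream, [4:7] the emitted-primitive counts and [8:11] the buffer
+ * offsets; for VS/TES the emit counts sit at [0:3] and the offsets at
+ * [4:7].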
*/ + LLVMValueRef wgoffset_dw[4] = {}; + + { + LLVMValueRef scratch_vgpr; + + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac)); + scratch_vgpr = LLVMBuildLoad(builder, tmp, ""); + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] >= 0) { + wgoffset_dw[buffer] = ac_build_readlane( + &ctx->ac, scratch_vgpr, + LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false)); + } + } + + for (unsigned stream = 0; stream < 4; ++stream) { + if (ctx->shader_info->gs.num_stream_output_components[stream]) { + nggso->emit[stream] = ac_build_readlane( + &ctx->ac, scratch_vgpr, + LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false)); + } + } + } + + /* Write out primitive data */ + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->shader_info->gs.num_stream_output_components[stream]) + continue; + + if (isgs) { + ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]); + } else { + primemit_scan[stream].result_exclusive = tid; + } + + cond = LLVMBuildICmp(builder, LLVMIntULT, + primemit_scan[stream].result_exclusive, + nggso->emit[stream], ""); + cond = LLVMBuildAnd(builder, cond, nggso->prim_enable[stream], ""); + ac_build_ifcc(&ctx->ac, cond, 5240); + { + LLVMValueRef offset_vtx = + LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, + nggso->num_vertices, ""); + + for (unsigned i = 0; i < max_num_vertices; ++i) { + cond = LLVMBuildICmp(builder, LLVMIntULT, + LLVMConstInt(ctx->ac.i32, i, false), + nggso->num_vertices, ""); + ac_build_ifcc(&ctx->ac, cond, 5241); + build_streamout_vertex(ctx, so_buffer, wgoffset_dw, + stream, offset_vtx, nggso->vertices[i]); + ac_build_endif(&ctx->ac, 5241); + offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, ""); + } + } + ac_build_endif(&ctx->ac, 5240); + } +} + +static unsigned ngg_nogs_vertex_size(struct radv_shader_context *ctx) +{ + unsigned lds_vertex_size = 0; + + if (ctx->shader_info->so.num_outputs) + lds_vertex_size = 4 * ctx->shader_info->so.num_outputs + 1; + + return lds_vertex_size; +} + +/** + * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage + * for the vertex outputs. + */ +static LLVMValueRef ngg_nogs_vertex_ptr(struct radv_shader_context *ctx, + LLVMValueRef vtxid) +{ + /* The extra dword is used to avoid LDS bank conflicts. 
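+ * ngg_nogs_vertex_size() returns 4 * so.num_outputs + 1 dwords, so the
+ * stride is always odd and consecutive vertices start in different
+ * banks (the same trick as the esgs ring declaration below, which adds
+ * an extra dword per vertex to avoid bank conflicts for SoA accesses).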
*/ + unsigned vertex_size = ngg_nogs_vertex_size(ctx); + LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size); + LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS); + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, ""); + return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, ""); +} + +static void +handle_ngg_outputs_post_1(struct radv_shader_context *ctx) +{ + struct radv_streamout_info *so = &ctx->shader_info->so; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef vertex_ptr = NULL; + LLVMValueRef tmp, tmp2; + + assert((ctx->stage == MESA_SHADER_VERTEX || + ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->is_gs_copy_shader); + + if (!ctx->shader_info->so.num_outputs) + return; + + vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + + for (unsigned i = 0; i < so->num_outputs; ++i) { + struct radv_stream_output *output = + &ctx->shader_info->so.outputs[i]; + + unsigned loc = output->location; + + for (unsigned comp = 0; comp < 4; comp++) { + if (!(output->component_mask & (1 << comp))) + continue; + + tmp = ac_build_gep0(&ctx->ac, vertex_ptr, + LLVMConstInt(ctx->ac.i32, 4 * i + comp, false)); + tmp2 = LLVMBuildLoad(builder, + ctx->abi.outputs[4 * loc + comp], ""); + tmp2 = ac_to_integer(&ctx->ac, tmp2); + LLVMBuildStore(builder, tmp2, tmp); + } + } +} + static void -handle_ngg_outputs_post(struct radv_shader_context *ctx) +handle_ngg_outputs_post_2(struct radv_shader_context *ctx) { LLVMBuilderRef builder = ctx->ac.builder; - unsigned num_vertices = 3; LLVMValueRef tmp; assert((ctx->stage == MESA_SHADER_VERTEX || @@ -3170,14 +3697,50 @@ handle_ngg_outputs_post(struct radv_shader_context *ctx) ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16), }; - /* TODO: streamout */ + /* Determine the number of vertices per primitive. */ + unsigned num_vertices; + LLVMValueRef num_vertices_val; + + if (ctx->stage == MESA_SHADER_VERTEX) { + LLVMValueRef outprim_val = + LLVMConstInt(ctx->ac.i32, + ctx->options->key.vs.outprim, false); + num_vertices_val = LLVMBuildAdd(builder, outprim_val, + ctx->ac.i32_1, ""); + num_vertices = 3; /* TODO: optimize for points & lines */ + } else { + assert(ctx->stage == MESA_SHADER_TESS_EVAL); + + if (ctx->shader->info.tess.point_mode) + num_vertices = 1; + else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES) + num_vertices = 2; + else + num_vertices = 3; + + num_vertices_val = LLVMConstInt(ctx->ac.i32, num_vertices, false); + } + + /* Streamout */ + if (ctx->shader_info->so.num_outputs) { + struct ngg_streamout nggso = {}; + + nggso.num_vertices = num_vertices_val; + nggso.prim_enable[0] = is_gs_thread; + + for (unsigned i = 0; i < num_vertices; ++i) + nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); + + build_streamout(ctx, &nggso); + } /* Copy Primitive IDs from GS threads to the LDS address corresponding * to the ES thread of the provoking vertex. */ if (ctx->stage == MESA_SHADER_VERTEX && ctx->options->key.vs_common_out.export_prim_id) { - /* TODO: streamout */ + if (ctx->shader_info->so.num_outputs) + ac_build_s_barrier(&ctx->ac); ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); /* Extract the PROVOKING_VTX_INDEX field. 
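 	 * The provoking vertex is the one whose flat-shaded attributes apply
 	 * to the whole primitive; its index within the primitive is either 0
 	 * (first vertex) or num_vertices - 1 (last vertex), and the LDS slot
 	 * of that vertex's ES thread is where the primitive ID is written.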
*/ @@ -3268,7 +3831,6 @@ handle_ngg_outputs_post(struct radv_shader_context *ctx) radv_export_param(ctx, param_count, values, 0x1); outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count++; - outinfo->export_prim_id = true; outinfo->param_exports = param_count; } } @@ -3350,6 +3912,30 @@ static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx) ac_build_endloop(&ctx->ac, 5100); } + + /* Accumulate generated primitives counts across the entire threadgroup. */ + for (unsigned stream = 0; stream < 4; ++stream) { + unsigned num_components; + + num_components = + ctx->shader_info->gs.num_stream_output_components[stream]; + if (!num_components) + continue; + + LLVMValueRef numprims = + LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); + numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size); + + tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5105); + { + LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, + LLVMConstInt(ctx->ac.i32, stream, false)), + numprims, LLVMAtomicOrderingMonotonic, false); + } + ac_build_endif(&ctx->ac, 5105); + } } static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) @@ -3363,7 +3949,38 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) const LLVMValueRef tid = get_thread_id_in_tg(ctx); LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); - /* TODO: streamout */ + /* Streamout */ + if (ctx->shader_info->so.num_outputs) { + struct ngg_streamout nggso = {}; + + nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false); + + LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid); + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->shader_info->gs.num_stream_output_components[stream]) + continue; + + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implicit C-style array */ + ctx->ac.i32_1, /* second value of struct */ + LLVMConstInt(ctx->ac.i32, stream, false), + }; + tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); + tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, ""); + } + + for (unsigned i = 0; i < verts_per_prim; ++i) { + tmp = LLVMBuildSub(builder, tid, + LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); + tmp = ngg_gs_vertex_ptr(ctx, tmp); + nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); + } + + build_streamout(ctx, &nggso); + } /* TODO: culling */ @@ -3514,31 +4131,24 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); - if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) { - outinfo->writes_pointsize = true; - } - - if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) { - outinfo->writes_layer = true; - } - - if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) { - outinfo->writes_viewport_index = true; - } - unsigned out_idx = 0; gep_idx[1] = ctx->ac.i32_0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { + unsigned output_usage_mask = + ctx->shader_info->gs.output_usage_mask[i]; + int length = util_last_bit(output_usage_mask); + if (!(ctx->output_mask & (1ull << i))) continue; outputs[noutput].slot_name = i; outputs[noutput].slot_index = i == 
VARYING_SLOT_CLIP_DIST1; - - outputs[noutput].usage_mask = ctx->shader_info->gs.output_usage_mask[i]; - int length = util_last_bit(outputs[noutput].usage_mask); + outputs[noutput].usage_mask = output_usage_mask; for (unsigned j = 0; j < length; j++, out_idx++) { + if (!(output_usage_mask & (1 << j))) + continue; + gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false); tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); tmp = LLVMBuildLoad(builder, tmp, ""); @@ -3560,8 +4170,6 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) /* Export ViewIndex. */ if (export_view_index) { - outinfo->writes_layer = true; - outputs[noutput].slot_name = VARYING_SLOT_LAYER; outputs[noutput].slot_index = 0; outputs[noutput].usage_mask = 0x1; @@ -3594,7 +4202,7 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, const LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, vertexidx, LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); - ac_build_kill_if_false(&ctx->ac, can_emit); + ac_build_ifcc(&ctx->ac, can_emit, 9001); tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); @@ -3634,7 +4242,7 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, LLVMBuildStore(builder, out_val, ptr); } } - assert(out_idx * 4 <= ctx->gsvs_vertex_size); + assert(out_idx * 4 <= ctx->shader_info->gs.gsvs_vertex_size); /* Determine and store whether this vertex completed a primitive. */ const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); @@ -3660,6 +4268,8 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]); + + ac_build_endif(&ctx->ac, 9001); } static void @@ -3929,7 +4539,7 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, else if (ctx->options->key.vs_common_out.as_es) handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info); else if (ctx->options->key.vs_common_out.as_ngg) - break; /* handled outside of the shader body */ + handle_ngg_outputs_post_1(ctx); else handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id, ctx->options->key.vs_common_out.export_clip_dists, @@ -3948,7 +4558,7 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, if (ctx->options->key.vs_common_out.as_es) handle_es_outputs_post(ctx, &ctx->shader_info->tes.es_info); else if (ctx->options->key.vs_common_out.as_ngg) - break; /* handled outside of the shader body */ + handle_ngg_outputs_post_1(ctx); else handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id, ctx->options->key.vs_common_out.export_clip_dists, @@ -4167,11 +4777,6 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, options->family, float_mode, options->wave_size, 64); ctx.context = ctx.ac.context; - radv_nir_shader_info_init(shader_info); - - for(int i = 0; i < shader_count; ++i) - radv_nir_shader_info_pass(shaders[i], options, shader_info); - for (i = 0; i < MAX_SETS; i++) shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1; for (i = 0; i < AC_UD_MAX_UD; i++) @@ -4213,6 +4818,28 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL) 
ac_nir_fixup_ls_hs_input_vgprs(&ctx); + if (shaders[shader_count - 1]->info.stage != MESA_SHADER_GEOMETRY && + (ctx.options->key.vs_common_out.as_ngg && + !ctx.options->key.vs_common_out.as_es)) { + /* Unconditionally declare scratch space base for streamout and + * vertex compaction. Whether space is actually allocated is + * determined during linking / PM4 creation. + * + * Add an extra dword per vertex to ensure an odd stride, which + * avoids bank conflicts for SoA accesses. + */ + declare_esgs_ring(&ctx); + + /* This is really only needed when streamout and / or vertex + * compaction is enabled. + */ + LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8); + ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module, + asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32)); + LLVMSetAlignment(ctx.gs_ngg_scratch, 4); + } + for(int i = 0; i < shader_count; ++i) { ctx.stage = shaders[i]->info.stage; ctx.shader = shaders[i]; @@ -4231,18 +4858,21 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); } - /* TODO: streamout */ + unsigned scratch_size = 8; + if (ctx.shader_info->so.num_outputs) + scratch_size = 44; - LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8); + LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, scratch_size); ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32)); LLVMSetAlignment(ctx.gs_ngg_scratch, 4); - ctx.gs_ngg_emit = LLVMBuildIntToPtr(ctx.ac.builder, ctx.ac.i32_0, - LLVMPointerType(LLVMArrayType(ctx.ac.i32, 0), AC_ADDR_SPACE_LDS), - "ngg_emit"); + ctx.gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx.ac.module, + LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx.gs_ngg_emit, LLVMExternalLinkage); + LLVMSetAlignment(ctx.gs_ngg_emit, 4); } ctx.abi.load_inputs = load_gs_input; @@ -4311,14 +4941,6 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, nir_foreach_variable(variable, &shaders[i]->outputs) scan_shader_output_decl(&ctx, variable, shaders[i], shaders[i]->info.stage); - if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) { - unsigned addclip = shaders[i]->info.clip_distance_array_size + - shaders[i]->info.cull_distance_array_size > 4; - ctx.gsvs_vertex_size = (util_bitcount64(ctx.output_mask) + addclip) * 16; - ctx.max_gsvs_emit_size = ctx.gsvs_vertex_size * - shaders[i]->info.gs.vertices_out; - } - ac_setup_rings(&ctx); LLVMBasicBlockRef merge_block; @@ -4355,16 +4977,13 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, if (is_pre_gs_stage(shaders[i]->info.stage) && ctx.options->key.vs_common_out.as_ngg && i == shader_count - 1) { - handle_ngg_outputs_post(&ctx); + handle_ngg_outputs_post_2(&ctx); } else if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY && ctx.options->key.vs_common_out.as_ngg) { gfx10_ngg_gs_emit_epilogue_2(&ctx); } - if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) { - shader_info->gs.gsvs_vertex_size = ctx.gsvs_vertex_size; - shader_info->gs.max_gsvs_emit_size = ctx.max_gsvs_emit_size; - } else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { + if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { shader_info->tcs.num_patches = ctx.tcs_num_patches; shader_info->tcs.lds_size = calculate_tess_lds_size(&ctx); } @@ -4444,7 +5063,7 @@ static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, fprintf(stderr, "\n"); } - if 
(options->record_llvm_ir) { + if (options->record_ir) { char *llvm_ir = LLVMPrintModuleToString(llvm_module); llvm_ir_string = strdup(llvm_ir); LLVMDisposeMessage(llvm_ir); @@ -4516,7 +5135,8 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) LLVMValueRef stream_id; /* Fetch the vertex stream ID. */ - if (ctx->shader_info->so.num_outputs) { + if (!ctx->options->use_ngg_streamout && + ctx->shader_info->so.num_outputs) { stream_id = ac_unpack_param(&ctx->ac, ctx->streamout_config, 24, 2); } else { @@ -4536,7 +5156,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) LLVMBasicBlockRef bb; unsigned offset; - if (!num_components) + if (stream > 0 && !num_components) continue; if (stream > 0 && !ctx->shader_info->so.num_outputs) @@ -4587,7 +5207,8 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) } } - if (ctx->shader_info->so.num_outputs) + if (!ctx->options->use_ngg_streamout && + ctx->shader_info->so.num_outputs) radv_emit_streamout(ctx, stream); if (stream == 0) { @@ -4624,8 +5245,6 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, ctx.stage = MESA_SHADER_VERTEX; ctx.shader = geom_shader; - radv_nir_shader_info_pass(geom_shader, options, shader_info); - create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX); ac_setup_rings(&ctx);