/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "util/u_memory.h"

LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute an ES thread. */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
}

LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute a GS thread. */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
}
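
/* Layout of merged_wave_info, as consumed throughout this file:
 *   bits [0:8)   - ES thread count in the wave (see si_is_es_thread)
 *   bits [8:16)  - GS thread count in the wave (see si_is_gs_thread)
 *   bits [16:24) - GS wave ID (see si_get_gs_wave_id)
 *   bits [24:28) - wave index within the threadgroup (see the ES epilogue)
 */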

static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index,
                                          unsigned vtx_offset_param, LLVMTypeRef type,
                                          unsigned swizzle)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *shader = ctx->shader;
   LLVMValueRef vtx_offset, soffset;
   struct si_shader_info *info = &shader->selector->info;
   unsigned semantic_name = info->input_semantic_name[input_index];
   unsigned semantic_index = info->input_semantic_index[input_index];
   unsigned param;
   LLVMValueRef value;

   param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);

   /* GFX9 has the ESGS ring in LDS. */
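   /* The six vertex dword offsets arrive as 16-bit values packed two per
    * SGPR in gs_vtx01/23/45_offset; the component's LDS address is that
    * offset plus (param * 4 + swizzle) dwords.
    */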
   if (ctx->screen->info.chip_class >= GFX9) {
      unsigned index = vtx_offset_param;

      switch (index / 2) {
      case 0:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, index % 2 ? 16 : 0, 16);
         break;
      case 1:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, index % 2 ? 16 : 0, 16);
         break;
      case 2:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, index % 2 ? 16 : 0, 16);
         break;
      default:
         assert(0);
         return NULL;
      }

      unsigned offset = param * 4 + swizzle;
      vtx_offset =
         LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), "");

      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
      value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
      if (ac_get_type_size(type) == 8) {
         ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &ctx->ac.i32_1, 1, "");
         LLVMValueRef values[2] = {value, LLVMBuildLoad(ctx->ac.builder, ptr, "")};
         value = ac_build_gather_values(&ctx->ac, values, 2);
      }
      return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
   }

   /* GFX6: input load from the ESGS ring in memory. */
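   /* Each (param * 4 + swizzle) dword slot is 256 bytes apart in the ring;
    * this presumably matches the swizzled layout of 64 lanes * 4 bytes per
    * slot, with voffset (vertex offset * 4) selecting the lane's vertex.
    */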
   if (swizzle == ~0) {
      LLVMValueRef values[4];
      unsigned chan;
      for (chan = 0; chan < 4; chan++) {
         values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, type, chan);
      }
      return ac_build_gather_values(&ctx->ac, values, 4);
   }

   /* Get the vertex offset parameter on GFX6. */
   LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->gs_vtx_offset[vtx_offset_param]);

   vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

   soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);

   value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0,
                                ac_glc, true, false);
   if (ac_get_type_size(type) == 8) {
      LLVMValueRef value2;
      soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0);

      value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset,
                                    0, ac_glc, true, false);
      return si_build_gather_64bit(ctx, type, value, value2);
   }
   return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
}

static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, unsigned location,
                                         unsigned driver_location, unsigned component,
                                         unsigned num_components, unsigned vertex_index,
                                         unsigned const_index, LLVMTypeRef type)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   LLVMValueRef value[4];
   for (unsigned i = 0; i < num_components; i++) {
      unsigned offset = i;
      if (ac_get_type_size(type) == 8)
         offset *= 2;

      offset += component;
      value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index,
                                                   vertex_index, type, offset);
   }

   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}

/* Pass GS inputs from ES to GS on GFX9. */
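/* On GFX9 the ES and GS stages run as one merged shader, so the ES part
 * hands its inputs to the GS part through the return value; each slot here
 * must line up with the register layout the GS part expects (the "8 +"
 * offsets below skip what appear to be 8 reserved system SGPRs).
 */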
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
   LLVMValueRef ret = ctx->return_value;

   ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
   ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
   if (ctx->shader->key.as_ngg)
      ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
   else
      ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
   ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
   ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);

   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
   if (ctx->screen->use_ngg) {
      ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
   }

   unsigned vgpr;
   if (ctx->type == PIPE_SHADER_VERTEX)
      vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
   else
      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;

   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
   ctx->return_value = ret;
}

void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *es = ctx->shader;
   struct si_shader_info *info = &es->selector->info;
   LLVMValueRef lds_base = NULL;
   unsigned chan;
   int i;

   if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
      unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
      LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
      LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
      vertex_idx =
         LLVMBuildOr(ctx->ac.builder, vertex_idx,
                     LLVMBuildMul(ctx->ac.builder, wave_idx,
                                  LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""),
                     "");
      lds_base =
         LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
   }

   for (i = 0; i < info->num_outputs; i++) {
      int param;

      if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
          info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
         continue;

      param = si_shader_io_get_unique_index(info->output_semantic_name[i],
                                            info->output_semantic_index[i], false);

      for (chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         out_val = ac_to_integer(&ctx->ac, out_val);

         /* GFX9 has the ESGS ring in LDS. */
         if (ctx->screen->info.chip_class >= GFX9) {
            LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
            idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
            ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
            continue;
         }

         ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL,
                                     ac_get_arg(&ctx->ac, ctx->es2gs_offset),
                                     (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled);
      }
   }

   if (ctx->screen->info.chip_class >= GFX9)
      si_set_es_return_value_for_gs(ctx);
}

static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class >= GFX9)
      return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
   else
      return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
}

static void emit_gs_epilogue(struct si_shader_context *ctx)
{
   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_epilogue(ctx);
      return;
   }

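   /* On GFX10, a release fence is presumably required here so that the ring
    * stores performed by this shader are visible before GS_DONE is signaled.
    */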
   if (ctx->screen->info.chip_class >= GFX10)
      LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");

   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));

   if (ctx->screen->info.chip_class >= GFX9)
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
}

static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                                     LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info UNUSED *info = &ctx->shader->selector->info;

   assert(info->num_outputs <= max_outputs);

   emit_gs_epilogue(ctx);
}

/* Emit one vertex from the geometry shader */
static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
      return;
   }

   struct si_shader_info *info = &ctx->shader->selector->info;
   struct si_shader *shader = ctx->shader;
   LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
   LLVMValueRef gs_next_vertex;
   LLVMValueRef can_emit;
   unsigned chan, offset;
   int i;

   /* Write vertex attribute values to GSVS ring */
   gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], "");

   /* If this thread has already emitted the declared maximum number of
    * vertices, skip the write: excessive vertex emissions are not
    * supposed to have any effect.
    *
    * If the shader has no writes to memory, kill it instead. This skips
    * further memory loads and may allow LLVM to skip to the end
    * altogether.
    */
   can_emit =
      LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
                    LLVMConstInt(ctx->ac.i32, shader->selector->gs_max_out_vertices, 0), "");

   bool use_kill = !info->writes_memory;
   if (use_kill) {
      ac_build_kill_if_false(&ctx->ac, can_emit);
   } else {
      ac_build_ifcc(&ctx->ac, can_emit, 6505);
   }

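   /* GSVS ring addressing: each (output, component) slot occupies
    * gs_max_out_vertices dwords, so the dword index of a store is
    * slot * gs_max_out_vertices + gs_next_vertex, scaled by 4 into bytes.
    */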
   offset = 0;
   for (i = 0; i < info->num_outputs; i++) {
      for (chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)) ||
             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         LLVMValueRef voffset =
            LLVMConstInt(ctx->ac.i32, offset * shader->selector->gs_max_out_vertices, 0);
         offset++;

         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

         out_val = ac_to_integer(&ctx->ac, out_val);

         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset,
                                     0, ac_glc | ac_slc | ac_swizzled);
      }
   }

   gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
   LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

   /* Signal vertex emission if vertex data was written. */
   if (offset) {
      ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                       si_get_gs_wave_id(ctx));
   }

   if (!use_kill)
      ac_build_endif(&ctx->ac, 6505);
}

/* Cut one primitive from the geometry shader */
static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   if (ctx->shader->key.as_ngg) {
      LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
      return;
   }

   /* Signal primitive cut */
   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
                    si_get_gs_wave_id(ctx));
}

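/* On GFX6-8 the ESGS ring is a buffer in memory whose descriptor is loaded
 * from rw_buffers; on GFX9+ the ring lives in LDS instead.
 */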
void si_preload_esgs_ring(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class <= GFX8) {
      unsigned ring = ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS;
      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);

      ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
   } else {
      if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
         /* Declare the ESGS ring as an explicit LDS symbol. */
         si_llvm_declare_esgs_ring(ctx);
      } else {
         ac_declare_lds_as_pointer(&ctx->ac);
         ctx->esgs_ring = ctx->ac.lds;
      }
   }
}

void si_preload_gs_rings(struct si_shader_context *ctx)
{
   const struct si_shader_selector *sel = ctx->shader->selector;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
   LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

   /* The conceptual layout of the GSVS ring is
    *   v0c0 .. vLc0 v0c1 .. vLc1 ..
    * but the real memory layout is swizzled across
    * threads:
    *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
    *   t16v0c0 ..
    * Override the buffer descriptor accordingly.
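    *
    * Each stream's slice is stride = 4 * num_components * gs_max_out_vertices
    * bytes per thread, and consecutive streams are stride * wave_size bytes
    * apart in the ring (see stream_offset below).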
    */
   LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
   uint64_t stream_offset = 0;

   for (unsigned stream = 0; stream < 4; ++stream) {
      unsigned num_components;
      unsigned stride;
      unsigned num_records;
      LLVMValueRef ring, tmp;

      num_components = sel->info.num_stream_output_components[stream];
      if (!num_components)
         continue;

      stride = 4 * num_components * sel->gs_max_out_vertices;

      /* Limit on the stride field for <= GFX7. */
      assert(stride < (1 << 14));

      num_records = ctx->ac.wave_size;

      ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
      tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
      stream_offset += stride * ctx->ac.wave_size;

      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
      ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
      tmp = LLVMBuildOr(
         builder, tmp,
         LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0),
                                    LLVMConstInt(ctx->ac.i32, 2, 0), "");

      uint32_t rsrc3 =
         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
         S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
         S_008F0C_ADD_TID_ENABLE(1);

      if (ctx->ac.chip_class >= GFX10) {
         rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
      } else {
         rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
                  S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
      }

      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false),
                                    LLVMConstInt(ctx->ac.i32, 3, 0), "");

      ctx->gsvs_ring[stream] = ring;
   }
}

/* Generate code for the hardware VS shader stage to go with a geometry shader */
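/* With the legacy (non-NGG) pipeline, the GS writes its outputs to the GSVS
 * ring in memory; this copy shader then runs as the hardware VS stage, reads
 * the ring back, performs streamout, and exports the stream-0 outputs.
 */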
struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
                                             struct ac_llvm_compiler *compiler,
                                             struct si_shader_selector *gs_selector,
                                             struct pipe_debug_callback *debug)
{
   struct si_shader_context ctx;
   struct si_shader *shader;
   LLVMBuilderRef builder;
   struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
   struct si_shader_info *gsinfo = &gs_selector->info;
   int i;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;

   si_llvm_context_init(&ctx, sscreen, compiler,
                        si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false, false));
   ctx.shader = shader;
   ctx.type = PIPE_SHADER_VERTEX;

   builder = ctx.ac.builder;

   si_create_function(&ctx, false);

   LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
   ctx.gsvs_ring[0] =
      ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));

   LLVMValueRef voffset =
      LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), "");

   /* Fetch the vertex stream ID. */
   LLVMValueRef stream_id;

   if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
      stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
   else
      stream_id = ctx.ac.i32_0;

   /* Fill in output information. */
   for (i = 0; i < gsinfo->num_outputs; ++i) {
      outputs[i].semantic_name = gsinfo->output_semantic_name[i];
      outputs[i].semantic_index = gsinfo->output_semantic_index[i];

      for (int chan = 0; chan < 4; chan++) {
         outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3;
      }
   }

   LLVMBasicBlockRef end_bb;
   LLVMValueRef switch_inst;

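   /* Emit one switch case per vertex stream. Only stream 0 is exported to
    * the next stage; the other streams are fetched solely for streamout.
    */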
   end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
   switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

   for (int stream = 0; stream < 4; stream++) {
      LLVMBasicBlockRef bb;
      unsigned offset;

      if (!gsinfo->num_stream_output_components[stream])
         continue;

      if (stream > 0 && !gs_selector->so.num_outputs)
         continue;

      bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
      LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
      LLVMPositionBuilderAtEnd(builder, bb);

      /* Fetch vertex data from GSVS ring */
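      /* Slots are gs_max_out_vertices * 16 * 4 bytes apart, which appears to
       * match the descriptor built in si_preload_gs_rings (16-element index
       * stride, 4-byte elements); voffset = vertex_id * 4 picks this
       * thread's vertex within the slot.
       */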
      offset = 0;
      for (i = 0; i < gsinfo->num_outputs; ++i) {
         for (unsigned chan = 0; chan < 4; chan++) {
            if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
                outputs[i].vertex_stream[chan] != stream) {
               outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
               continue;
            }

            LLVMValueRef soffset =
               LLVMConstInt(ctx.ac.i32, offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
            offset++;

            outputs[i].values[chan] =
               ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0,
                                    ac_glc | ac_slc, true, false);
         }
      }

      /* Streamout and exports. */
      if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
         si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream);
      }

      if (stream == 0)
         si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);

      LLVMBuildBr(builder, end_bb);
   }

   LLVMPositionBuilderAtEnd(builder, end_bb);

   LLVMBuildRetVoid(ctx.ac.builder);

   ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
   si_llvm_optimize_module(&ctx);

   bool ok = false;
   if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac,
                       debug, PIPE_SHADER_GEOMETRY, "GS Copy Shader", false)) {
      if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
         fprintf(stderr, "GS Copy Shader:\n");
      si_shader_dump(sscreen, ctx.shader, debug, stderr, true);

      if (!ctx.shader->config.scratch_bytes_per_wave)
         ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
      else
         ok = true;
   }

   si_llvm_dispose(&ctx);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}

/**
 * Build the GS prolog function. Rotate the input vertices for triangle strips
 * with adjacency.
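 *
 * When tri_strip_adj_fix is set, every other primitive (selected by the low
 * bit of the primitive ID) has its six input vertices remapped as
 * out[i] = in[(i + 4) % 6], i.e. (v0..v5) becomes (v4, v5, v0, v1, v2, v3).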
 */
void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   unsigned num_sgprs, num_vgprs;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMTypeRef returns[AC_MAX_ARGS];
   LLVMValueRef func, ret;

   memset(&ctx->args, 0, sizeof(ctx->args));

   if (ctx->screen->info.chip_class >= GFX9) {
      if (key->gs_prolog.states.gfx9_prev_is_vs)
         num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
      else
         num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
      num_vgprs = 5; /* ES inputs are not needed by GS */
   } else {
      num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
      num_vgprs = 8;
   }

   for (unsigned i = 0; i < num_sgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      returns[i] = ctx->ac.i32;
   }

   for (unsigned i = 0; i < num_vgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
      returns[num_sgprs + i] = ctx->ac.f32;
   }

   /* Create the function. */
   si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
   func = ctx->main_fn;

   /* Set the full EXEC mask for the prolog, because we are only fiddling
    * with registers here. The main shader part will set the correct EXEC
    * mask.
    */
   if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
      ac_init_exec_full_mask(&ctx->ac);

   /* Copy inputs to outputs. This should be a no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (unsigned i = 0; i < num_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(builder, ret, p, i, "");
   }
   for (unsigned i = 0; i < num_vgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
   }

   if (key->gs_prolog.states.tri_strip_adj_fix) {
      /* Remap the input vertices for every other primitive. */
      const struct ac_arg gfx6_vtx_params[6] = {
         {.used = true, .arg_index = num_sgprs}, {.used = true, .arg_index = num_sgprs + 1},
         {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4},
         {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6},
      };
      const struct ac_arg gfx9_vtx_params[3] = {
         {.used = true, .arg_index = num_sgprs},
         {.used = true, .arg_index = num_sgprs + 1},
         {.used = true, .arg_index = num_sgprs + 4},
      };
      LLVMValueRef vtx_in[6], vtx_out[6];
      LLVMValueRef prim_id, rotate;

      if (ctx->screen->info.chip_class >= GFX9) {
         for (unsigned i = 0; i < 3; i++) {
            vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
            vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
         }
      } else {
         for (unsigned i = 0; i < 6; i++)
            vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
      }

      prim_id = LLVMGetParam(func, num_sgprs + 2);
      rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");

      for (unsigned i = 0; i < 6; ++i) {
         LLVMValueRef base, rotated;
         base = vtx_in[i];
         rotated = vtx_in[(i + 4) % 6];
         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         for (unsigned i = 0; i < 3; i++) {
            LLVMValueRef hi, out;

            hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), "");
            out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
            out = ac_to_float(&ctx->ac, out);
            ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, "");
         }
      } else {
         for (unsigned i = 0; i < 6; i++) {
            LLVMValueRef out;

            out = ac_to_float(&ctx->ac, vtx_out[i]);
            ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, "");
         }
      }
   }

   LLVMBuildRet(builder, ret);
}

void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
{
   ctx->abi.load_inputs = si_nir_load_input_gs;
   ctx->abi.emit_vertex = si_llvm_emit_vertex;
   ctx->abi.emit_primitive = si_llvm_emit_primitive;
   ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
}