radeonsi/gfx10: fix the wave size for compute-based culling
[mesa.git] / src / gallium / drivers / radeonsi / si_shader_llvm_gs.c
/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"
#include "util/u_memory.h"

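/* On GFX9+, ES and GS run as one merged hardware stage and the SPI packs
 * per-wave information into the merged_wave_info SGPR. As unpacked throughout
 * this file: bits [0:8) = number of ES threads, [8:16) = number of GS threads,
 * [16:24) = GS wave ID for sendmsg, [24:28) = wave index in the threadgroup.
 */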
LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute ES work. */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
                        ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
}

LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute GS work. */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
                        ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
}

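/* Load one channel of a GS input (i.e. an output written by the ES stage) for
 * the given input vertex. On GFX9+ the ESGS ring lives in LDS; on GFX6-8 it is
 * a buffer in memory. A swizzle of ~0 means "load all four channels".
 */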
static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
                                          unsigned input_index,
                                          unsigned vtx_offset_param,
                                          LLVMTypeRef type,
                                          unsigned swizzle)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *shader = ctx->shader;
   LLVMValueRef vtx_offset, soffset;
   struct si_shader_info *info = &shader->selector->info;
   unsigned semantic_name = info->input_semantic_name[input_index];
   unsigned semantic_index = info->input_semantic_index[input_index];
   unsigned param;
   LLVMValueRef value;

   param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);

   /* GFX9 has the ESGS ring in LDS. */
   if (ctx->screen->info.chip_class >= GFX9) {
      unsigned index = vtx_offset_param;

      switch (index / 2) {
      case 0:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset,
                                      index % 2 ? 16 : 0, 16);
         break;
      case 1:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset,
                                      index % 2 ? 16 : 0, 16);
         break;
      case 2:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset,
                                      index % 2 ? 16 : 0, 16);
         break;
      default:
         assert(0);
         return NULL;
      }

      unsigned offset = param * 4 + swizzle;
      vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
                                LLVMConstInt(ctx->ac.i32, offset, false), "");

      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
      LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
      if (ac_get_type_size(type) == 8) {
         ptr = LLVMBuildGEP(ctx->ac.builder, ptr,
                            &ctx->ac.i32_1, 1, "");
         LLVMValueRef values[2] = {
            value,
            LLVMBuildLoad(ctx->ac.builder, ptr, "")
         };
         value = ac_build_gather_values(&ctx->ac, values, 2);
      }
      return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
   }

   /* GFX6: input load from the ESGS ring in memory. */
   if (swizzle == ~0) {
      LLVMValueRef values[4];
      unsigned chan;
      for (chan = 0; chan < 4; chan++) {
         values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
                                              type, chan);
      }
      return ac_build_gather_values(&ctx->ac, values, 4);
   }

   /* Get the vertex offset parameter on GFX6. */
   LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac,
                                           ctx->gs_vtx_offset[vtx_offset_param]);

   vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
                             LLVMConstInt(ctx->ac.i32, 4, 0), "");

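   /* In the swizzled ESGS ring, each attribute dword slot occupies 64
    * consecutive dwords (one per lane of a 64-wide GFX6-8 wave), so slots
    * are 64 * 4 = 256 bytes apart in the scalar offset below.
    */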
   soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);

   value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0,
                                vtx_offset, soffset, 0, ac_glc, true, false);
   if (ac_get_type_size(type) == 8) {
      LLVMValueRef value2;
      soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0);

      value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
                                    ctx->ac.i32_0, vtx_offset, soffset,
                                    0, ac_glc, true, false);
      return si_build_gather_64bit(ctx, type, value, value2);
   }
   return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
}

static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
                                         unsigned location,
                                         unsigned driver_location,
                                         unsigned component,
                                         unsigned num_components,
                                         unsigned vertex_index,
                                         unsigned const_index,
                                         LLVMTypeRef type)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   LLVMValueRef value[4];
   for (unsigned i = 0; i < num_components; i++) {
      unsigned offset = i;
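      /* 64-bit components occupy two consecutive 32-bit slots each. */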
      if (ac_get_type_size(type) == 8)
         offset *= 2;

      offset += component;
      value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index,
                                                   vertex_index, type, offset);
   }

   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}

/* Pass GS inputs from ES to GS on GFX9. */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
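   /* The merged ES+GS wrapper consumes these return values and feeds them to
    * the GS part. The first eight slots carry the per-wave system SGPRs
    * (note merged_wave_info at slot 3 and the scratch offset at slot 5);
    * user SGPRs start at slot 8, followed by the GS input VGPRs.
    */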
   LLVMValueRef ret = ctx->return_value;

   ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
   ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
   if (ctx->shader->key.as_ngg)
      ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
   else
      ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
   ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
   ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);

   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
                             8 + SI_SGPR_RW_BUFFERS);
   ret = si_insert_input_ptr(ctx, ret,
                             ctx->bindless_samplers_and_images,
                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
   if (ctx->screen->use_ngg) {
      ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
                                8 + SI_SGPR_VS_STATE_BITS);
   }

   unsigned vgpr;
   if (ctx->type == PIPE_SHADER_VERTEX)
      vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
   else
      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;

   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
   ctx->return_value = ret;
}

void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                              LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *es = ctx->shader;
   struct si_shader_info *info = &es->selector->info;
   LLVMValueRef lds_base = NULL;
   unsigned chan;
   int i;

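   /* On GFX9+ each ES thread stores its outputs contiguously in LDS: the
    * base is the thread's index within the threadgroup (wave index times
    * wave size, OR'd with the lane id) times the per-vertex ESGS item size
    * in dwords.
    */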
   if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
      unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
      LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
      LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
      vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
                               LLVMBuildMul(ctx->ac.builder, wave_idx,
                                            LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""), "");
      lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
                              LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
   }

   for (i = 0; i < info->num_outputs; i++) {
      int param;

      if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
          info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
         continue;

      param = si_shader_io_get_unique_index(info->output_semantic_name[i],
                                            info->output_semantic_index[i], false);

      for (chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         out_val = ac_to_integer(&ctx->ac, out_val);

         /* GFX9 has the ESGS ring in LDS. */
         if (ctx->screen->info.chip_class >= GFX9) {
            LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
            idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
            ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
            continue;
         }

         ac_build_buffer_store_dword(&ctx->ac,
                                     ctx->esgs_ring,
                                     out_val, 1, NULL,
                                     ac_get_arg(&ctx->ac, ctx->es2gs_offset),
                                     (4 * param + chan) * 4,
                                     ac_glc | ac_slc | ac_swizzled);
      }
   }

   if (ctx->screen->info.chip_class >= GFX9)
      si_set_es_return_value_for_gs(ctx);
}

static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class >= GFX9)
      return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
   else
      return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
}

static void emit_gs_epilogue(struct si_shader_context *ctx)
{
   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_epilogue(ctx);
      return;
   }

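   /* On GFX10, a release fence makes the outstanding ring stores visible
    * before GS_DONE is signaled.
    */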
   if (ctx->screen->info.chip_class >= GFX10)
      LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");

   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
                    si_get_gs_wave_id(ctx));

   if (ctx->screen->info.chip_class >= GFX9)
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
}

static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
                                     unsigned max_outputs,
                                     LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info UNUSED *info = &ctx->shader->selector->info;

   assert(info->num_outputs <= max_outputs);

   emit_gs_epilogue(ctx);
}

/* Emit one vertex from the geometry shader */
static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
                                unsigned stream,
                                LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
      return;
   }

   struct si_shader_info *info = &ctx->shader->selector->info;
   struct si_shader *shader = ctx->shader;
   LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
   LLVMValueRef gs_next_vertex;
   LLVMValueRef can_emit;
   unsigned chan, offset;
   int i;

   /* Write vertex attribute values to GSVS ring */
   gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
                                  ctx->gs_next_vertex[stream],
                                  "");

   /* If this thread has already emitted the declared maximum number of
    * vertices, skip the write: excessive vertex emissions are not
    * supposed to have any effect.
    *
    * If the shader has no writes to memory, kill it instead. This skips
    * further memory loads and may allow LLVM to skip to the end
    * altogether.
    */
   can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
                            LLVMConstInt(ctx->ac.i32,
                                         shader->selector->gs_max_out_vertices, 0), "");

   bool use_kill = !info->writes_memory;
   if (use_kill) {
      ac_build_kill_if_false(&ctx->ac, can_emit);
   } else {
      ac_build_ifcc(&ctx->ac, can_emit, 6505);
   }

   offset = 0;
   for (i = 0; i < info->num_outputs; i++) {
      for (chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)) ||
             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
            continue;

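         /* The GSVS ring is component-major: component slot N of all
          * gs_max_out_vertices vertices precedes slot N+1, so the dword
          * index is (slot * max_vertices + current_vertex). The swizzled
          * descriptor built in si_preload_gs_rings interleaves the lanes.
          */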
         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         LLVMValueRef voffset =
            LLVMConstInt(ctx->ac.i32, offset *
                         shader->selector->gs_max_out_vertices, 0);
         offset++;

         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
         voffset = LLVMBuildMul(ctx->ac.builder, voffset,
                                LLVMConstInt(ctx->ac.i32, 4, 0), "");

         out_val = ac_to_integer(&ctx->ac, out_val);

         ac_build_buffer_store_dword(&ctx->ac,
                                     ctx->gsvs_ring[stream],
                                     out_val, 1,
                                     voffset, soffset, 0,
                                     ac_glc | ac_slc | ac_swizzled);
      }
   }

   gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
   LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

   /* Signal vertex emission if vertex data was written. */
   if (offset) {
      ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                       si_get_gs_wave_id(ctx));
   }

   if (!use_kill)
      ac_build_endif(&ctx->ac, 6505);
}

/* Cut one primitive from the geometry shader */
static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
                                   unsigned stream)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

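   /* With NGG, primitives are emitted as their vertices complete, so a cut
    * only needs to reset the current-primitive vertex counter.
    */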
   if (ctx->shader->key.as_ngg) {
      LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
      return;
   }

   /* Signal primitive cut */
   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
                    si_get_gs_wave_id(ctx));
}

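/* Set up the ESGS ring: a buffer descriptor loaded from the RW buffer list on
 * GFX6-8, or the LDS-based ring on GFX9+.
 */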
void si_preload_esgs_ring(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class <= GFX8) {
      unsigned ring =
         ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
                                           : SI_ES_RING_ESGS;
      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);

      ctx->esgs_ring =
         ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
   } else {
      if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
         /* Declare the ESGS ring as an explicit LDS symbol. */
         si_llvm_declare_esgs_ring(ctx);
      } else {
         ac_declare_lds_as_pointer(&ctx->ac);
         ctx->esgs_ring = ctx->ac.lds;
      }
   }
}

void si_preload_gs_rings(struct si_shader_context *ctx)
{
   const struct si_shader_selector *sel = ctx->shader->selector;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
   LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

   /* The conceptual layout of the GSVS ring is
    *   v0c0 .. vLc0 v0c1 .. vLc1 ..
    * but the real memory layout is swizzled across
    * threads:
    *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
    *   t16v0c0 ..
    * Override the buffer descriptor accordingly.
    */
   LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
   uint64_t stream_offset = 0;

   for (unsigned stream = 0; stream < 4; ++stream) {
      unsigned num_components;
      unsigned stride;
      unsigned num_records;
      LLVMValueRef ring, tmp;

      num_components = sel->info.num_stream_output_components[stream];
      if (!num_components)
         continue;

      stride = 4 * num_components * sel->gs_max_out_vertices;

      /* Limit on the stride field for <= GFX7. */
      assert(stride < (1 << 14));

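      /* num_records is the wave size: with swizzling enabled, the descriptor
       * covers one record of 'stride' bytes per lane, and each stream's
       * sub-ring begins stride * wave_size bytes after the previous one
       * (see stream_offset below).
       */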
      num_records = ctx->ac.wave_size;

      ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
      tmp = LLVMBuildAdd(builder, tmp,
                         LLVMConstInt(ctx->ac.i64,
                                      stream_offset, 0), "");
      stream_offset += stride * ctx->ac.wave_size;

      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
      ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
      tmp = LLVMBuildOr(builder, tmp,
                        LLVMConstInt(ctx->ac.i32,
                                     S_008F04_STRIDE(stride) |
                                     S_008F04_SWIZZLE_ENABLE(1), 0), "");
      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
      ring = LLVMBuildInsertElement(builder, ring,
                                    LLVMConstInt(ctx->ac.i32, num_records, 0),
                                    LLVMConstInt(ctx->ac.i32, 2, 0), "");

      uint32_t rsrc3 =
         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
         S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
         S_008F0C_ADD_TID_ENABLE(1);

      if (ctx->ac.chip_class >= GFX10) {
         rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
                  S_008F0C_RESOURCE_LEVEL(1);
      } else {
         rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
                  S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
      }

      ring = LLVMBuildInsertElement(builder, ring,
                                    LLVMConstInt(ctx->ac.i32, rsrc3, false),
                                    LLVMConstInt(ctx->ac.i32, 3, 0), "");

      ctx->gsvs_ring[stream] = ring;
   }
}

/* Generate code for the hardware VS shader stage to go with a geometry shader */
struct si_shader *
si_generate_gs_copy_shader(struct si_screen *sscreen,
                           struct ac_llvm_compiler *compiler,
                           struct si_shader_selector *gs_selector,
                           struct pipe_debug_callback *debug)
{
   struct si_shader_context ctx;
   struct si_shader *shader;
   LLVMBuilderRef builder;
   struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
   struct si_shader_info *gsinfo = &gs_selector->info;
   int i;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;

   si_llvm_context_init(&ctx, sscreen, compiler,
                        si_get_wave_size(sscreen, PIPE_SHADER_VERTEX,
                                         false, false, false));
   ctx.shader = shader;
   ctx.type = PIPE_SHADER_VERTEX;

   builder = ctx.ac.builder;

   si_create_function(&ctx, false);

   LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
   ctx.gsvs_ring[0] = ac_build_load_to_sgpr(&ctx.ac, buf_ptr,
                                            LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));

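   /* Each hardware-VS thread copies one GS output vertex: vertex_id selects
    * the vertex, and the swizzled GSVS ring descriptor handles the per-lane
    * layout, so the vertex offset is just vertex_id * 4 bytes.
    */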
   LLVMValueRef voffset =
      LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
                   LLVMConstInt(ctx.ac.i32, 4, 0), "");

   /* Fetch the vertex stream ID. */
   LLVMValueRef stream_id;

   if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
      stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
   else
      stream_id = ctx.ac.i32_0;

   /* Fill in output information. */
   for (i = 0; i < gsinfo->num_outputs; ++i) {
      outputs[i].semantic_name = gsinfo->output_semantic_name[i];
      outputs[i].semantic_index = gsinfo->output_semantic_index[i];

      for (int chan = 0; chan < 4; chan++) {
         outputs[i].vertex_stream[chan] =
            (gsinfo->output_streams[i] >> (2 * chan)) & 3;
      }
   }

   LLVMBasicBlockRef end_bb;
   LLVMValueRef switch_inst;

   end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
   switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

   for (int stream = 0; stream < 4; stream++) {
      LLVMBasicBlockRef bb;
      unsigned offset;

      if (!gsinfo->num_stream_output_components[stream])
         continue;

      if (stream > 0 && !gs_selector->so.num_outputs)
         continue;

      bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
      LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
      LLVMPositionBuilderAtEnd(builder, bb);

      /* Fetch vertex data from the GSVS ring. */
      offset = 0;
      for (i = 0; i < gsinfo->num_outputs; ++i) {
         for (unsigned chan = 0; chan < 4; chan++) {
            if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
                outputs[i].vertex_stream[chan] != stream) {
               outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
               continue;
            }

            LLVMValueRef soffset = LLVMConstInt(ctx.ac.i32,
               offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
            offset++;

            outputs[i].values[chan] =
               ac_build_buffer_load(&ctx.ac,
                                    ctx.gsvs_ring[0], 1,
                                    ctx.ac.i32_0, voffset,
                                    soffset, 0, ac_glc | ac_slc,
                                    true, false);
         }
      }

      /* Streamout and exports. */
      if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
         si_llvm_emit_streamout(&ctx, outputs,
                                gsinfo->num_outputs,
                                stream);
      }

      if (stream == 0)
         si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);

      LLVMBuildBr(builder, end_bb);
   }

   LLVMPositionBuilderAtEnd(builder, end_bb);

   LLVMBuildRetVoid(ctx.ac.builder);

   ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
   si_llvm_optimize_module(&ctx);

   bool ok = false;
   if (si_compile_llvm(sscreen, &ctx.shader->binary,
                       &ctx.shader->config, ctx.compiler, &ctx.ac,
                       debug, PIPE_SHADER_GEOMETRY,
                       "GS Copy Shader", false)) {
      if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
         fprintf(stderr, "GS Copy Shader:\n");
      si_shader_dump(sscreen, ctx.shader, debug, stderr, true);

      if (!ctx.shader->config.scratch_bytes_per_wave)
         ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
      else
         ok = true;
   }

   si_llvm_dispose(&ctx);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}

/**
 * Build the GS prolog function. Rotate the input vertices for triangle strips
 * with adjacency.
 */
void si_llvm_build_gs_prolog(struct si_shader_context *ctx,
                             union si_shader_part_key *key)
{
   unsigned num_sgprs, num_vgprs;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMTypeRef returns[AC_MAX_ARGS];
   LLVMValueRef func, ret;

   memset(&ctx->args, 0, sizeof(ctx->args));

   if (ctx->screen->info.chip_class >= GFX9) {
      if (key->gs_prolog.states.gfx9_prev_is_vs)
         num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
      else
         num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
      num_vgprs = 5; /* ES inputs are not needed by GS */
   } else {
      num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
      num_vgprs = 8;
   }

   for (unsigned i = 0; i < num_sgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      returns[i] = ctx->ac.i32;
   }

   for (unsigned i = 0; i < num_vgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
      returns[num_sgprs + i] = ctx->ac.f32;
   }

   /* Create the function. */
   si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
   func = ctx->main_fn;

   /* Set the full EXEC mask for the prolog, because we are only fiddling
    * with registers here. The main shader part will set the correct EXEC
    * mask.
    */
   if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
      ac_init_exec_full_mask(&ctx->ac);

   /* Copy inputs to outputs. This should be a no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (unsigned i = 0; i < num_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(builder, ret, p, i, "");
   }
   for (unsigned i = 0; i < num_vgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
   }

   if (key->gs_prolog.states.tri_strip_adj_fix) {
      /* Remap the input vertices for every other primitive. */
      const struct ac_arg gfx6_vtx_params[6] = {
         { .used = true, .arg_index = num_sgprs },
         { .used = true, .arg_index = num_sgprs + 1 },
         { .used = true, .arg_index = num_sgprs + 3 },
         { .used = true, .arg_index = num_sgprs + 4 },
         { .used = true, .arg_index = num_sgprs + 5 },
         { .used = true, .arg_index = num_sgprs + 6 },
      };
      const struct ac_arg gfx9_vtx_params[3] = {
         { .used = true, .arg_index = num_sgprs },
         { .used = true, .arg_index = num_sgprs + 1 },
         { .used = true, .arg_index = num_sgprs + 4 },
      };
      LLVMValueRef vtx_in[6], vtx_out[6];
      LLVMValueRef prim_id, rotate;

      if (ctx->screen->info.chip_class >= GFX9) {
         for (unsigned i = 0; i < 3; i++) {
            vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
            vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
         }
      } else {
         for (unsigned i = 0; i < 6; i++)
            vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
      }

      prim_id = LLVMGetParam(func, num_sgprs + 2);
      rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");

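      /* The low bit of the primitive ID selects the rotated order: odd
       * primitives take their vertices rotated by 4 (equivalently back
       * by 2), which restores the vertex order the GS expects for every
       * other primitive in a strip with adjacency.
       */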
      for (unsigned i = 0; i < 6; ++i) {
         LLVMValueRef base, rotated;
         base = vtx_in[i];
         rotated = vtx_in[(i + 4) % 6];
         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         for (unsigned i = 0; i < 3; i++) {
            LLVMValueRef hi, out;

            hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1],
                              LLVMConstInt(ctx->ac.i32, 16, 0), "");
            out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
            out = ac_to_float(&ctx->ac, out);
            ret = LLVMBuildInsertValue(builder, ret, out,
                                       gfx9_vtx_params[i].arg_index, "");
         }
      } else {
         for (unsigned i = 0; i < 6; i++) {
            LLVMValueRef out;

            out = ac_to_float(&ctx->ac, vtx_out[i]);
            ret = LLVMBuildInsertValue(builder, ret, out,
                                       gfx6_vtx_params[i].arg_index, "");
         }
      }
   }

   LLVMBuildRet(builder, ret);
}

void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
{
   ctx->abi.load_inputs = si_nir_load_input_gs;
   ctx->abi.emit_vertex = si_llvm_emit_vertex;
   ctx->abi.emit_primitive = si_llvm_emit_primitive;
   ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
}