/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
26 #include "si_shader_internal.h"
28 #include "util/u_memory.h"
30 LLVMValueRef
si_is_es_thread(struct si_shader_context
*ctx
)
32 /* Return true if the current thread should execute an ES thread. */
33 return LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntULT
, ac_get_thread_id(&ctx
->ac
),
34 si_unpack_param(ctx
, ctx
->merged_wave_info
, 0, 8), "");
37 LLVMValueRef
si_is_gs_thread(struct si_shader_context
*ctx
)
39 /* Return true if the current thread should execute a GS thread. */
40 return LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntULT
, ac_get_thread_id(&ctx
->ac
),
41 si_unpack_param(ctx
, ctx
->merged_wave_info
, 8, 8), "");
44 static LLVMValueRef
si_llvm_load_input_gs(struct ac_shader_abi
*abi
, unsigned input_index
,
45 unsigned vtx_offset_param
, LLVMTypeRef type
,
48 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
49 struct si_shader
*shader
= ctx
->shader
;
50 LLVMValueRef vtx_offset
, soffset
;
51 struct si_shader_info
*info
= &shader
->selector
->info
;
55 param
= si_shader_io_get_unique_index(info
->input_semantic
[input_index
], false);
57 /* GFX9 has the ESGS ring in LDS. */
58 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
59 unsigned index
= vtx_offset_param
;
63 vtx_offset
= si_unpack_param(ctx
, ctx
->gs_vtx01_offset
, index
% 2 ? 16 : 0, 16);
66 vtx_offset
= si_unpack_param(ctx
, ctx
->gs_vtx23_offset
, index
% 2 ? 16 : 0, 16);
69 vtx_offset
= si_unpack_param(ctx
, ctx
->gs_vtx45_offset
, index
% 2 ? 16 : 0, 16);
76 unsigned offset
= param
* 4 + swizzle
;
78 LLVMBuildAdd(ctx
->ac
.builder
, vtx_offset
, LLVMConstInt(ctx
->ac
.i32
, offset
, false), "");
80 LLVMValueRef ptr
= ac_build_gep0(&ctx
->ac
, ctx
->esgs_ring
, vtx_offset
);
81 LLVMValueRef value
= LLVMBuildLoad(ctx
->ac
.builder
, ptr
, "");
82 if (ac_get_type_size(type
) == 8) {
83 ptr
= LLVMBuildGEP(ctx
->ac
.builder
, ptr
, &ctx
->ac
.i32_1
, 1, "");
84 LLVMValueRef values
[2] = {value
, LLVMBuildLoad(ctx
->ac
.builder
, ptr
, "")};
85 value
= ac_build_gather_values(&ctx
->ac
, values
, 2);
87 return LLVMBuildBitCast(ctx
->ac
.builder
, value
, type
, "");
90 /* GFX6: input load from the ESGS ring in memory. */
92 LLVMValueRef values
[4];
94 for (chan
= 0; chan
< 4; chan
++) {
95 values
[chan
] = si_llvm_load_input_gs(abi
, input_index
, vtx_offset_param
, type
, chan
);
97 return ac_build_gather_values(&ctx
->ac
, values
, 4);
100 /* Get the vertex offset parameter on GFX6. */
101 LLVMValueRef gs_vtx_offset
= ac_get_arg(&ctx
->ac
, ctx
->gs_vtx_offset
[vtx_offset_param
]);
103 vtx_offset
= LLVMBuildMul(ctx
->ac
.builder
, gs_vtx_offset
, LLVMConstInt(ctx
->ac
.i32
, 4, 0), "");
105 soffset
= LLVMConstInt(ctx
->ac
.i32
, (param
* 4 + swizzle
) * 256, 0);
107 value
= ac_build_buffer_load(&ctx
->ac
, ctx
->esgs_ring
, 1, ctx
->ac
.i32_0
, vtx_offset
, soffset
, 0,
108 ac_glc
, true, false);
109 if (ac_get_type_size(type
) == 8) {
111 soffset
= LLVMConstInt(ctx
->ac
.i32
, (param
* 4 + swizzle
+ 1) * 256, 0);
113 value2
= ac_build_buffer_load(&ctx
->ac
, ctx
->esgs_ring
, 1, ctx
->ac
.i32_0
, vtx_offset
, soffset
,
114 0, ac_glc
, true, false);
115 return si_build_gather_64bit(ctx
, type
, value
, value2
);
117 return LLVMBuildBitCast(ctx
->ac
.builder
, value
, type
, "");
120 static LLVMValueRef
si_nir_load_input_gs(struct ac_shader_abi
*abi
, unsigned location
,
121 unsigned driver_location
, unsigned component
,
122 unsigned num_components
, unsigned vertex_index
,
123 unsigned const_index
, LLVMTypeRef type
)
125 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
127 LLVMValueRef value
[4];
128 for (unsigned i
= 0; i
< num_components
; i
++) {
130 if (ac_get_type_size(type
) == 8)
134 value
[i
+ component
] = si_llvm_load_input_gs(&ctx
->abi
, driver_location
/ 4 + const_index
,
135 vertex_index
, type
, offset
);
138 return ac_build_varying_gather_values(&ctx
->ac
, value
, num_components
, component
);
141 /* Pass GS inputs from ES to GS on GFX9. */
142 static void si_set_es_return_value_for_gs(struct si_shader_context
*ctx
)
144 LLVMValueRef ret
= ctx
->return_value
;
146 ret
= si_insert_input_ptr(ctx
, ret
, ctx
->other_const_and_shader_buffers
, 0);
147 ret
= si_insert_input_ptr(ctx
, ret
, ctx
->other_samplers_and_images
, 1);
148 if (ctx
->shader
->key
.as_ngg
)
149 ret
= si_insert_input_ptr(ctx
, ret
, ctx
->gs_tg_info
, 2);
151 ret
= si_insert_input_ret(ctx
, ret
, ctx
->gs2vs_offset
, 2);
152 ret
= si_insert_input_ret(ctx
, ret
, ctx
->merged_wave_info
, 3);
153 ret
= si_insert_input_ret(ctx
, ret
, ctx
->merged_scratch_offset
, 5);
155 ret
= si_insert_input_ptr(ctx
, ret
, ctx
->rw_buffers
, 8 + SI_SGPR_RW_BUFFERS
);
156 ret
= si_insert_input_ptr(ctx
, ret
, ctx
->bindless_samplers_and_images
,
157 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES
);
158 if (ctx
->screen
->use_ngg
) {
159 ret
= si_insert_input_ptr(ctx
, ret
, ctx
->vs_state_bits
, 8 + SI_SGPR_VS_STATE_BITS
);
163 if (ctx
->stage
== MESA_SHADER_VERTEX
)
164 vgpr
= 8 + GFX9_VSGS_NUM_USER_SGPR
;
166 vgpr
= 8 + GFX9_TESGS_NUM_USER_SGPR
;
168 ret
= si_insert_input_ret_float(ctx
, ret
, ctx
->gs_vtx01_offset
, vgpr
++);
169 ret
= si_insert_input_ret_float(ctx
, ret
, ctx
->gs_vtx23_offset
, vgpr
++);
170 ret
= si_insert_input_ret_float(ctx
, ret
, ctx
->args
.gs_prim_id
, vgpr
++);
171 ret
= si_insert_input_ret_float(ctx
, ret
, ctx
->args
.gs_invocation_id
, vgpr
++);
172 ret
= si_insert_input_ret_float(ctx
, ret
, ctx
->gs_vtx45_offset
, vgpr
++);
173 ctx
->return_value
= ret
;
176 void si_llvm_emit_es_epilogue(struct ac_shader_abi
*abi
, unsigned max_outputs
, LLVMValueRef
*addrs
)
178 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
179 struct si_shader
*es
= ctx
->shader
;
180 struct si_shader_info
*info
= &es
->selector
->info
;
181 LLVMValueRef lds_base
= NULL
;
185 if (ctx
->screen
->info
.chip_class
>= GFX9
&& info
->num_outputs
) {
186 unsigned itemsize_dw
= es
->selector
->esgs_itemsize
/ 4;
187 LLVMValueRef vertex_idx
= ac_get_thread_id(&ctx
->ac
);
188 LLVMValueRef wave_idx
= si_unpack_param(ctx
, ctx
->merged_wave_info
, 24, 4);
190 LLVMBuildOr(ctx
->ac
.builder
, vertex_idx
,
191 LLVMBuildMul(ctx
->ac
.builder
, wave_idx
,
192 LLVMConstInt(ctx
->ac
.i32
, ctx
->ac
.wave_size
, false), ""),
195 LLVMBuildMul(ctx
->ac
.builder
, vertex_idx
, LLVMConstInt(ctx
->ac
.i32
, itemsize_dw
, 0), "");
198 for (i
= 0; i
< info
->num_outputs
; i
++) {
201 if (info
->output_semantic
[i
] == VARYING_SLOT_VIEWPORT
||
202 info
->output_semantic
[i
] == VARYING_SLOT_LAYER
)
205 param
= si_shader_io_get_unique_index(info
->output_semantic
[i
], false);
207 for (chan
= 0; chan
< 4; chan
++) {
208 if (!(info
->output_usagemask
[i
] & (1 << chan
)))
211 LLVMValueRef out_val
= LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ chan
], "");
212 out_val
= ac_to_integer(&ctx
->ac
, out_val
);
214 /* GFX9 has the ESGS ring in LDS. */
215 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
216 LLVMValueRef idx
= LLVMConstInt(ctx
->ac
.i32
, param
* 4 + chan
, false);
217 idx
= LLVMBuildAdd(ctx
->ac
.builder
, lds_base
, idx
, "");
218 ac_build_indexed_store(&ctx
->ac
, ctx
->esgs_ring
, idx
, out_val
);
222 ac_build_buffer_store_dword(&ctx
->ac
, ctx
->esgs_ring
, out_val
, 1, NULL
,
223 ac_get_arg(&ctx
->ac
, ctx
->es2gs_offset
),
224 (4 * param
+ chan
) * 4, ac_glc
| ac_slc
| ac_swizzled
);
228 if (ctx
->screen
->info
.chip_class
>= GFX9
)
229 si_set_es_return_value_for_gs(ctx
);
232 static LLVMValueRef
si_get_gs_wave_id(struct si_shader_context
*ctx
)
234 if (ctx
->screen
->info
.chip_class
>= GFX9
)
235 return si_unpack_param(ctx
, ctx
->merged_wave_info
, 16, 8);
237 return ac_get_arg(&ctx
->ac
, ctx
->gs_wave_id
);
240 static void emit_gs_epilogue(struct si_shader_context
*ctx
)
242 if (ctx
->shader
->key
.as_ngg
) {
243 gfx10_ngg_gs_emit_epilogue(ctx
);
247 if (ctx
->screen
->info
.chip_class
>= GFX10
)
248 LLVMBuildFence(ctx
->ac
.builder
, LLVMAtomicOrderingRelease
, false, "");
250 ac_build_sendmsg(&ctx
->ac
, AC_SENDMSG_GS_OP_NOP
| AC_SENDMSG_GS_DONE
, si_get_gs_wave_id(ctx
));
252 if (ctx
->screen
->info
.chip_class
>= GFX9
)
253 ac_build_endif(&ctx
->ac
, ctx
->merged_wrap_if_label
);
256 static void si_llvm_emit_gs_epilogue(struct ac_shader_abi
*abi
, unsigned max_outputs
,
259 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
260 struct si_shader_info UNUSED
*info
= &ctx
->shader
->selector
->info
;
262 assert(info
->num_outputs
<= max_outputs
);
264 emit_gs_epilogue(ctx
);
267 /* Emit one vertex from the geometry shader */
268 static void si_llvm_emit_vertex(struct ac_shader_abi
*abi
, unsigned stream
, LLVMValueRef
*addrs
)
270 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
272 if (ctx
->shader
->key
.as_ngg
) {
273 gfx10_ngg_gs_emit_vertex(ctx
, stream
, addrs
);
277 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
278 struct si_shader
*shader
= ctx
->shader
;
279 LLVMValueRef soffset
= ac_get_arg(&ctx
->ac
, ctx
->gs2vs_offset
);
280 LLVMValueRef gs_next_vertex
;
281 LLVMValueRef can_emit
;
282 unsigned chan
, offset
;
285 /* Write vertex attribute values to GSVS ring */
286 gs_next_vertex
= LLVMBuildLoad(ctx
->ac
.builder
, ctx
->gs_next_vertex
[stream
], "");
288 /* If this thread has already emitted the declared maximum number of
289 * vertices, skip the write: excessive vertex emissions are not
290 * supposed to have any effect.
292 * If the shader has no writes to memory, kill it instead. This skips
293 * further memory loads and may allow LLVM to skip to the end
297 LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntULT
, gs_next_vertex
,
298 LLVMConstInt(ctx
->ac
.i32
, shader
->selector
->gs_max_out_vertices
, 0), "");
300 bool use_kill
= !info
->writes_memory
;
302 ac_build_kill_if_false(&ctx
->ac
, can_emit
);
304 ac_build_ifcc(&ctx
->ac
, can_emit
, 6505);
308 for (i
= 0; i
< info
->num_outputs
; i
++) {
309 for (chan
= 0; chan
< 4; chan
++) {
310 if (!(info
->output_usagemask
[i
] & (1 << chan
)) ||
311 ((info
->output_streams
[i
] >> (2 * chan
)) & 3) != stream
)
314 LLVMValueRef out_val
= LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ chan
], "");
315 LLVMValueRef voffset
=
316 LLVMConstInt(ctx
->ac
.i32
, offset
* shader
->selector
->gs_max_out_vertices
, 0);
319 voffset
= LLVMBuildAdd(ctx
->ac
.builder
, voffset
, gs_next_vertex
, "");
320 voffset
= LLVMBuildMul(ctx
->ac
.builder
, voffset
, LLVMConstInt(ctx
->ac
.i32
, 4, 0), "");
322 out_val
= ac_to_integer(&ctx
->ac
, out_val
);
324 ac_build_buffer_store_dword(&ctx
->ac
, ctx
->gsvs_ring
[stream
], out_val
, 1, voffset
, soffset
,
325 0, ac_glc
| ac_slc
| ac_swizzled
);
329 gs_next_vertex
= LLVMBuildAdd(ctx
->ac
.builder
, gs_next_vertex
, ctx
->ac
.i32_1
, "");
330 LLVMBuildStore(ctx
->ac
.builder
, gs_next_vertex
, ctx
->gs_next_vertex
[stream
]);
332 /* Signal vertex emission if vertex data was written. */
334 ac_build_sendmsg(&ctx
->ac
, AC_SENDMSG_GS_OP_EMIT
| AC_SENDMSG_GS
| (stream
<< 8),
335 si_get_gs_wave_id(ctx
));
339 ac_build_endif(&ctx
->ac
, 6505);
342 /* Cut one primitive from the geometry shader */
343 static void si_llvm_emit_primitive(struct ac_shader_abi
*abi
, unsigned stream
)
345 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
347 if (ctx
->shader
->key
.as_ngg
) {
348 LLVMBuildStore(ctx
->ac
.builder
, ctx
->ac
.i32_0
, ctx
->gs_curprim_verts
[stream
]);
352 /* Signal primitive cut */
353 ac_build_sendmsg(&ctx
->ac
, AC_SENDMSG_GS_OP_CUT
| AC_SENDMSG_GS
| (stream
<< 8),
354 si_get_gs_wave_id(ctx
));
357 void si_preload_esgs_ring(struct si_shader_context
*ctx
)
359 if (ctx
->screen
->info
.chip_class
<= GFX8
) {
360 unsigned ring
= ctx
->stage
== MESA_SHADER_GEOMETRY
? SI_GS_RING_ESGS
: SI_ES_RING_ESGS
;
361 LLVMValueRef offset
= LLVMConstInt(ctx
->ac
.i32
, ring
, 0);
362 LLVMValueRef buf_ptr
= ac_get_arg(&ctx
->ac
, ctx
->rw_buffers
);
364 ctx
->esgs_ring
= ac_build_load_to_sgpr(&ctx
->ac
, buf_ptr
, offset
);
366 if (USE_LDS_SYMBOLS
&& LLVM_VERSION_MAJOR
>= 9) {
367 /* Declare the ESGS ring as an explicit LDS symbol. */
368 si_llvm_declare_esgs_ring(ctx
);
370 ac_declare_lds_as_pointer(&ctx
->ac
);
371 ctx
->esgs_ring
= ctx
->ac
.lds
;
376 void si_preload_gs_rings(struct si_shader_context
*ctx
)
378 const struct si_shader_selector
*sel
= ctx
->shader
->selector
;
379 LLVMBuilderRef builder
= ctx
->ac
.builder
;
380 LLVMValueRef offset
= LLVMConstInt(ctx
->ac
.i32
, SI_RING_GSVS
, 0);
381 LLVMValueRef buf_ptr
= ac_get_arg(&ctx
->ac
, ctx
->rw_buffers
);
382 LLVMValueRef base_ring
= ac_build_load_to_sgpr(&ctx
->ac
, buf_ptr
, offset
);
384 /* The conceptual layout of the GSVS ring is
385 * v0c0 .. vLv0 v0c1 .. vLc1 ..
386 * but the real memory layout is swizzled across
388 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
390 * Override the buffer descriptor accordingly.
392 LLVMTypeRef v2i64
= LLVMVectorType(ctx
->ac
.i64
, 2);
393 uint64_t stream_offset
= 0;
395 for (unsigned stream
= 0; stream
< 4; ++stream
) {
396 unsigned num_components
;
398 unsigned num_records
;
399 LLVMValueRef ring
, tmp
;
401 num_components
= sel
->info
.num_stream_output_components
[stream
];
405 stride
= 4 * num_components
* sel
->gs_max_out_vertices
;
407 /* Limit on the stride field for <= GFX7. */
408 assert(stride
< (1 << 14));
410 num_records
= ctx
->ac
.wave_size
;
412 ring
= LLVMBuildBitCast(builder
, base_ring
, v2i64
, "");
413 tmp
= LLVMBuildExtractElement(builder
, ring
, ctx
->ac
.i32_0
, "");
414 tmp
= LLVMBuildAdd(builder
, tmp
, LLVMConstInt(ctx
->ac
.i64
, stream_offset
, 0), "");
415 stream_offset
+= stride
* ctx
->ac
.wave_size
;
417 ring
= LLVMBuildInsertElement(builder
, ring
, tmp
, ctx
->ac
.i32_0
, "");
418 ring
= LLVMBuildBitCast(builder
, ring
, ctx
->ac
.v4i32
, "");
419 tmp
= LLVMBuildExtractElement(builder
, ring
, ctx
->ac
.i32_1
, "");
422 LLVMConstInt(ctx
->ac
.i32
, S_008F04_STRIDE(stride
) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
423 ring
= LLVMBuildInsertElement(builder
, ring
, tmp
, ctx
->ac
.i32_1
, "");
424 ring
= LLVMBuildInsertElement(builder
, ring
, LLVMConstInt(ctx
->ac
.i32
, num_records
, 0),
425 LLVMConstInt(ctx
->ac
.i32
, 2, 0), "");
428 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X
) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y
) |
429 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z
) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W
) |
430 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
431 S_008F0C_ADD_TID_ENABLE(1);
433 if (ctx
->ac
.chip_class
>= GFX10
) {
434 rsrc3
|= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT
) |
435 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED
) | S_008F0C_RESOURCE_LEVEL(1);
437 rsrc3
|= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT
) |
438 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32
) |
439 S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
442 ring
= LLVMBuildInsertElement(builder
, ring
, LLVMConstInt(ctx
->ac
.i32
, rsrc3
, false),
443 LLVMConstInt(ctx
->ac
.i32
, 3, 0), "");
445 ctx
->gsvs_ring
[stream
] = ring
;
449 /* Generate code for the hardware VS shader stage to go with a geometry shader */
450 struct si_shader
*si_generate_gs_copy_shader(struct si_screen
*sscreen
,
451 struct ac_llvm_compiler
*compiler
,
452 struct si_shader_selector
*gs_selector
,
453 struct pipe_debug_callback
*debug
)
455 struct si_shader_context ctx
;
456 struct si_shader
*shader
;
457 LLVMBuilderRef builder
;
458 struct si_shader_output_values outputs
[SI_MAX_VS_OUTPUTS
];
459 struct si_shader_info
*gsinfo
= &gs_selector
->info
;
462 shader
= CALLOC_STRUCT(si_shader
);
466 /* We can leave the fence as permanently signaled because the GS copy
467 * shader only becomes visible globally after it has been compiled. */
468 util_queue_fence_init(&shader
->ready
);
470 shader
->selector
= gs_selector
;
471 shader
->is_gs_copy_shader
= true;
473 si_llvm_context_init(&ctx
, sscreen
, compiler
,
474 si_get_wave_size(sscreen
, MESA_SHADER_VERTEX
,
475 false, false, false, false));
477 ctx
.stage
= MESA_SHADER_VERTEX
;
479 builder
= ctx
.ac
.builder
;
481 si_create_function(&ctx
, false);
483 LLVMValueRef buf_ptr
= ac_get_arg(&ctx
.ac
, ctx
.rw_buffers
);
485 ac_build_load_to_sgpr(&ctx
.ac
, buf_ptr
, LLVMConstInt(ctx
.ac
.i32
, SI_RING_GSVS
, 0));
487 LLVMValueRef voffset
=
488 LLVMBuildMul(ctx
.ac
.builder
, ctx
.abi
.vertex_id
, LLVMConstInt(ctx
.ac
.i32
, 4, 0), "");
490 /* Fetch the vertex stream ID.*/
491 LLVMValueRef stream_id
;
493 if (!sscreen
->use_ngg_streamout
&& gs_selector
->so
.num_outputs
)
494 stream_id
= si_unpack_param(&ctx
, ctx
.streamout_config
, 24, 2);
496 stream_id
= ctx
.ac
.i32_0
;
498 /* Fill in output information. */
499 for (i
= 0; i
< gsinfo
->num_outputs
; ++i
) {
500 outputs
[i
].semantic
= gsinfo
->output_semantic
[i
];
502 for (int chan
= 0; chan
< 4; chan
++) {
503 outputs
[i
].vertex_stream
[chan
] = (gsinfo
->output_streams
[i
] >> (2 * chan
)) & 3;
507 LLVMBasicBlockRef end_bb
;
508 LLVMValueRef switch_inst
;
510 end_bb
= LLVMAppendBasicBlockInContext(ctx
.ac
.context
, ctx
.main_fn
, "end");
511 switch_inst
= LLVMBuildSwitch(builder
, stream_id
, end_bb
, 4);
513 for (int stream
= 0; stream
< 4; stream
++) {
514 LLVMBasicBlockRef bb
;
517 if (!gsinfo
->num_stream_output_components
[stream
])
520 if (stream
> 0 && !gs_selector
->so
.num_outputs
)
523 bb
= LLVMInsertBasicBlockInContext(ctx
.ac
.context
, end_bb
, "out");
524 LLVMAddCase(switch_inst
, LLVMConstInt(ctx
.ac
.i32
, stream
, 0), bb
);
525 LLVMPositionBuilderAtEnd(builder
, bb
);
527 /* Fetch vertex data from GSVS ring */
529 for (i
= 0; i
< gsinfo
->num_outputs
; ++i
) {
530 for (unsigned chan
= 0; chan
< 4; chan
++) {
531 if (!(gsinfo
->output_usagemask
[i
] & (1 << chan
)) ||
532 outputs
[i
].vertex_stream
[chan
] != stream
) {
533 outputs
[i
].values
[chan
] = LLVMGetUndef(ctx
.ac
.f32
);
537 LLVMValueRef soffset
=
538 LLVMConstInt(ctx
.ac
.i32
, offset
* gs_selector
->gs_max_out_vertices
* 16 * 4, 0);
541 outputs
[i
].values
[chan
] =
542 ac_build_buffer_load(&ctx
.ac
, ctx
.gsvs_ring
[0], 1, ctx
.ac
.i32_0
, voffset
, soffset
, 0,
543 ac_glc
| ac_slc
, true, false);
547 /* Streamout and exports. */
548 if (!sscreen
->use_ngg_streamout
&& gs_selector
->so
.num_outputs
) {
549 si_llvm_emit_streamout(&ctx
, outputs
, gsinfo
->num_outputs
, stream
);
553 si_llvm_build_vs_exports(&ctx
, outputs
, gsinfo
->num_outputs
);
555 LLVMBuildBr(builder
, end_bb
);
558 LLVMPositionBuilderAtEnd(builder
, end_bb
);
560 LLVMBuildRetVoid(ctx
.ac
.builder
);
562 ctx
.stage
= MESA_SHADER_GEOMETRY
; /* override for shader dumping */
563 si_llvm_optimize_module(&ctx
);
566 if (si_compile_llvm(sscreen
, &ctx
.shader
->binary
, &ctx
.shader
->config
, ctx
.compiler
, &ctx
.ac
,
567 debug
, MESA_SHADER_GEOMETRY
, "GS Copy Shader", false)) {
568 if (si_can_dump_shader(sscreen
, MESA_SHADER_GEOMETRY
))
569 fprintf(stderr
, "GS Copy Shader:\n");
570 si_shader_dump(sscreen
, ctx
.shader
, debug
, stderr
, true);
572 if (!ctx
.shader
->config
.scratch_bytes_per_wave
)
573 ok
= si_shader_binary_upload(sscreen
, ctx
.shader
, 0);
578 si_llvm_dispose(&ctx
);
584 si_fix_resource_usage(sscreen
, shader
);
590 * Build the GS prolog function. Rotate the input vertices for triangle strips
593 void si_llvm_build_gs_prolog(struct si_shader_context
*ctx
, union si_shader_part_key
*key
)
595 unsigned num_sgprs
, num_vgprs
;
596 LLVMBuilderRef builder
= ctx
->ac
.builder
;
597 LLVMTypeRef returns
[AC_MAX_ARGS
];
598 LLVMValueRef func
, ret
;
600 memset(&ctx
->args
, 0, sizeof(ctx
->args
));
602 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
603 if (key
->gs_prolog
.states
.gfx9_prev_is_vs
)
604 num_sgprs
= 8 + GFX9_VSGS_NUM_USER_SGPR
;
606 num_sgprs
= 8 + GFX9_TESGS_NUM_USER_SGPR
;
607 num_vgprs
= 5; /* ES inputs are not needed by GS */
609 num_sgprs
= GFX6_GS_NUM_USER_SGPR
+ 2;
613 for (unsigned i
= 0; i
< num_sgprs
; ++i
) {
614 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, NULL
);
615 returns
[i
] = ctx
->ac
.i32
;
618 for (unsigned i
= 0; i
< num_vgprs
; ++i
) {
619 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, NULL
);
620 returns
[num_sgprs
+ i
] = ctx
->ac
.f32
;
623 /* Create the function. */
624 si_llvm_create_func(ctx
, "gs_prolog", returns
, num_sgprs
+ num_vgprs
, 0);
627 /* Set the full EXEC mask for the prolog, because we are only fiddling
628 * with registers here. The main shader part will set the correct EXEC
631 if (ctx
->screen
->info
.chip_class
>= GFX9
&& !key
->gs_prolog
.is_monolithic
)
632 ac_init_exec_full_mask(&ctx
->ac
);
634 /* Copy inputs to outputs. This should be no-op, as the registers match,
635 * but it will prevent the compiler from overwriting them unintentionally.
637 ret
= ctx
->return_value
;
638 for (unsigned i
= 0; i
< num_sgprs
; i
++) {
639 LLVMValueRef p
= LLVMGetParam(func
, i
);
640 ret
= LLVMBuildInsertValue(builder
, ret
, p
, i
, "");
642 for (unsigned i
= 0; i
< num_vgprs
; i
++) {
643 LLVMValueRef p
= LLVMGetParam(func
, num_sgprs
+ i
);
644 p
= ac_to_float(&ctx
->ac
, p
);
645 ret
= LLVMBuildInsertValue(builder
, ret
, p
, num_sgprs
+ i
, "");
648 if (key
->gs_prolog
.states
.tri_strip_adj_fix
) {
649 /* Remap the input vertices for every other primitive. */
650 const struct ac_arg gfx6_vtx_params
[6] = {
651 {.used
= true, .arg_index
= num_sgprs
}, {.used
= true, .arg_index
= num_sgprs
+ 1},
652 {.used
= true, .arg_index
= num_sgprs
+ 3}, {.used
= true, .arg_index
= num_sgprs
+ 4},
653 {.used
= true, .arg_index
= num_sgprs
+ 5}, {.used
= true, .arg_index
= num_sgprs
+ 6},
655 const struct ac_arg gfx9_vtx_params
[3] = {
656 {.used
= true, .arg_index
= num_sgprs
},
657 {.used
= true, .arg_index
= num_sgprs
+ 1},
658 {.used
= true, .arg_index
= num_sgprs
+ 4},
660 LLVMValueRef vtx_in
[6], vtx_out
[6];
661 LLVMValueRef prim_id
, rotate
;
663 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
664 for (unsigned i
= 0; i
< 3; i
++) {
665 vtx_in
[i
* 2] = si_unpack_param(ctx
, gfx9_vtx_params
[i
], 0, 16);
666 vtx_in
[i
* 2 + 1] = si_unpack_param(ctx
, gfx9_vtx_params
[i
], 16, 16);
669 for (unsigned i
= 0; i
< 6; i
++)
670 vtx_in
[i
] = ac_get_arg(&ctx
->ac
, gfx6_vtx_params
[i
]);
673 prim_id
= LLVMGetParam(func
, num_sgprs
+ 2);
674 rotate
= LLVMBuildTrunc(builder
, prim_id
, ctx
->ac
.i1
, "");
676 for (unsigned i
= 0; i
< 6; ++i
) {
677 LLVMValueRef base
, rotated
;
679 rotated
= vtx_in
[(i
+ 4) % 6];
680 vtx_out
[i
] = LLVMBuildSelect(builder
, rotate
, rotated
, base
, "");
683 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
684 for (unsigned i
= 0; i
< 3; i
++) {
685 LLVMValueRef hi
, out
;
687 hi
= LLVMBuildShl(builder
, vtx_out
[i
* 2 + 1], LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
688 out
= LLVMBuildOr(builder
, vtx_out
[i
* 2], hi
, "");
689 out
= ac_to_float(&ctx
->ac
, out
);
690 ret
= LLVMBuildInsertValue(builder
, ret
, out
, gfx9_vtx_params
[i
].arg_index
, "");
693 for (unsigned i
= 0; i
< 6; i
++) {
696 out
= ac_to_float(&ctx
->ac
, vtx_out
[i
]);
697 ret
= LLVMBuildInsertValue(builder
, ret
, out
, gfx6_vtx_params
[i
].arg_index
, "");
702 LLVMBuildRet(builder
, ret
);
705 void si_llvm_init_gs_callbacks(struct si_shader_context
*ctx
)
707 ctx
->abi
.load_inputs
= si_nir_load_input_gs
;
708 ctx
->abi
.emit_vertex
= si_llvm_emit_vertex
;
709 ctx
->abi
.emit_primitive
= si_llvm_emit_primitive
;
710 ctx
->abi
.emit_outputs
= si_llvm_emit_gs_epilogue
;