radeonsi/gfx10: fix the wave size for compute-based culling
[mesa.git] / src / gallium / drivers / radeonsi / si_shader_llvm_gs.c
/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"
#include "util/u_memory.h"

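/* On GFX9+, ES and GS run as one merged hardware stage and the SPI packs
 * per-wave information into the merged_wave_info SGPR. As unpacked throughout
 * this file: bits [0:8) = number of ES threads, [8:16) = number of GS threads,
 * [16:24) = GS wave ID for sendmsg, [24:28) = wave index in the threadgroup.
 */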
LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute ES work. */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
                        ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
}

LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute GS work. */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
                        ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
}

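/* Load one channel of a GS input (i.e. an output written by the ES stage) for
 * the given input vertex. On GFX9+ the ESGS ring lives in LDS; on GFX6-8 it is
 * a buffer in memory. A swizzle of ~0 means "load all four channels".
 */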
static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
                                          unsigned input_index,
                                          unsigned vtx_offset_param,
                                          LLVMTypeRef type,
                                          unsigned swizzle)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *shader = ctx->shader;
   LLVMValueRef vtx_offset, soffset;
   struct si_shader_info *info = &shader->selector->info;
   unsigned semantic_name = info->input_semantic_name[input_index];
   unsigned semantic_index = info->input_semantic_index[input_index];
   unsigned param;
   LLVMValueRef value;

   param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);

   /* GFX9 has the ESGS ring in LDS. */
   if (ctx->screen->info.chip_class >= GFX9) {
      unsigned index = vtx_offset_param;

      switch (index / 2) {
      case 0:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset,
                                      index % 2 ? 16 : 0, 16);
         break;
      case 1:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset,
                                      index % 2 ? 16 : 0, 16);
         break;
      case 2:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset,
                                      index % 2 ? 16 : 0, 16);
         break;
      default:
         assert(0);
         return NULL;
      }

      unsigned offset = param * 4 + swizzle;
      vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
                                LLVMConstInt(ctx->ac.i32, offset, false), "");

      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
      LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
      if (ac_get_type_size(type) == 8) {
         ptr = LLVMBuildGEP(ctx->ac.builder, ptr,
                            &ctx->ac.i32_1, 1, "");
         LLVMValueRef values[2] = {
            value,
            LLVMBuildLoad(ctx->ac.builder, ptr, "")
         };
         value = ac_build_gather_values(&ctx->ac, values, 2);
      }
      return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
   }

   /* GFX6: input load from the ESGS ring in memory. */
   if (swizzle == ~0) {
      LLVMValueRef values[4];
      unsigned chan;
      for (chan = 0; chan < 4; chan++) {
         values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
                                              type, chan);
      }
      return ac_build_gather_values(&ctx->ac, values, 4);
   }

   /* Get the vertex offset parameter on GFX6. */
   LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac,
                                           ctx->gs_vtx_offset[vtx_offset_param]);

   vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
                             LLVMConstInt(ctx->ac.i32, 4, 0), "");

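   /* In the swizzled ESGS ring, each attribute dword slot occupies 64
    * consecutive dwords (one per lane of a 64-wide GFX6-8 wave), so slots
    * are 64 * 4 = 256 bytes apart in the scalar offset below.
    */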
   soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);

   value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0,
                                vtx_offset, soffset, 0, ac_glc, true, false);
   if (ac_get_type_size(type) == 8) {
      LLVMValueRef value2;
      soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0);

      value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
                                    ctx->ac.i32_0, vtx_offset, soffset,
                                    0, ac_glc, true, false);
      return si_build_gather_64bit(ctx, type, value, value2);
   }
   return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
}

static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
                                         unsigned location,
                                         unsigned driver_location,
                                         unsigned component,
                                         unsigned num_components,
                                         unsigned vertex_index,
                                         unsigned const_index,
                                         LLVMTypeRef type)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   LLVMValueRef value[4];
   for (unsigned i = 0; i < num_components; i++) {
      unsigned offset = i;
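      /* 64-bit components occupy two consecutive 32-bit slots each. */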
      if (ac_get_type_size(type) == 8)
         offset *= 2;

      offset += component;
      value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index,
                                                   vertex_index, type, offset);
   }

   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}

/* Pass GS inputs from ES to GS on GFX9. */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
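   /* The merged ES+GS wrapper consumes these return values and feeds them to
    * the GS part. The first eight slots carry the per-wave system SGPRs
    * (note merged_wave_info at slot 3 and the scratch offset at slot 5);
    * user SGPRs start at slot 8, followed by the GS input VGPRs.
    */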
   LLVMValueRef ret = ctx->return_value;

   ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
   ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
   if (ctx->shader->key.as_ngg)
      ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
   else
      ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
   ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
   ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);

   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
                             8 + SI_SGPR_RW_BUFFERS);
   ret = si_insert_input_ptr(ctx, ret,
                             ctx->bindless_samplers_and_images,
                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
   if (ctx->screen->use_ngg) {
      ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
                                8 + SI_SGPR_VS_STATE_BITS);
   }

   unsigned vgpr;
   if (ctx->type == PIPE_SHADER_VERTEX)
      vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
   else
      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;

   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
   ctx->return_value = ret;
}

void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                              LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *es = ctx->shader;
   struct si_shader_info *info = &es->selector->info;
   LLVMValueRef lds_base = NULL;
   unsigned chan;
   int i;

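   /* On GFX9+ each ES thread stores its outputs contiguously in LDS: the
    * base is the thread's index within the threadgroup (wave index times
    * wave size, OR'd with the lane id) times the per-vertex ESGS item size
    * in dwords.
    */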
   if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
      unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
      LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
      LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
      vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
                               LLVMBuildMul(ctx->ac.builder, wave_idx,
                                            LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""), "");
      lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
                              LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
   }

   for (i = 0; i < info->num_outputs; i++) {
      int param;

      if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
          info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
         continue;

      param = si_shader_io_get_unique_index(info->output_semantic_name[i],
                                            info->output_semantic_index[i], false);

      for (chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         out_val = ac_to_integer(&ctx->ac, out_val);

         /* GFX9 has the ESGS ring in LDS. */
         if (ctx->screen->info.chip_class >= GFX9) {
            LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
            idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
            ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
            continue;
         }

         ac_build_buffer_store_dword(&ctx->ac,
                                     ctx->esgs_ring,
                                     out_val, 1, NULL,
                                     ac_get_arg(&ctx->ac, ctx->es2gs_offset),
                                     (4 * param + chan) * 4,
                                     ac_glc | ac_slc | ac_swizzled);
      }
   }

   if (ctx->screen->info.chip_class >= GFX9)
      si_set_es_return_value_for_gs(ctx);
}

static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class >= GFX9)
      return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
   else
      return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
}

static void emit_gs_epilogue(struct si_shader_context *ctx)
{
   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_epilogue(ctx);
      return;
   }

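   /* On GFX10, a release fence makes the outstanding ring stores visible
    * before GS_DONE is signaled.
    */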
   if (ctx->screen->info.chip_class >= GFX10)
      LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");

   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
                    si_get_gs_wave_id(ctx));

   if (ctx->screen->info.chip_class >= GFX9)
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
}

static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
                                     unsigned max_outputs,
                                     LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info UNUSED *info = &ctx->shader->selector->info;

   assert(info->num_outputs <= max_outputs);

   emit_gs_epilogue(ctx);
}

/* Emit one vertex from the geometry shader */
static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
                                unsigned stream,
                                LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
      return;
   }

   struct si_shader_info *info = &ctx->shader->selector->info;
   struct si_shader *shader = ctx->shader;
   LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
   LLVMValueRef gs_next_vertex;
   LLVMValueRef can_emit;
   unsigned chan, offset;
   int i;

   /* Write vertex attribute values to GSVS ring */
   gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
                                  ctx->gs_next_vertex[stream],
                                  "");

   /* If this thread has already emitted the declared maximum number of
    * vertices, skip the write: excessive vertex emissions are not
    * supposed to have any effect.
    *
    * If the shader has no writes to memory, kill it instead. This skips
    * further memory loads and may allow LLVM to skip to the end
    * altogether.
    */
   can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
                            LLVMConstInt(ctx->ac.i32,
                                         shader->selector->gs_max_out_vertices, 0), "");

   bool use_kill = !info->writes_memory;
   if (use_kill) {
      ac_build_kill_if_false(&ctx->ac, can_emit);
   } else {
      ac_build_ifcc(&ctx->ac, can_emit, 6505);
   }

   offset = 0;
   for (i = 0; i < info->num_outputs; i++) {
      for (chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)) ||
             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
            continue;

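         /* The GSVS ring is component-major: component slot N of all
          * gs_max_out_vertices vertices precedes slot N+1, so the dword
          * index is (slot * max_vertices + current_vertex). The swizzled
          * descriptor built in si_preload_gs_rings interleaves the lanes.
          */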
         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         LLVMValueRef voffset =
            LLVMConstInt(ctx->ac.i32, offset *
                         shader->selector->gs_max_out_vertices, 0);
         offset++;

         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
         voffset = LLVMBuildMul(ctx->ac.builder, voffset,
                                LLVMConstInt(ctx->ac.i32, 4, 0), "");

         out_val = ac_to_integer(&ctx->ac, out_val);

         ac_build_buffer_store_dword(&ctx->ac,
                                     ctx->gsvs_ring[stream],
                                     out_val, 1,
                                     voffset, soffset, 0,
                                     ac_glc | ac_slc | ac_swizzled);
      }
   }

   gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
   LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

   /* Signal vertex emission if vertex data was written. */
   if (offset) {
      ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                       si_get_gs_wave_id(ctx));
   }

   if (!use_kill)
      ac_build_endif(&ctx->ac, 6505);
}

/* Cut one primitive from the geometry shader */
static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
                                   unsigned stream)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

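   /* With NGG, primitives are emitted as their vertices complete, so a cut
    * only needs to reset the current-primitive vertex counter.
    */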
   if (ctx->shader->key.as_ngg) {
      LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
      return;
   }

   /* Signal primitive cut */
   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
                    si_get_gs_wave_id(ctx));
}

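/* Set up the ESGS ring: a buffer descriptor loaded from the RW buffer list on
 * GFX6-8, or the LDS-based ring on GFX9+.
 */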
void si_preload_esgs_ring(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class <= GFX8) {
      unsigned ring =
         ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
                                           : SI_ES_RING_ESGS;
      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);

      ctx->esgs_ring =
         ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
   } else {
      if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
         /* Declare the ESGS ring as an explicit LDS symbol. */
         si_llvm_declare_esgs_ring(ctx);
      } else {
         ac_declare_lds_as_pointer(&ctx->ac);
         ctx->esgs_ring = ctx->ac.lds;
      }
   }
}

void si_preload_gs_rings(struct si_shader_context *ctx)
{
   const struct si_shader_selector *sel = ctx->shader->selector;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
   LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

   /* The conceptual layout of the GSVS ring is
    *   v0c0 .. vLc0 v0c1 .. vLc1 ..
    * but the real memory layout is swizzled across
    * threads:
    *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
    *   t16v0c0 ..
    * Override the buffer descriptor accordingly.
    */
   LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
   uint64_t stream_offset = 0;

   for (unsigned stream = 0; stream < 4; ++stream) {
      unsigned num_components;
      unsigned stride;
      unsigned num_records;
      LLVMValueRef ring, tmp;

      num_components = sel->info.num_stream_output_components[stream];
      if (!num_components)
         continue;

      stride = 4 * num_components * sel->gs_max_out_vertices;

      /* Limit on the stride field for <= GFX7. */
      assert(stride < (1 << 14));

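      /* num_records is the wave size: with swizzling enabled, the descriptor
       * covers one record of 'stride' bytes per lane, and each stream's
       * sub-ring begins stride * wave_size bytes after the previous one
       * (see stream_offset below).
       */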
      num_records = ctx->ac.wave_size;

      ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
      tmp = LLVMBuildAdd(builder, tmp,
                         LLVMConstInt(ctx->ac.i64,
                                      stream_offset, 0), "");
      stream_offset += stride * ctx->ac.wave_size;

      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
      ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
      tmp = LLVMBuildOr(builder, tmp,
                        LLVMConstInt(ctx->ac.i32,
                                     S_008F04_STRIDE(stride) |
                                     S_008F04_SWIZZLE_ENABLE(1), 0), "");
      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
      ring = LLVMBuildInsertElement(builder, ring,
                                    LLVMConstInt(ctx->ac.i32, num_records, 0),
                                    LLVMConstInt(ctx->ac.i32, 2, 0), "");

      uint32_t rsrc3 =
         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
         S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
         S_008F0C_ADD_TID_ENABLE(1);

      if (ctx->ac.chip_class >= GFX10) {
         rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
                  S_008F0C_RESOURCE_LEVEL(1);
      } else {
         rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
                  S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
      }

      ring = LLVMBuildInsertElement(builder, ring,
                                    LLVMConstInt(ctx->ac.i32, rsrc3, false),
                                    LLVMConstInt(ctx->ac.i32, 3, 0), "");

      ctx->gsvs_ring[stream] = ring;
   }
}

/* Generate code for the hardware VS shader stage to go with a geometry shader */
struct si_shader *
si_generate_gs_copy_shader(struct si_screen *sscreen,
                           struct ac_llvm_compiler *compiler,
                           struct si_shader_selector *gs_selector,
                           struct pipe_debug_callback *debug)
{
   struct si_shader_context ctx;
   struct si_shader *shader;
   LLVMBuilderRef builder;
   struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
   struct si_shader_info *gsinfo = &gs_selector->info;
   int i;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;

   si_llvm_context_init(&ctx, sscreen, compiler,
                        si_get_wave_size(sscreen, PIPE_SHADER_VERTEX,
                                         false, false, false));
   ctx.shader = shader;
   ctx.type = PIPE_SHADER_VERTEX;

   builder = ctx.ac.builder;

   si_create_function(&ctx, false);

   LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
   ctx.gsvs_ring[0] = ac_build_load_to_sgpr(&ctx.ac, buf_ptr,
                                            LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));

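   /* Each hardware-VS thread copies one GS output vertex: vertex_id selects
    * the vertex, and the swizzled GSVS ring descriptor handles the per-lane
    * layout, so the vertex offset is just vertex_id * 4 bytes.
    */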
   LLVMValueRef voffset =
      LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
                   LLVMConstInt(ctx.ac.i32, 4, 0), "");

   /* Fetch the vertex stream ID. */
   LLVMValueRef stream_id;

   if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
      stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
   else
      stream_id = ctx.ac.i32_0;

   /* Fill in output information. */
   for (i = 0; i < gsinfo->num_outputs; ++i) {
      outputs[i].semantic_name = gsinfo->output_semantic_name[i];
      outputs[i].semantic_index = gsinfo->output_semantic_index[i];

      for (int chan = 0; chan < 4; chan++) {
         outputs[i].vertex_stream[chan] =
            (gsinfo->output_streams[i] >> (2 * chan)) & 3;
      }
   }

   LLVMBasicBlockRef end_bb;
   LLVMValueRef switch_inst;

   end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
   switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

   for (int stream = 0; stream < 4; stream++) {
      LLVMBasicBlockRef bb;
      unsigned offset;

      if (!gsinfo->num_stream_output_components[stream])
         continue;

      if (stream > 0 && !gs_selector->so.num_outputs)
         continue;

      bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
      LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
      LLVMPositionBuilderAtEnd(builder, bb);

      /* Fetch vertex data from the GSVS ring. */
      offset = 0;
      for (i = 0; i < gsinfo->num_outputs; ++i) {
         for (unsigned chan = 0; chan < 4; chan++) {
            if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
                outputs[i].vertex_stream[chan] != stream) {
               outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
               continue;
            }

            LLVMValueRef soffset = LLVMConstInt(ctx.ac.i32,
               offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
            offset++;

            outputs[i].values[chan] =
               ac_build_buffer_load(&ctx.ac,
                                    ctx.gsvs_ring[0], 1,
                                    ctx.ac.i32_0, voffset,
                                    soffset, 0, ac_glc | ac_slc,
                                    true, false);
         }
      }

      /* Streamout and exports. */
      if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
         si_llvm_emit_streamout(&ctx, outputs,
                                gsinfo->num_outputs,
                                stream);
      }

      if (stream == 0)
         si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);

      LLVMBuildBr(builder, end_bb);
   }

   LLVMPositionBuilderAtEnd(builder, end_bb);

   LLVMBuildRetVoid(ctx.ac.builder);

   ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
   si_llvm_optimize_module(&ctx);

   bool ok = false;
   if (si_compile_llvm(sscreen, &ctx.shader->binary,
                       &ctx.shader->config, ctx.compiler, &ctx.ac,
                       debug, PIPE_SHADER_GEOMETRY,
                       "GS Copy Shader", false)) {
      if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
         fprintf(stderr, "GS Copy Shader:\n");
      si_shader_dump(sscreen, ctx.shader, debug, stderr, true);

      if (!ctx.shader->config.scratch_bytes_per_wave)
         ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
      else
         ok = true;
   }

   si_llvm_dispose(&ctx);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}

/**
 * Build the GS prolog function. Rotate the input vertices for triangle strips
 * with adjacency.
 */
void si_llvm_build_gs_prolog(struct si_shader_context *ctx,
                             union si_shader_part_key *key)
{
   unsigned num_sgprs, num_vgprs;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMTypeRef returns[AC_MAX_ARGS];
   LLVMValueRef func, ret;

   memset(&ctx->args, 0, sizeof(ctx->args));

   if (ctx->screen->info.chip_class >= GFX9) {
      if (key->gs_prolog.states.gfx9_prev_is_vs)
         num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
      else
         num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
      num_vgprs = 5; /* ES inputs are not needed by GS */
   } else {
      num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
      num_vgprs = 8;
   }

   for (unsigned i = 0; i < num_sgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      returns[i] = ctx->ac.i32;
   }

   for (unsigned i = 0; i < num_vgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
      returns[num_sgprs + i] = ctx->ac.f32;
   }

   /* Create the function. */
   si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
   func = ctx->main_fn;

   /* Set the full EXEC mask for the prolog, because we are only fiddling
    * with registers here. The main shader part will set the correct EXEC
    * mask.
    */
   if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
      ac_init_exec_full_mask(&ctx->ac);

   /* Copy inputs to outputs. This should be a no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (unsigned i = 0; i < num_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(builder, ret, p, i, "");
   }
   for (unsigned i = 0; i < num_vgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
   }

   if (key->gs_prolog.states.tri_strip_adj_fix) {
      /* Remap the input vertices for every other primitive. */
      const struct ac_arg gfx6_vtx_params[6] = {
         { .used = true, .arg_index = num_sgprs },
         { .used = true, .arg_index = num_sgprs + 1 },
         { .used = true, .arg_index = num_sgprs + 3 },
         { .used = true, .arg_index = num_sgprs + 4 },
         { .used = true, .arg_index = num_sgprs + 5 },
         { .used = true, .arg_index = num_sgprs + 6 },
      };
      const struct ac_arg gfx9_vtx_params[3] = {
         { .used = true, .arg_index = num_sgprs },
         { .used = true, .arg_index = num_sgprs + 1 },
         { .used = true, .arg_index = num_sgprs + 4 },
      };
      LLVMValueRef vtx_in[6], vtx_out[6];
      LLVMValueRef prim_id, rotate;

      if (ctx->screen->info.chip_class >= GFX9) {
         for (unsigned i = 0; i < 3; i++) {
            vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
            vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
         }
      } else {
         for (unsigned i = 0; i < 6; i++)
            vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
      }

      prim_id = LLVMGetParam(func, num_sgprs + 2);
      rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");

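      /* The low bit of the primitive ID selects the rotated order: odd
       * primitives take their vertices rotated by 4 (equivalently back
       * by 2), which restores the vertex order the GS expects for every
       * other primitive in a strip with adjacency.
       */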
      for (unsigned i = 0; i < 6; ++i) {
         LLVMValueRef base, rotated;
         base = vtx_in[i];
         rotated = vtx_in[(i + 4) % 6];
         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         for (unsigned i = 0; i < 3; i++) {
            LLVMValueRef hi, out;

            hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1],
                              LLVMConstInt(ctx->ac.i32, 16, 0), "");
            out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
            out = ac_to_float(&ctx->ac, out);
            ret = LLVMBuildInsertValue(builder, ret, out,
                                       gfx9_vtx_params[i].arg_index, "");
         }
      } else {
         for (unsigned i = 0; i < 6; i++) {
            LLVMValueRef out;

            out = ac_to_float(&ctx->ac, vtx_out[i]);
            ret = LLVMBuildInsertValue(builder, ret, out,
                                       gfx6_vtx_params[i].arg_index, "");
         }
      }
   }

   LLVMBuildRet(builder, ret);
}

void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
{
   ctx->abi.load_inputs = si_nir_load_input_gs;
   ctx->abi.emit_vertex = si_llvm_emit_vertex;
   ctx->abi.emit_primitive = si_llvm_emit_primitive;
   ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
}