src/amd/vulkan/radv_nir_to_llvm.c

   1 /*
   2  * Copyright © 2016 Red Hat.
   3  * Copyright © 2016 Bas Nieuwenhuizen
   4  *
   5  * based in part on anv driver which is:
   6  * Copyright © 2015 Intel Corporation
   7  *
   8  * Permission is hereby granted, free of charge, to any person obtaining a
   9  * copy of this software and associated documentation files (the "Software"),
  10  * to deal in the Software without restriction, including without limitation
  11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  12  * and/or sell copies of the Software, and to permit persons to whom the
  13  * Software is furnished to do so, subject to the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the next
  16  * paragraph) shall be included in all copies or substantial portions of the
  17  * Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  25  * IN THE SOFTWARE.
  26  */
  27
  28 #include "radv_private.h"
  29 #include "radv_shader.h"
  30 #include "radv_shader_helper.h"
  31 #include "nir/nir.h"
  32
  33 #include <llvm-c/Core.h>
  34 #include <llvm-c/TargetMachine.h>
  35 #include <llvm-c/Transforms/Scalar.h>
  36 #include <llvm-c/Transforms/Utils.h>
  37
  38 #include "sid.h"
  39 #include "ac_binary.h"
  40 #include "ac_llvm_util.h"
  41 #include "ac_llvm_build.h"
  42 #include "ac_shader_abi.h"
  43 #include "ac_shader_util.h"
  44 #include "ac_exp_param.h"
  45
/* Number of input slots tracked per shader: every varying slot up to and
 * including VARYING_SLOT_VAR31.
 */
#define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1)

/* Per-shader state shared by the helpers in this file while a shader is
 * translated to LLVM IR.
 */
struct radv_shader_context {
        struct ac_llvm_context ac;
        const struct radv_nir_compiler_options *options;
        struct radv_shader_variant_info *shader_info;
        /* radv_shader_context_from_abi() maps a pointer to this member back
         * to the enclosing context.
         */
        struct ac_shader_abi abi;

        unsigned max_workgroup_size;
        LLVMContextRef context;
        LLVMValueRef main_function;

        LLVMValueRef descriptor_sets[RADV_UD_MAX_SETS];
        LLVMValueRef ring_offsets;

        LLVMValueRef vertex_buffers;
        LLVMValueRef rel_auto_id;
        LLVMValueRef vs_prim_id;
        LLVMValueRef es2gs_offset;

        LLVMValueRef oc_lds;
        LLVMValueRef merged_wave_info;
        LLVMValueRef tess_factor_offset;
        /* Returned by get_rel_patch_id() for MESA_SHADER_TESS_EVAL. */
        LLVMValueRef tes_rel_patch_id;
        LLVMValueRef tes_u;
        LLVMValueRef tes_v;

        /* HW GS */
        /* On gfx10:
         *  - bits 0..10: ordered_wave_id
         *  - bits 12..20: number of vertices in group
         *  - bits 22..30: number of primitives in group
         */
        LLVMValueRef gs_tg_info;
        LLVMValueRef gs2vs_offset;
        LLVMValueRef gs_wave_id;
        LLVMValueRef gs_vtx_offset[6];

        LLVMValueRef esgs_ring;
        LLVMValueRef gsvs_ring[4];
        LLVMValueRef hs_ring_tess_offchip;
        LLVMValueRef hs_ring_tess_factor;

        LLVMValueRef persp_sample, persp_center, persp_centroid;
        LLVMValueRef linear_sample, linear_center, linear_centroid;

        /* Streamout */
        /* allocate_user_sgprs() reserves one user SGPR when this is set. */
        LLVMValueRef streamout_buffers;
        LLVMValueRef streamout_write_idx;
        LLVMValueRef streamout_config;
        LLVMValueRef streamout_offset[4];

        /* Shader stage being compiled; get_rel_patch_id() dispatches on it. */
        gl_shader_stage stage;

        /* NOTE(review): sized as 4 entries per input slot, presumably one
         * per channel - confirm at the use sites.
         */
        LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];
        uint64_t float16_shaded_mask;

        uint64_t input_mask;
        uint64_t output_mask;

        /* When set, allocate_user_sgprs() always requests ring offsets and
         * skips the VS user SGPRs for MESA_SHADER_VERTEX.
         */
        bool is_gs_copy_shader;
        LLVMValueRef gs_next_vertex[4];
        LLVMValueRef gs_curprim_verts[4];
        LLVMValueRef gs_generated_prims[4];
        LLVMValueRef gs_ngg_emit;
        LLVMValueRef gs_ngg_scratch;
        unsigned gs_max_out_vertices;
        unsigned gs_output_prim;

        unsigned tes_primitive_mode;

        uint32_t tcs_patch_outputs_read;
        uint64_t tcs_outputs_read;
        /* Number of TCS output control points per patch (see
         * get_tcs_num_patches()).
         */
        uint32_t tcs_vertices_per_patch;
        /* Number of TCS input slots; the LDS helpers use 16 bytes per slot. */
        uint32_t tcs_num_inputs;
        /* Patch count used by the LDS size/offset helpers. */
        uint32_t tcs_num_patches;
        uint32_t max_gsvs_emit_size;
        uint32_t gsvs_vertex_size;

        LLVMValueRef vertexptr; /* GFX10 only */
};
 127
/* Describes one shader output: four value slots plus the output's slot
 * name, slot index and usage mask.
 *
 * NOTE(review): usage_mask presumably carries one bit per values[] entry -
 * confirm at the use sites.
 */
struct radv_shader_output_values {
        LLVMValueRef values[4];
        unsigned slot_name;
        unsigned slot_index;
        unsigned usage_mask;
};
 134
/* Calling-convention IDs handed to LLVMSetFunctionCallConv();
 * create_llvm_function() uses RADEON_LLVM_AMDGPU_CS.
 *
 * NOTE(review): the numeric values presumably mirror LLVM's AMDGPU
 * calling-convention IDs - confirm against the LLVM headers.
 */
enum radeon_llvm_calling_convention {
        RADEON_LLVM_AMDGPU_VS = 87,
        RADEON_LLVM_AMDGPU_GS = 88,
        RADEON_LLVM_AMDGPU_PS = 89,
        RADEON_LLVM_AMDGPU_CS = 90,
        RADEON_LLVM_AMDGPU_HS = 93,
};
 142
 143 static inline struct radv_shader_context *
 144 radv_shader_context_from_abi(struct ac_shader_abi *abi)
 145 {
 146         struct radv_shader_context *ctx = NULL;
 147         return container_of(abi, ctx, abi);
 148 }
 149
/* Book-keeping for one conditional opened by ac_nir_build_if() and closed by
 * ac_nir_build_endif().
 */
struct ac_build_if_state
{
        struct radv_shader_context *ctx;
        /* Value the conditional branch emitted at endif tests. */
        LLVMValueRef condition;
        /* Block the builder was in when the conditional was opened; the
         * conditional branch is appended to it by ac_nir_build_endif().
         */
        LLVMBasicBlockRef entry_block;
        LLVMBasicBlockRef true_block;
        /* Left NULL by ac_nir_build_if(); when NULL the false edge of the
         * branch goes straight to merge_block.
         */
        LLVMBasicBlockRef false_block;
        /* Block where code generation resumes after the conditional. */
        LLVMBasicBlockRef merge_block;
};
 159
 160 static LLVMBasicBlockRef
 161 ac_build_insert_new_block(struct radv_shader_context *ctx, const char *name)
 162 {
 163         LLVMBasicBlockRef current_block;
 164         LLVMBasicBlockRef next_block;
 165         LLVMBasicBlockRef new_block;
 166
 167         /* get current basic block */
 168         current_block = LLVMGetInsertBlock(ctx->ac.builder);
 169
 170         /* chqeck if there's another block after this one */
 171         next_block = LLVMGetNextBasicBlock(current_block);
 172         if (next_block) {
 173                 /* insert the new block before the next block */
 174                 new_block = LLVMInsertBasicBlockInContext(ctx->context, next_block, name);
 175         }
 176         else {
 177                 /* append new block after current block */
 178                 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
 179                 new_block = LLVMAppendBasicBlockInContext(ctx->context, function, name);
 180         }
 181         return new_block;
 182 }
 183
 184 static void
 185 ac_nir_build_if(struct ac_build_if_state *ifthen,
 186                 struct radv_shader_context *ctx,
 187                 LLVMValueRef condition)
 188 {
 189         LLVMBasicBlockRef block = LLVMGetInsertBlock(ctx->ac.builder);
 190
 191         memset(ifthen, 0, sizeof *ifthen);
 192         ifthen->ctx = ctx;
 193         ifthen->condition = condition;
 194         ifthen->entry_block = block;
 195
 196         /* create endif/merge basic block for the phi functions */
 197         ifthen->merge_block = ac_build_insert_new_block(ctx, "endif-block");
 198
 199         /* create/insert true_block before merge_block */
 200         ifthen->true_block =
 201                 LLVMInsertBasicBlockInContext(ctx->context,
 202                                               ifthen->merge_block,
 203                                               "if-true-block");
 204
 205         /* successive code goes into the true block */
 206         LLVMPositionBuilderAtEnd(ctx->ac.builder, ifthen->true_block);
 207 }
 208
 209 /**
 210  * End a conditional.
 211  */
 212 static void
 213 ac_nir_build_endif(struct ac_build_if_state *ifthen)
 214 {
 215         LLVMBuilderRef builder = ifthen->ctx->ac.builder;
 216
 217         /* Insert branch to the merge block from current block */
 218         LLVMBuildBr(builder, ifthen->merge_block);
 219
 220         /*
 221          * Now patch in the various branch instructions.
 222          */
 223
 224         /* Insert the conditional branch instruction at the end of entry_block */
 225         LLVMPositionBuilderAtEnd(builder, ifthen->entry_block);
 226         if (ifthen->false_block) {
 227                 /* we have an else clause */
 228                 LLVMBuildCondBr(builder, ifthen->condition,
 229                                 ifthen->true_block, ifthen->false_block);
 230         }
 231         else {
 232                 /* no else clause */
 233                 LLVMBuildCondBr(builder, ifthen->condition,
 234                                 ifthen->true_block, ifthen->merge_block);
 235         }
 236
 237         /* Resume building code at end of the ifthen->merge_block */
 238         LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
 239 }
 240
 241
 242 static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx)
 243 {
 244         switch (ctx->stage) {
 245         case MESA_SHADER_TESS_CTRL:
 246                 return ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8);
 247         case MESA_SHADER_TESS_EVAL:
 248                 return ctx->tes_rel_patch_id;
 249                 break;
 250         default:
 251                 unreachable("Illegal stage");
 252         }
 253 }
 254
 255 static unsigned
 256 get_tcs_num_patches(struct radv_shader_context *ctx)
 257 {
 258         unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices;
 259         unsigned num_tcs_output_cp = ctx->tcs_vertices_per_patch;
 260         uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
 261         uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size;
 262         uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
 263         uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written);
 264         uint32_t output_vertex_size = num_tcs_outputs * 16;
 265         uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size;
 266         uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
 267         unsigned num_patches;
 268         unsigned hardware_lds_size;
 269
 270         /* Ensure that we only need one wave per SIMD so we don't need to check
 271          * resource usage. Also ensures that the number of tcs in and out
 272          * vertices per threadgroup are at most 256.
 273          */
 274         num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4;
 275         /* Make sure that the data fits in LDS. This assumes the shaders only
 276          * use LDS for the inputs and outputs.
 277          */
 278         hardware_lds_size = 32768;
 279
 280         /* Looks like STONEY hangs if we use more than 32 KiB LDS in a single
 281          * threadgroup, even though there is more than 32 KiB LDS.
 282          *
 283          * Test: dEQP-VK.tessellation.shader_input_output.barrier
 284          */
 285         if (ctx->options->chip_class >= GFX7 && ctx->options->family != CHIP_STONEY)
 286                 hardware_lds_size = 65536;
 287
 288         num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
 289         /* Make sure the output data fits in the offchip buffer */
 290         num_patches = MIN2(num_patches, (ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size);
 291         /* Not necessary for correctness, but improves performance. The
 292          * specific value is taken from the proprietary driver.
 293          */
 294         num_patches = MIN2(num_patches, 40);
 295
 296         /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */
 297         if (ctx->options->chip_class == GFX6) {
 298                 unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
 299                 num_patches = MIN2(num_patches, one_wave);
 300         }
 301         return num_patches;
 302 }
 303
 304 static unsigned
 305 calculate_tess_lds_size(struct radv_shader_context *ctx)
 306 {
 307         unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices;
 308         unsigned num_tcs_output_cp;
 309         unsigned num_tcs_outputs, num_tcs_patch_outputs;
 310         unsigned input_vertex_size, output_vertex_size;
 311         unsigned input_patch_size, output_patch_size;
 312         unsigned pervertex_output_patch_size;
 313         unsigned output_patch0_offset;
 314         unsigned num_patches;
 315         unsigned lds_size;
 316
 317         num_tcs_output_cp = ctx->tcs_vertices_per_patch;
 318         num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
 319         num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written);
 320
 321         input_vertex_size = ctx->tcs_num_inputs * 16;
 322         output_vertex_size = num_tcs_outputs * 16;
 323
 324         input_patch_size = num_tcs_input_cp * input_vertex_size;
 325
 326         pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
 327         output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
 328
 329         num_patches = ctx->tcs_num_patches;
 330         output_patch0_offset = input_patch_size * num_patches;
 331
 332         lds_size = output_patch0_offset + output_patch_size * num_patches;
 333         return lds_size;
 334 }
 335
 336 /* Tessellation shaders pass outputs to the next shader using LDS.
 337  *
 338  * LS outputs = TCS inputs
 339  * TCS outputs = TES inputs
 340  *
 341  * The LDS layout is:
 342  * - TCS inputs for patch 0
 343  * - TCS inputs for patch 1
 344  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 345  * - ...
 346  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 347  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 348  * - TCS outputs for patch 1
 349  * - Per-patch TCS outputs for patch 1
 350  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 351  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 352  * - ...
 353  *
 354  * All three shaders VS(LS), TCS, TES share the same LDS space.
 355  */
 356 static LLVMValueRef
 357 get_tcs_in_patch_stride(struct radv_shader_context *ctx)
 358 {
 359         assert (ctx->stage == MESA_SHADER_TESS_CTRL);
 360         uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
 361         uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size;
 362
 363         input_patch_size /= 4;
 364         return LLVMConstInt(ctx->ac.i32, input_patch_size, false);
 365 }
 366
 367 static LLVMValueRef
 368 get_tcs_out_patch_stride(struct radv_shader_context *ctx)
 369 {
 370         uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
 371         uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written);
 372         uint32_t output_vertex_size = num_tcs_outputs * 16;
 373         uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size;
 374         uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
 375         output_patch_size /= 4;
 376         return LLVMConstInt(ctx->ac.i32, output_patch_size, false);
 377 }
 378
 379 static LLVMValueRef
 380 get_tcs_out_vertex_stride(struct radv_shader_context *ctx)
 381 {
 382         uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
 383         uint32_t output_vertex_size = num_tcs_outputs * 16;
 384         output_vertex_size /= 4;
 385         return LLVMConstInt(ctx->ac.i32, output_vertex_size, false);
 386 }
 387
 388 static LLVMValueRef
 389 get_tcs_out_patch0_offset(struct radv_shader_context *ctx)
 390 {
 391         assert (ctx->stage == MESA_SHADER_TESS_CTRL);
 392         uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
 393         uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size;
 394         uint32_t output_patch0_offset = input_patch_size;
 395         unsigned num_patches = ctx->tcs_num_patches;
 396
 397         output_patch0_offset *= num_patches;
 398         output_patch0_offset /= 4;
 399         return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
 400 }
 401
 402 static LLVMValueRef
 403 get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx)
 404 {
 405         assert (ctx->stage == MESA_SHADER_TESS_CTRL);
 406         uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
 407         uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size;
 408         uint32_t output_patch0_offset = input_patch_size;
 409
 410         uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
 411         uint32_t output_vertex_size = num_tcs_outputs * 16;
 412         uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size;
 413         unsigned num_patches = ctx->tcs_num_patches;
 414
 415         output_patch0_offset *= num_patches;
 416         output_patch0_offset += pervertex_output_patch_size;
 417         output_patch0_offset /= 4;
 418         return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
 419 }
 420
 421 static LLVMValueRef
 422 get_tcs_in_current_patch_offset(struct radv_shader_context *ctx)
 423 {
 424         LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
 425         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 426
 427         return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
 428 }
 429
 430 static LLVMValueRef
 431 get_tcs_out_current_patch_offset(struct radv_shader_context *ctx)
 432 {
 433         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
 434         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 435         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 436
 437         return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id,
 438                              patch0_offset);
 439 }
 440
 441 static LLVMValueRef
 442 get_tcs_out_current_patch_data_offset(struct radv_shader_context *ctx)
 443 {
 444         LLVMValueRef patch0_patch_data_offset =
 445                 get_tcs_out_patch0_patch_data_offset(ctx);
 446         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 447         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 448
 449         return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id,
 450                              patch0_patch_data_offset);
 451 }
 452
/* Capacity of the per-function argument tables below. */
#define MAX_ARGS 64
/* Argument list of the LLVM function being built; filled by add_arg(). */
struct arg_info {
        /* LLVM type of each argument, in declaration order. */
        LLVMTypeRef types[MAX_ARGS];
        /* Where assign_arguments() stores each parameter value; NULL entries
         * are skipped.
         */
        LLVMValueRef *assign[MAX_ARGS];
        /* Number of arguments added so far. */
        uint8_t count;
        /* Number of arguments added with ARG_SGPR; create_llvm_function()
         * treats the first sgpr_count parameters as SGPR arguments.
         */
        uint8_t sgpr_count;
        /* Sum of ac_get_type_size(type) / 4 over SGPR resp. VGPR arguments. */
        uint8_t num_sgprs_used;
        uint8_t num_vgprs_used;
};
 462
/* Register file selected for an argument passed to add_arg(). */
enum ac_arg_regfile {
        ARG_SGPR,
        ARG_VGPR,
};
 467
 468 static void
 469 add_arg(struct arg_info *info, enum ac_arg_regfile regfile, LLVMTypeRef type,
 470         LLVMValueRef *param_ptr)
 471 {
 472         assert(info->count < MAX_ARGS);
 473
 474         info->assign[info->count] = param_ptr;
 475         info->types[info->count] = type;
 476         info->count++;
 477
 478         if (regfile == ARG_SGPR) {
 479                 info->num_sgprs_used += ac_get_type_size(type) / 4;
 480                 info->sgpr_count++;
 481         } else {
 482                 assert(regfile == ARG_VGPR);
 483                 info->num_vgprs_used += ac_get_type_size(type) / 4;
 484         }
 485 }
 486
 487 static void assign_arguments(LLVMValueRef main_function,
 488                              struct arg_info *info)
 489 {
 490         unsigned i;
 491         for (i = 0; i < info->count; i++) {
 492                 if (info->assign[i])
 493                         *info->assign[i] = LLVMGetParam(main_function, i);
 494         }
 495 }
 496
 497 static LLVMValueRef
 498 create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module,
 499                      LLVMBuilderRef builder, LLVMTypeRef *return_types,
 500                      unsigned num_return_elems,
 501                      struct arg_info *args,
 502                      unsigned max_workgroup_size,
 503                      const struct radv_nir_compiler_options *options)
 504 {
 505         LLVMTypeRef main_function_type, ret_type;
 506         LLVMBasicBlockRef main_function_body;
 507
 508         if (num_return_elems)
 509                 ret_type = LLVMStructTypeInContext(ctx, return_types,
 510                                                    num_return_elems, true);
 511         else
 512                 ret_type = LLVMVoidTypeInContext(ctx);
 513
 514         /* Setup the function */
 515         main_function_type =
 516             LLVMFunctionType(ret_type, args->types, args->count, 0);
 517         LLVMValueRef main_function =
 518             LLVMAddFunction(module, "main", main_function_type);
 519         main_function_body =
 520             LLVMAppendBasicBlockInContext(ctx, main_function, "main_body");
 521         LLVMPositionBuilderAtEnd(builder, main_function_body);
 522
 523         LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS);
 524         for (unsigned i = 0; i < args->sgpr_count; ++i) {
 525                 LLVMValueRef P = LLVMGetParam(main_function, i);
 526
 527                 ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_INREG);
 528
 529                 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
 530                         ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
 531                         ac_add_attr_dereferenceable(P, UINT64_MAX);
 532                 }
 533         }
 534
 535         if (options->address32_hi) {
 536                 ac_llvm_add_target_dep_function_attr(main_function,
 537                                                      "amdgpu-32bit-address-high-bits",
 538                                                      options->address32_hi);
 539         }
 540
 541         ac_llvm_set_workgroup_size(main_function, max_workgroup_size);
 542
 543         if (options->unsafe_math) {
 544                 /* These were copied from some LLVM test. */
 545                 LLVMAddTargetDependentFunctionAttr(main_function,
 546                                                    "less-precise-fpmad",
 547                                                    "true");
 548                 LLVMAddTargetDependentFunctionAttr(main_function,
 549                                                    "no-infs-fp-math",
 550                                                    "true");
 551                 LLVMAddTargetDependentFunctionAttr(main_function,
 552                                                    "no-nans-fp-math",
 553                                                    "true");
 554                 LLVMAddTargetDependentFunctionAttr(main_function,
 555                                                    "unsafe-fp-math",
 556                                                    "true");
 557                 LLVMAddTargetDependentFunctionAttr(main_function,
 558                                            "no-signed-zeros-fp-math",
 559                                            "true");
 560         }
 561         return main_function;
 562 }
 563
 564
 565 static void
 566 set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx,
 567         uint8_t num_sgprs)
 568 {
 569         ud_info->sgpr_idx = *sgpr_idx;
 570         ud_info->num_sgprs = num_sgprs;
 571         *sgpr_idx += num_sgprs;
 572 }
 573
 574 static void
 575 set_loc_shader(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx,
 576                uint8_t num_sgprs)
 577 {
 578         struct radv_userdata_info *ud_info =
 579                 &ctx->shader_info->user_sgprs_locs.shader_data[idx];
 580         assert(ud_info);
 581
 582         set_loc(ud_info, sgpr_idx, num_sgprs);
 583 }
 584
 585 static void
 586 set_loc_shader_ptr(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx)
 587 {
 588         bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS;
 589
 590         set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 1 : 2);
 591 }
 592
 593 static void
 594 set_loc_desc(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx)
 595 {
 596         struct radv_userdata_locations *locs =
 597                 &ctx->shader_info->user_sgprs_locs;
 598         struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx];
 599         assert(ud_info);
 600
 601         set_loc(ud_info, sgpr_idx, 1);
 602
 603         locs->descriptor_sets_enabled |= 1 << idx;
 604 }
 605
/* Result of allocate_user_sgprs(). */
struct user_sgpr_info {
        /* Set for VS/TCS/TES/GS, GS copy shaders and fragment shaders that
         * need sample positions.
         */
        bool need_ring_offsets;
        /* Set when there are fewer user SGPRs left than used descriptor
         * sets; declare_global_input_sgprs() then adds a single argument
         * instead of one per set.
         */
        bool indirect_all_descriptor_sets;
        /* User SGPRs left after the descriptor sets were accounted for;
         * allocate_inline_push_consts() uses it as the inlining budget.
         */
        uint8_t remaining_sgprs;
};
 611
 612 static bool needs_view_index_sgpr(struct radv_shader_context *ctx,
 613                                   gl_shader_stage stage)
 614 {
 615         switch (stage) {
 616         case MESA_SHADER_VERTEX:
 617                 if (ctx->shader_info->info.needs_multiview_view_index ||
 618                     (!ctx->options->key.vs_common_out.as_es && !ctx->options->key.vs_common_out.as_ls && ctx->options->key.has_multiview_view_index))
 619                         return true;
 620                 break;
 621         case MESA_SHADER_TESS_EVAL:
 622                 if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.vs_common_out.as_es && ctx->options->key.has_multiview_view_index))
 623                         return true;
 624                 break;
 625         case MESA_SHADER_GEOMETRY:
 626         case MESA_SHADER_TESS_CTRL:
 627                 if (ctx->shader_info->info.needs_multiview_view_index)
 628                         return true;
 629                 break;
 630         default:
 631                 break;
 632         }
 633         return false;
 634 }
 635
 636 static uint8_t
 637 count_vs_user_sgprs(struct radv_shader_context *ctx)
 638 {
 639         uint8_t count = 0;
 640
 641         if (ctx->shader_info->info.vs.has_vertex_buffers)
 642                 count++;
 643         count += ctx->shader_info->info.vs.needs_draw_id ? 3 : 2;
 644
 645         return count;
 646 }
 647
 648 static void allocate_inline_push_consts(struct radv_shader_context *ctx,
 649                                         struct user_sgpr_info *user_sgpr_info)
 650 {
 651         uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs;
 652
 653         /* Only supported if shaders use push constants. */
 654         if (ctx->shader_info->info.min_push_constant_used == UINT8_MAX)
 655                 return;
 656
 657         /* Only supported if shaders don't have indirect push constants. */
 658         if (ctx->shader_info->info.has_indirect_push_constants)
 659                 return;
 660
 661         /* Only supported for 32-bit push constants. */
 662         if (!ctx->shader_info->info.has_only_32bit_push_constants)
 663                 return;
 664
 665         uint8_t num_push_consts =
 666                 (ctx->shader_info->info.max_push_constant_used -
 667                  ctx->shader_info->info.min_push_constant_used) / 4;
 668
 669         /* Check if the number of user SGPRs is large enough. */
 670         if (num_push_consts < remaining_sgprs) {
 671                 ctx->shader_info->info.num_inline_push_consts = num_push_consts;
 672         } else {
 673                 ctx->shader_info->info.num_inline_push_consts = remaining_sgprs;
 674         }
 675
 676         /* Clamp to the maximum number of allowed inlined push constants. */
 677         if (ctx->shader_info->info.num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS)
 678                 ctx->shader_info->info.num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS;
 679
 680         if (ctx->shader_info->info.num_inline_push_consts == num_push_consts &&
 681             !ctx->shader_info->info.loads_dynamic_offsets) {
 682                 /* Disable the default push constants path if all constants are
 683                  * inlined and if shaders don't use dynamic descriptors.
 684                  */
 685                 ctx->shader_info->info.loads_push_constants = false;
 686         }
 687
 688         ctx->shader_info->info.base_inline_push_consts =
 689                 ctx->shader_info->info.min_push_constant_used / 4;
 690 }
 691
 692 static void allocate_user_sgprs(struct radv_shader_context *ctx,
 693                                 gl_shader_stage stage,
 694                                 bool has_previous_stage,
 695                                 gl_shader_stage previous_stage,
 696                                 bool needs_view_index,
 697                                 struct user_sgpr_info *user_sgpr_info)
 698 {
 699         uint8_t user_sgpr_count = 0;
 700
 701         memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info));
 702
 703         /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
 704         if (stage == MESA_SHADER_GEOMETRY ||
 705             stage == MESA_SHADER_VERTEX ||
 706             stage == MESA_SHADER_TESS_CTRL ||
 707             stage == MESA_SHADER_TESS_EVAL ||
 708             ctx->is_gs_copy_shader)
 709                 user_sgpr_info->need_ring_offsets = true;
 710
 711         if (stage == MESA_SHADER_FRAGMENT &&
 712             ctx->shader_info->info.ps.needs_sample_positions)
 713                 user_sgpr_info->need_ring_offsets = true;
 714
 715         /* 2 user sgprs will nearly always be allocated for scratch/rings */
 716         if (ctx->options->supports_spill || user_sgpr_info->need_ring_offsets) {
 717                 user_sgpr_count += 2;
 718         }
 719
 720         switch (stage) {
 721         case MESA_SHADER_COMPUTE:
 722                 if (ctx->shader_info->info.cs.uses_grid_size)
 723                         user_sgpr_count += 3;
 724                 break;
 725         case MESA_SHADER_FRAGMENT:
 726                 user_sgpr_count += ctx->shader_info->info.ps.needs_sample_positions;
 727                 break;
 728         case MESA_SHADER_VERTEX:
 729                 if (!ctx->is_gs_copy_shader)
 730                         user_sgpr_count += count_vs_user_sgprs(ctx);
 731                 break;
 732         case MESA_SHADER_TESS_CTRL:
 733                 if (has_previous_stage) {
 734                         if (previous_stage == MESA_SHADER_VERTEX)
 735                                 user_sgpr_count += count_vs_user_sgprs(ctx);
 736                 }
 737                 break;
 738         case MESA_SHADER_TESS_EVAL:
 739                 break;
 740         case MESA_SHADER_GEOMETRY:
 741                 if (has_previous_stage) {
 742                         if (previous_stage == MESA_SHADER_VERTEX) {
 743                                 user_sgpr_count += count_vs_user_sgprs(ctx);
 744                         }
 745                 }
 746                 break;
 747         default:
 748                 break;
 749         }
 750
 751         if (needs_view_index)
 752                 user_sgpr_count++;
 753
 754         if (ctx->shader_info->info.loads_push_constants)
 755                 user_sgpr_count++;
 756
 757         if (ctx->streamout_buffers)
 758                 user_sgpr_count++;
 759
 760         uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16;
 761         uint32_t remaining_sgprs = available_sgprs - user_sgpr_count;
 762         uint32_t num_desc_set =
 763                 util_bitcount(ctx->shader_info->info.desc_set_used_mask);
 764
 765         if (remaining_sgprs < num_desc_set) {
 766                 user_sgpr_info->indirect_all_descriptor_sets = true;
 767                 user_sgpr_info->remaining_sgprs = remaining_sgprs - 1;
 768         } else {
 769                 user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set;
 770         }
 771
 772         allocate_inline_push_consts(ctx, user_sgpr_info);
 773 }
 774
 775 static void
 776 declare_global_input_sgprs(struct radv_shader_context *ctx,
 777                            const struct user_sgpr_info *user_sgpr_info,
 778                            struct arg_info *args,
 779                            LLVMValueRef *desc_sets)
 780 {
 781         LLVMTypeRef type = ac_array_in_const32_addr_space(ctx->ac.i8);
 782
 783         /* 1 for each descriptor set */
 784         if (!user_sgpr_info->indirect_all_descriptor_sets) {
 785                 uint32_t mask = ctx->shader_info->info.desc_set_used_mask;
 786
 787                 while (mask) {
 788                         int i = u_bit_scan(&mask);
 789
 790                         add_arg(args, ARG_SGPR, type, &ctx->descriptor_sets[i]);
 791                 }
 792         } else {
 793                 add_arg(args, ARG_SGPR, ac_array_in_const32_addr_space(type),
 794                         desc_sets);
 795         }
 796
 797         if (ctx->shader_info->info.loads_push_constants) {
 798                 /* 1 for push constants and dynamic descriptors */
 799                 add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants);
 800         }
 801
 802         for (unsigned i = 0; i < ctx->shader_info->info.num_inline_push_consts; i++) {
 803                 add_arg(args, ARG_SGPR, ctx->ac.i32,
 804                         &ctx->abi.inline_push_consts[i]);
 805         }
 806         ctx->abi.num_inline_push_consts = ctx->shader_info->info.num_inline_push_consts;
 807         ctx->abi.base_inline_push_consts = ctx->shader_info->info.base_inline_push_consts;
 808
 809         if (ctx->shader_info->info.so.num_outputs) {
 810                 add_arg(args, ARG_SGPR,
 811                         ac_array_in_const32_addr_space(ctx->ac.v4i32),
 812                         &ctx->streamout_buffers);
 813         }
 814 }
 815
 816 static void
 817 declare_vs_specific_input_sgprs(struct radv_shader_context *ctx,
 818                                 gl_shader_stage stage,
 819                                 bool has_previous_stage,
 820                                 gl_shader_stage previous_stage,
 821                                 struct arg_info *args)
 822 {
 823         if (!ctx->is_gs_copy_shader &&
 824             (stage == MESA_SHADER_VERTEX ||
 825              (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
 826                 if (ctx->shader_info->info.vs.has_vertex_buffers) {
 827                         add_arg(args, ARG_SGPR,
 828                                 ac_array_in_const32_addr_space(ctx->ac.v4i32),
 829                                 &ctx->vertex_buffers);
 830                 }
 831                 add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
 832                 add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance);
 833                 if (ctx->shader_info->info.vs.needs_draw_id) {
 834                         add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.draw_id);
 835                 }
 836         }
 837 }
 838
 839 static void
 840 declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args)
 841 {
 842         add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.vertex_id);
 843         if (!ctx->is_gs_copy_shader) {
 844                 if (ctx->options->key.vs_common_out.as_ls) {
 845                         add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id);
 846                         if (ctx->ac.chip_class >= GFX10) {
 847                                 add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
 848                                 add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
 849                         } else {
 850                                 add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
 851                                 add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
 852                         }
 853                 } else {
 854                         if (ctx->ac.chip_class >= GFX10) {
 855                                 add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
 856                                 add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */
 857                                 add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
 858                         } else {
 859                                 add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
 860                                 add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
 861                                 add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
 862                         }
 863                 }
 864         }
 865 }
 866
 867 static void
 868 declare_streamout_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage,
 869                         struct arg_info *args)
 870 {
 871         int i;
 872
 873         if (ctx->ac.chip_class >= GFX10)
 874                 return;
 875
 876         /* Streamout SGPRs. */
 877         if (ctx->shader_info->info.so.num_outputs) {
 878                 assert(stage == MESA_SHADER_VERTEX ||
 879                        stage == MESA_SHADER_TESS_EVAL);
 880
 881                 if (stage != MESA_SHADER_TESS_EVAL) {
 882                         add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_config);
 883                 } else {
 884                         args->assign[args->count - 1] = &ctx->streamout_config;
 885                         args->types[args->count - 1] = ctx->ac.i32;
 886                 }
 887
 888                 add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_write_idx);
 889         }
 890
 891         /* A streamout buffer offset is loaded if the stride is non-zero. */
 892         for (i = 0; i < 4; i++) {
 893                 if (!ctx->shader_info->info.so.strides[i])
 894                         continue;
 895
 896                 add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_offset[i]);
 897         }
 898 }
 899
 900 static void
 901 declare_tes_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args)
 902 {
 903         add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_u);
 904         add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_v);
 905         add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->tes_rel_patch_id);
 906         add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id);
 907 }
 908
 909 static void
 910 set_global_input_locs(struct radv_shader_context *ctx,
 911                       const struct user_sgpr_info *user_sgpr_info,
 912                       LLVMValueRef desc_sets, uint8_t *user_sgpr_idx)
 913 {
 914         uint32_t mask = ctx->shader_info->info.desc_set_used_mask;
 915
 916         if (!user_sgpr_info->indirect_all_descriptor_sets) {
 917                 while (mask) {
 918                         int i = u_bit_scan(&mask);
 919
 920                         set_loc_desc(ctx, i, user_sgpr_idx);
 921                 }
 922         } else {
 923                 set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS,
 924                                    user_sgpr_idx);
 925
 926                 while (mask) {
 927                         int i = u_bit_scan(&mask);
 928
 929                         ctx->descriptor_sets[i] =
 930                                 ac_build_load_to_sgpr(&ctx->ac, desc_sets,
 931                                                       LLVMConstInt(ctx->ac.i32, i, false));
 932
 933                 }
 934
 935                 ctx->shader_info->need_indirect_descriptor_sets = true;
 936         }
 937
 938         if (ctx->shader_info->info.loads_push_constants) {
 939                 set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
 940         }
 941
 942         if (ctx->shader_info->info.num_inline_push_consts) {
 943                 set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx,
 944                                ctx->shader_info->info.num_inline_push_consts);
 945         }
 946
 947         if (ctx->streamout_buffers) {
 948                 set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS,
 949                                user_sgpr_idx);
 950         }
 951 }
 952
 953 static void
 954 set_vs_specific_input_locs(struct radv_shader_context *ctx,
 955                            gl_shader_stage stage, bool has_previous_stage,
 956                            gl_shader_stage previous_stage,
 957                            uint8_t *user_sgpr_idx)
 958 {
 959         if (!ctx->is_gs_copy_shader &&
 960             (stage == MESA_SHADER_VERTEX ||
 961              (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
 962                 if (ctx->shader_info->info.vs.has_vertex_buffers) {
 963                         set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS,
 964                                            user_sgpr_idx);
 965                 }
 966
 967                 unsigned vs_num = 2;
 968                 if (ctx->shader_info->info.vs.needs_draw_id)
 969                         vs_num++;
 970
 971                 set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE,
 972                                user_sgpr_idx, vs_num);
 973         }
 974 }
 975
 976 static void set_llvm_calling_convention(LLVMValueRef func,
 977                                         gl_shader_stage stage)
 978 {
 979         enum radeon_llvm_calling_convention calling_conv;
 980
 981         switch (stage) {
 982         case MESA_SHADER_VERTEX:
 983         case MESA_SHADER_TESS_EVAL:
 984                 calling_conv = RADEON_LLVM_AMDGPU_VS;
 985                 break;
 986         case MESA_SHADER_GEOMETRY:
 987                 calling_conv = RADEON_LLVM_AMDGPU_GS;
 988                 break;
 989         case MESA_SHADER_TESS_CTRL:
 990                 calling_conv = RADEON_LLVM_AMDGPU_HS;
 991                 break;
 992         case MESA_SHADER_FRAGMENT:
 993                 calling_conv = RADEON_LLVM_AMDGPU_PS;
 994                 break;
 995         case MESA_SHADER_COMPUTE:
 996                 calling_conv = RADEON_LLVM_AMDGPU_CS;
 997                 break;
 998         default:
 999                 unreachable("Unhandle shader type");
1000         }
1001
1002         LLVMSetFunctionCallConv(func, calling_conv);
1003 }
1004
1005 /* Returns whether the stage is a stage that can be directly before the GS */
1006 static bool is_pre_gs_stage(gl_shader_stage stage)
1007 {
1008         return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
1009 }
1010
1011 static void create_function(struct radv_shader_context *ctx,
1012                             gl_shader_stage stage,
1013                             bool has_previous_stage,
1014                             gl_shader_stage previous_stage)
1015 {
1016         uint8_t user_sgpr_idx;
1017         struct user_sgpr_info user_sgpr_info;
1018         struct arg_info args = {};
1019         LLVMValueRef desc_sets;
1020         bool needs_view_index = needs_view_index_sgpr(ctx, stage);
1021
1022         if (ctx->ac.chip_class >= GFX10) {
1023                 if (is_pre_gs_stage(stage) && ctx->options->key.vs_common_out.as_ngg) {
1024                         /* On GFX10, VS is merged into GS for NGG. */
1025                         previous_stage = stage;
1026                         stage = MESA_SHADER_GEOMETRY;
1027                         has_previous_stage = true;
1028                 }
1029         }
1030
1031         allocate_user_sgprs(ctx, stage, has_previous_stage,
1032                             previous_stage, needs_view_index, &user_sgpr_info);
1033
1034         if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) {
1035                 add_arg(&args, ARG_SGPR, ac_array_in_const_addr_space(ctx->ac.v4i32),
1036                         &ctx->ring_offsets);
1037         }
1038
1039         switch (stage) {
1040         case MESA_SHADER_COMPUTE:
1041                 declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
1042                                            &desc_sets);
1043
1044                 if (ctx->shader_info->info.cs.uses_grid_size) {
1045                         add_arg(&args, ARG_SGPR, ctx->ac.v3i32,
1046                                 &ctx->abi.num_work_groups);
1047                 }
1048
1049                 for (int i = 0; i < 3; i++) {
1050                         ctx->abi.workgroup_ids[i] = NULL;
1051                         if (ctx->shader_info->info.cs.uses_block_id[i]) {
1052                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
1053                                         &ctx->abi.workgroup_ids[i]);
1054                         }
1055                 }
1056
1057                 if (ctx->shader_info->info.cs.uses_local_invocation_idx)
1058                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.tg_size);
1059                 add_arg(&args, ARG_VGPR, ctx->ac.v3i32,
1060                         &ctx->abi.local_invocation_ids);
1061                 break;
1062         case MESA_SHADER_VERTEX:
1063                 declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
1064                                            &desc_sets);
1065
1066                 declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage,
1067                                                 previous_stage, &args);
1068
1069                 if (needs_view_index)
1070                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
1071                                 &ctx->abi.view_index);
1072                 if (ctx->options->key.vs_common_out.as_es) {
1073                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
1074                                 &ctx->es2gs_offset);
1075                 } else if (ctx->options->key.vs_common_out.as_ls) {
1076                         /* no extra parameters */
1077                 } else {
1078                         declare_streamout_sgprs(ctx, stage, &args);
1079                 }
1080
1081                 declare_vs_input_vgprs(ctx, &args);
1082                 break;
1083         case MESA_SHADER_TESS_CTRL:
1084                 if (has_previous_stage) {
1085                         // First 6 system regs
1086                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
1087                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
1088                                 &ctx->merged_wave_info);
1089                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
1090                                 &ctx->tess_factor_offset);
1091
1092                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset
1093                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
1094                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
1095
1096                         declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
1097                                                    &desc_sets);
1098
1099                         declare_vs_specific_input_sgprs(ctx, stage,
1100                                                         has_previous_stage,
1101                                                         previous_stage, &args);
1102
1103                         if (needs_view_index)
1104                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
1105                                         &ctx->abi.view_index);
1106
1107                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1108                                 &ctx->abi.tcs_patch_id);
1109                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1110                                 &ctx->abi.tcs_rel_ids);
1111
1112                         declare_vs_input_vgprs(ctx, &args);
1113                 } else {
1114                         declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
1115                                                    &desc_sets);
1116
1117                         if (needs_view_index)
1118                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
1119                                         &ctx->abi.view_index);
1120
1121                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
1122                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
1123                                 &ctx->tess_factor_offset);
1124                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1125                                 &ctx->abi.tcs_patch_id);
1126                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1127                                 &ctx->abi.tcs_rel_ids);
1128                 }
1129                 break;
1130         case MESA_SHADER_TESS_EVAL:
1131                 declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
1132                                            &desc_sets);
1133
1134                 if (needs_view_index)
1135                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
1136                                 &ctx->abi.view_index);
1137
1138                 if (ctx->options->key.vs_common_out.as_es) {
1139                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
1140                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL);
1141                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
1142                                 &ctx->es2gs_offset);
1143                 } else {
1144                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL);
1145                         declare_streamout_sgprs(ctx, stage, &args);
1146                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
1147                 }
1148                 declare_tes_input_vgprs(ctx, &args);
1149                 break;
1150         case MESA_SHADER_GEOMETRY:
1151                 if (has_previous_stage) {
1152                         // First 6 system regs
1153                         if (ctx->options->key.vs_common_out.as_ngg) {
1154                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
1155                                         &ctx->gs_tg_info);
1156                         } else {
1157                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
1158                                         &ctx->gs2vs_offset);
1159                         }
1160
1161                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
1162                                 &ctx->merged_wave_info);
1163                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
1164
1165                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset
1166                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
1167                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
1168
1169                         declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
1170                                                    &desc_sets);
1171
1172                         if (previous_stage != MESA_SHADER_TESS_EVAL) {
1173                                 declare_vs_specific_input_sgprs(ctx, stage,
1174                                                                 has_previous_stage,
1175                                                                 previous_stage,
1176                                                                 &args);
1177                         }
1178
1179                         if (needs_view_index)
1180                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
1181                                         &ctx->abi.view_index);
1182
1183                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1184                                 &ctx->gs_vtx_offset[0]);
1185                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1186                                 &ctx->gs_vtx_offset[2]);
1187                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1188                                 &ctx->abi.gs_prim_id);
1189                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1190                                 &ctx->abi.gs_invocation_id);
1191                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1192                                 &ctx->gs_vtx_offset[4]);
1193
1194                         if (previous_stage == MESA_SHADER_VERTEX) {
1195                                 declare_vs_input_vgprs(ctx, &args);
1196                         } else {
1197                                 declare_tes_input_vgprs(ctx, &args);
1198                         }
1199                 } else {
1200                         declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
1201                                                    &desc_sets);
1202
1203                         if (needs_view_index)
1204                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
1205                                         &ctx->abi.view_index);
1206
1207                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs2vs_offset);
1208                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs_wave_id);
1209                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1210                                 &ctx->gs_vtx_offset[0]);
1211                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1212                                 &ctx->gs_vtx_offset[1]);
1213                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1214                                 &ctx->abi.gs_prim_id);
1215                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1216                                 &ctx->gs_vtx_offset[2]);
1217                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1218                                 &ctx->gs_vtx_offset[3]);
1219                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1220                                 &ctx->gs_vtx_offset[4]);
1221                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1222                                 &ctx->gs_vtx_offset[5]);
1223                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
1224                                 &ctx->abi.gs_invocation_id);
1225                 }
1226                 break;
1227         case MESA_SHADER_FRAGMENT:
1228                 declare_global_input_sgprs(ctx, &user_sgpr_info, &args,
1229                                            &desc_sets);
1230
1231                 add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.prim_mask);
1232                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_sample);
1233                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_center);
1234                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_centroid);
1235                 add_arg(&args, ARG_VGPR, ctx->ac.v3i32, NULL); /* persp pull model */
1236                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_sample);
1237                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_center);
1238                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_centroid);
1239                 add_arg(&args, ARG_VGPR, ctx->ac.f32, NULL);  /* line stipple tex */
1240                 add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[0]);
1241                 add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[1]);
1242                 add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[2]);
1243                 add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[3]);
1244                 add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.front_face);
1245                 add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.ancillary);
1246                 add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.sample_coverage);
1247                 add_arg(&args, ARG_VGPR, ctx->ac.i32, NULL);  /* fixed pt */
1248                 break;
1249         default:
1250                 unreachable("Shader stage not implemented");
1251         }
1252
1253         ctx->main_function = create_llvm_function(
1254             ctx->context, ctx->ac.module, ctx->ac.builder, NULL, 0, &args,
1255             ctx->max_workgroup_size, ctx->options);
1256         set_llvm_calling_convention(ctx->main_function, stage);
1257
1258
1259         ctx->shader_info->num_input_vgprs = 0;
1260         ctx->shader_info->num_input_sgprs = ctx->options->supports_spill ? 2 : 0;
1261
1262         ctx->shader_info->num_input_sgprs += args.num_sgprs_used;
1263
1264         if (ctx->stage != MESA_SHADER_FRAGMENT)
1265                 ctx->shader_info->num_input_vgprs = args.num_vgprs_used;
1266
1267         assign_arguments(ctx->main_function, &args);
1268
1269         user_sgpr_idx = 0;
1270
1271         if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets) {
1272                 set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS,
1273                                    &user_sgpr_idx);
1274                 if (ctx->options->supports_spill) {
1275                         ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
1276                                                                LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST),
1277                                                                NULL, 0, AC_FUNC_ATTR_READNONE);
1278                         ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets,
1279                                                              ac_array_in_const_addr_space(ctx->ac.v4i32), "");
1280                 }
1281         }
1282
1283         /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including
1284          * the rw_buffers at s0/s1. With user SGPR0 = s8, lets restart the count from 0 */
1285         if (has_previous_stage)
1286                 user_sgpr_idx = 0;
1287
1288         set_global_input_locs(ctx, &user_sgpr_info, desc_sets, &user_sgpr_idx);
1289
1290         switch (stage) {
1291         case MESA_SHADER_COMPUTE:
1292                 if (ctx->shader_info->info.cs.uses_grid_size) {
1293                         set_loc_shader(ctx, AC_UD_CS_GRID_SIZE,
1294                                        &user_sgpr_idx, 3);
1295                 }
1296                 break;
1297         case MESA_SHADER_VERTEX:
1298                 set_vs_specific_input_locs(ctx, stage, has_previous_stage,
1299                                            previous_stage, &user_sgpr_idx);
1300                 if (ctx->abi.view_index)
1301                         set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1302                 break;
1303         case MESA_SHADER_TESS_CTRL:
1304                 set_vs_specific_input_locs(ctx, stage, has_previous_stage,
1305                                            previous_stage, &user_sgpr_idx);
1306                 if (ctx->abi.view_index)
1307                         set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1308                 break;
1309         case MESA_SHADER_TESS_EVAL:
1310                 if (ctx->abi.view_index)
1311                         set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1312                 break;
1313         case MESA_SHADER_GEOMETRY:
1314                 if (has_previous_stage) {
1315                         if (previous_stage == MESA_SHADER_VERTEX)
1316                                 set_vs_specific_input_locs(ctx, stage,
1317                                                            has_previous_stage,
1318                                                            previous_stage,
1319                                                            &user_sgpr_idx);
1320                 }
1321                 if (ctx->abi.view_index)
1322                         set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1323                 break;
1324         case MESA_SHADER_FRAGMENT:
1325                 break;
1326         default:
1327                 unreachable("Shader stage not implemented");
1328         }
1329
1330         if (stage == MESA_SHADER_TESS_CTRL ||
1331             (stage == MESA_SHADER_VERTEX && ctx->options->key.vs_common_out.as_ls) ||
1332             /* GFX9 has the ESGS ring buffer in LDS. */
1333             (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) {
1334                 ac_declare_lds_as_pointer(&ctx->ac);
1335         }
1336
1337         ctx->shader_info->num_user_sgprs = user_sgpr_idx;
1338 }
1339
1340
1341 static LLVMValueRef
1342 radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index,
1343                    unsigned desc_set, unsigned binding)
1344 {
1345         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1346         LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];
1347         struct radv_pipeline_layout *pipeline_layout = ctx->options->layout;
1348         struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
1349         unsigned base_offset = layout->binding[binding].offset;
1350         LLVMValueRef offset, stride;
1351
1352         if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
1353             layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
1354                 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
1355                         layout->binding[binding].dynamic_offset_offset;
1356                 desc_ptr = ctx->abi.push_constants;
1357                 base_offset = pipeline_layout->push_constant_size + 16 * idx;
1358                 stride = LLVMConstInt(ctx->ac.i32, 16, false);
1359         } else
1360                 stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);
1361
1362         offset = LLVMConstInt(ctx->ac.i32, base_offset, false);
1363
1364         if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
1365                 offset = ac_build_imad(&ctx->ac, index, stride, offset);
1366         }
1367
1368         desc_ptr = LLVMBuildGEP(ctx->ac.builder, desc_ptr, &offset, 1, "");
1369         desc_ptr = ac_cast_ptr(&ctx->ac, desc_ptr, ctx->ac.v4i32);
1370         LLVMSetMetadata(desc_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
1371
1372         if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
1373                 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1374                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1375                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1376                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1377                         S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1378                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1379
1380                 LLVMValueRef desc_components[4] = {
1381                         LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""),
1382                         LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi), false),
1383                         /* High limit to support variable sizes. */
1384                         LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
1385                         LLVMConstInt(ctx->ac.i32, desc_type, false),
1386                 };
1387
1388                 return ac_build_gather_values(&ctx->ac, desc_components, 4);
1389         }
1390
1391         return desc_ptr;
1392 }
1393
1394
1395 /* The offchip buffer layout for TCS->TES is
1396  *
1397  * - attribute 0 of patch 0 vertex 0
1398  * - attribute 0 of patch 0 vertex 1
1399  * - attribute 0 of patch 0 vertex 2
1400  *   ...
1401  * - attribute 0 of patch 1 vertex 0
1402  * - attribute 0 of patch 1 vertex 1
1403  *   ...
1404  * - attribute 1 of patch 0 vertex 0
1405  * - attribute 1 of patch 0 vertex 1
1406  *   ...
1407  * - per patch attribute 0 of patch 0
1408  * - per patch attribute 0 of patch 1
1409  *   ...
1410  *
1411  * Note that every attribute has 4 components.
1412  */
1413 static LLVMValueRef get_non_vertex_index_offset(struct radv_shader_context *ctx)
1414 {
1415         uint32_t num_patches = ctx->tcs_num_patches;
1416         uint32_t num_tcs_outputs;
1417         if (ctx->stage == MESA_SHADER_TESS_CTRL)
1418                 num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
1419         else
1420                 num_tcs_outputs = ctx->options->key.tes.tcs_num_outputs;
1421
1422         uint32_t output_vertex_size = num_tcs_outputs * 16;
1423         uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size;
1424
1425         return LLVMConstInt(ctx->ac.i32, pervertex_output_patch_size * num_patches, false);
1426 }
1427
1428 static LLVMValueRef calc_param_stride(struct radv_shader_context *ctx,
1429                                       LLVMValueRef vertex_index)
1430 {
1431         LLVMValueRef param_stride;
1432         if (vertex_index)
1433                 param_stride = LLVMConstInt(ctx->ac.i32, ctx->tcs_vertices_per_patch * ctx->tcs_num_patches, false);
1434         else
1435                 param_stride = LLVMConstInt(ctx->ac.i32, ctx->tcs_num_patches, false);
1436         return param_stride;
1437 }
1438
1439 static LLVMValueRef get_tcs_tes_buffer_address(struct radv_shader_context *ctx,
1440                                                LLVMValueRef vertex_index,
1441                                                LLVMValueRef param_index)
1442 {
1443         LLVMValueRef base_addr;
1444         LLVMValueRef param_stride, constant16;
1445         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
1446         LLVMValueRef vertices_per_patch = LLVMConstInt(ctx->ac.i32, ctx->tcs_vertices_per_patch, false);
1447         constant16 = LLVMConstInt(ctx->ac.i32, 16, false);
1448         param_stride = calc_param_stride(ctx, vertex_index);
1449         if (vertex_index) {
1450                 base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
1451                                           vertices_per_patch, vertex_index);
1452         } else {
1453                 base_addr = rel_patch_id;
1454         }
1455
1456         base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
1457                                  LLVMBuildMul(ctx->ac.builder, param_index,
1458                                               param_stride, ""), "");
1459
1460         base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
1461
1462         if (!vertex_index) {
1463                 LLVMValueRef patch_data_offset = get_non_vertex_index_offset(ctx);
1464
1465                 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
1466                                          patch_data_offset, "");
1467         }
1468         return base_addr;
1469 }
1470
1471 static LLVMValueRef get_tcs_tes_buffer_address_params(struct radv_shader_context *ctx,
1472                                                       unsigned param,
1473                                                       unsigned const_index,
1474                                                       bool is_compact,
1475                                                       LLVMValueRef vertex_index,
1476                                                       LLVMValueRef indir_index)
1477 {
1478         LLVMValueRef param_index;
1479
1480         if (indir_index)
1481                 param_index = LLVMBuildAdd(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, param, false),
1482                                            indir_index, "");
1483         else {
1484                 if (const_index && !is_compact)
1485                         param += const_index;
1486                 param_index = LLVMConstInt(ctx->ac.i32, param, false);
1487         }
1488         return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
1489 }
1490
1491 static LLVMValueRef
1492 get_dw_address(struct radv_shader_context *ctx,
1493                LLVMValueRef dw_addr,
1494                unsigned param,
1495                unsigned const_index,
1496                bool compact_const_index,
1497                LLVMValueRef vertex_index,
1498                LLVMValueRef stride,
1499                LLVMValueRef indir_index)
1500
1501 {
1502
1503         if (vertex_index) {
1504                 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1505                                        LLVMBuildMul(ctx->ac.builder,
1506                                                     vertex_index,
1507                                                     stride, ""), "");
1508         }
1509
1510         if (indir_index)
1511                 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1512                                        LLVMBuildMul(ctx->ac.builder, indir_index,
1513                                                     LLVMConstInt(ctx->ac.i32, 4, false), ""), "");
1514         else if (const_index && !compact_const_index)
1515                 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1516                                        LLVMConstInt(ctx->ac.i32, const_index * 4, false), "");
1517
1518         dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1519                                LLVMConstInt(ctx->ac.i32, param * 4, false), "");
1520
1521         if (const_index && compact_const_index)
1522                 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1523                                        LLVMConstInt(ctx->ac.i32, const_index, false), "");
1524         return dw_addr;
1525 }
1526
1527 static LLVMValueRef
1528 load_tcs_varyings(struct ac_shader_abi *abi,
1529                   LLVMTypeRef type,
1530                   LLVMValueRef vertex_index,
1531                   LLVMValueRef indir_index,
1532                   unsigned const_index,
1533                   unsigned location,
1534                   unsigned driver_location,
1535                   unsigned component,
1536                   unsigned num_components,
1537                   bool is_patch,
1538                   bool is_compact,
1539                   bool load_input)
1540 {
1541         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1542         LLVMValueRef dw_addr, stride;
1543         LLVMValueRef value[4], result;
1544         unsigned param = shader_io_get_unique_index(location);
1545
1546         if (load_input) {
1547                 uint32_t input_vertex_size = (ctx->tcs_num_inputs * 16) / 4;
1548                 stride = LLVMConstInt(ctx->ac.i32, input_vertex_size, false);
1549                 dw_addr = get_tcs_in_current_patch_offset(ctx);
1550         } else {
1551                 if (!is_patch) {
1552                         stride = get_tcs_out_vertex_stride(ctx);
1553                         dw_addr = get_tcs_out_current_patch_offset(ctx);
1554                 } else {
1555                         dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1556                         stride = NULL;
1557                 }
1558         }
1559
1560         dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
1561                                  indir_index);
1562
1563         for (unsigned i = 0; i < num_components + component; i++) {
1564                 value[i] = ac_lds_load(&ctx->ac, dw_addr);
1565                 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1566                                        ctx->ac.i32_1, "");
1567         }
1568         result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1569         return result;
1570 }
1571
1572 static void
1573 store_tcs_output(struct ac_shader_abi *abi,
1574                  const nir_variable *var,
1575                  LLVMValueRef vertex_index,
1576                  LLVMValueRef param_index,
1577                  unsigned const_index,
1578                  LLVMValueRef src,
1579                  unsigned writemask)
1580 {
1581         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1582         const unsigned location = var->data.location;
1583         unsigned component = var->data.location_frac;
1584         const bool is_patch = var->data.patch;
1585         const bool is_compact = var->data.compact;
1586         LLVMValueRef dw_addr;
1587         LLVMValueRef stride = NULL;
1588         LLVMValueRef buf_addr = NULL;
1589         unsigned param;
1590         bool store_lds = true;
1591
1592         if (is_patch) {
1593                 if (!(ctx->tcs_patch_outputs_read & (1U << (location - VARYING_SLOT_PATCH0))))
1594                         store_lds = false;
1595         } else {
1596                 if (!(ctx->tcs_outputs_read & (1ULL << location)))
1597                         store_lds = false;
1598         }
1599
1600         param = shader_io_get_unique_index(location);
1601         if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) {
1602                 const_index += component;
1603                 component = 0;
1604
1605                 if (const_index >= 4) {
1606                         const_index -= 4;
1607                         param++;
1608                 }
1609         }
1610
1611         if (!is_patch) {
1612                 stride = get_tcs_out_vertex_stride(ctx);
1613                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1614         } else {
1615                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1616         }
1617
1618         dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
1619                                  param_index);
1620         buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index, is_compact,
1621                                                      vertex_index, param_index);
1622
1623         bool is_tess_factor = false;
1624         if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
1625             location == VARYING_SLOT_TESS_LEVEL_OUTER)
1626                 is_tess_factor = true;
1627
1628         unsigned base = is_compact ? const_index : 0;
1629         for (unsigned chan = 0; chan < 8; chan++) {
1630                 if (!(writemask & (1 << chan)))
1631                         continue;
1632                 LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
1633                 value = ac_to_integer(&ctx->ac, value);
1634                 value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
1635
1636                 if (store_lds || is_tess_factor) {
1637                         LLVMValueRef dw_addr_chan =
1638                                 LLVMBuildAdd(ctx->ac.builder, dw_addr,
1639                                                            LLVMConstInt(ctx->ac.i32, chan, false), "");
1640                         ac_lds_store(&ctx->ac, dw_addr_chan, value);
1641                 }
1642
1643                 if (!is_tess_factor && writemask != 0xF)
1644                         ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, value, 1,
1645                                                     buf_addr, ctx->oc_lds,
1646                                                     4 * (base + chan), ac_glc, false);
1647         }
1648
1649         if (writemask == 0xF) {
1650                 ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, src, 4,
1651                                             buf_addr, ctx->oc_lds,
1652                                             (base * 4), ac_glc, false);
1653         }
1654 }
1655
1656 static LLVMValueRef
1657 load_tes_input(struct ac_shader_abi *abi,
1658                LLVMTypeRef type,
1659                LLVMValueRef vertex_index,
1660                LLVMValueRef param_index,
1661                unsigned const_index,
1662                unsigned location,
1663                unsigned driver_location,
1664                unsigned component,
1665                unsigned num_components,
1666                bool is_patch,
1667                bool is_compact,
1668                bool load_input)
1669 {
1670         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1671         LLVMValueRef buf_addr;
1672         LLVMValueRef result;
1673         unsigned param = shader_io_get_unique_index(location);
1674
1675         if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) {
1676                 const_index += component;
1677                 component = 0;
1678                 if (const_index >= 4) {
1679                         const_index -= 4;
1680                         param++;
1681                 }
1682         }
1683
1684         buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index,
1685                                                      is_compact, vertex_index, param_index);
1686
1687         LLVMValueRef comp_offset = LLVMConstInt(ctx->ac.i32, component * 4, false);
1688         buf_addr = LLVMBuildAdd(ctx->ac.builder, buf_addr, comp_offset, "");
1689
1690         result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, num_components, NULL,
1691                                       buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, ac_glc, true, false);
1692         result = ac_trim_vector(&ctx->ac, result, num_components);
1693         return result;
1694 }
1695
1696 static LLVMValueRef
1697 load_gs_input(struct ac_shader_abi *abi,
1698               unsigned location,
1699               unsigned driver_location,
1700               unsigned component,
1701               unsigned num_components,
1702               unsigned vertex_index,
1703               unsigned const_index,
1704               LLVMTypeRef type)
1705 {
1706         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1707         LLVMValueRef vtx_offset;
1708         unsigned param, vtx_offset_param;
1709         LLVMValueRef value[4], result;
1710
1711         vtx_offset_param = vertex_index;
1712         assert(vtx_offset_param < 6);
1713         vtx_offset = LLVMBuildMul(ctx->ac.builder, ctx->gs_vtx_offset[vtx_offset_param],
1714                                   LLVMConstInt(ctx->ac.i32, 4, false), "");
1715
1716         param = shader_io_get_unique_index(location);
1717
1718         for (unsigned i = component; i < num_components + component; i++) {
1719                 if (ctx->ac.chip_class >= GFX9) {
1720                         LLVMValueRef dw_addr = ctx->gs_vtx_offset[vtx_offset_param];
1721                         dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1722                                                LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), "");
1723                         value[i] = ac_lds_load(&ctx->ac, dw_addr);
1724                 } else {
1725                         LLVMValueRef soffset =
1726                                 LLVMConstInt(ctx->ac.i32,
1727                                              (param * 4 + i + const_index) * 256,
1728                                              false);
1729
1730                         value[i] = ac_build_buffer_load(&ctx->ac,
1731                                                         ctx->esgs_ring, 1,
1732                                                         ctx->ac.i32_0,
1733                                                         vtx_offset, soffset,
1734                                                         0, ac_glc, true, false);
1735                 }
1736
1737                 if (ac_get_type_size(type) == 2) {
1738                         value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], ctx->ac.i32, "");
1739                         value[i] = LLVMBuildTrunc(ctx->ac.builder, value[i], ctx->ac.i16, "");
1740                 }
1741                 value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], type, "");
1742         }
1743         result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1744         result = ac_to_integer(&ctx->ac, result);
1745         return result;
1746 }
1747
1748
1749 static void radv_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible)
1750 {
1751         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1752         ac_build_kill_if_false(&ctx->ac, visible);
1753 }
1754
1755 static LLVMValueRef lookup_interp_param(struct ac_shader_abi *abi,
1756                                         enum glsl_interp_mode interp, unsigned location)
1757 {
1758         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1759
1760         switch (interp) {
1761         case INTERP_MODE_FLAT:
1762         default:
1763                 return NULL;
1764         case INTERP_MODE_SMOOTH:
1765         case INTERP_MODE_NONE:
1766                 if (location == INTERP_CENTER)
1767                         return ctx->persp_center;
1768                 else if (location == INTERP_CENTROID)
1769                         return ctx->persp_centroid;
1770                 else if (location == INTERP_SAMPLE)
1771                         return ctx->persp_sample;
1772                 break;
1773         case INTERP_MODE_NOPERSPECTIVE:
1774                 if (location == INTERP_CENTER)
1775                         return ctx->linear_center;
1776                 else if (location == INTERP_CENTROID)
1777                         return ctx->linear_centroid;
1778                 else if (location == INTERP_SAMPLE)
1779                         return ctx->linear_sample;
1780                 break;
1781         }
1782         return NULL;
1783 }
1784
/* Map a sample count to an offset: 2 -> 1, 4 -> 3, 8 -> 7, anything else -> 0. */
static uint32_t
radv_get_sample_pos_offset(uint32_t num_samples)
{
        if (num_samples == 2 || num_samples == 4 || num_samples == 8)
                return num_samples - 1;

        return 0;
}
1805
1806 static LLVMValueRef load_sample_position(struct ac_shader_abi *abi,
1807                                          LLVMValueRef sample_id)
1808 {
1809         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1810
1811         LLVMValueRef result;
1812         LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false);
1813         LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, "");
1814
1815         ptr = LLVMBuildBitCast(ctx->ac.builder, ptr,
1816                                ac_array_in_const_addr_space(ctx->ac.v2f32), "");
1817
1818         uint32_t sample_pos_offset =
1819                 radv_get_sample_pos_offset(ctx->options->key.fs.num_samples);
1820
1821         sample_id =
1822                 LLVMBuildAdd(ctx->ac.builder, sample_id,
1823                              LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");
1824         result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);
1825
1826         return result;
1827 }
1828
1829
1830 static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
1831 {
1832         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1833         uint8_t log2_ps_iter_samples;
1834
1835         if (ctx->shader_info->info.ps.force_persample) {
1836                 log2_ps_iter_samples =
1837                         util_logbase2(ctx->options->key.fs.num_samples);
1838         } else {
1839                 log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
1840         }
1841
1842         /* The bit pattern matches that used by fixed function fragment
1843          * processing. */
1844         static const uint16_t ps_iter_masks[] = {
1845                 0xffff, /* not used */
1846                 0x5555,
1847                 0x1111,
1848                 0x0101,
1849                 0x0001,
1850         };
1851         assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
1852
1853         uint32_t ps_iter_mask = ps_iter_masks[log2_ps_iter_samples];
1854
1855         LLVMValueRef result, sample_id;
1856         sample_id = ac_unpack_param(&ctx->ac, abi->ancillary, 8, 4);
1857         sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), sample_id, "");
1858         result = LLVMBuildAnd(ctx->ac.builder, sample_id, abi->sample_coverage, "");
1859         return result;
1860 }
1861
1862
1863 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
1864                                      unsigned stream,
1865                                      LLVMValueRef *addrs);
1866
1867 static void
1868 visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
1869 {
1870         LLVMValueRef gs_next_vertex;
1871         LLVMValueRef can_emit;
1872         unsigned offset = 0;
1873         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1874
1875         if (ctx->options->key.vs_common_out.as_ngg) {
1876                 gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
1877                 return;
1878         }
1879
1880         /* Write vertex attribute values to GSVS ring */
1881         gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
1882                                        ctx->gs_next_vertex[stream],
1883                                        "");
1884
1885         /* If this thread has already emitted the declared maximum number of
1886          * vertices, kill it: excessive vertex emissions are not supposed to
1887          * have any effect, and GS threads have no externally observable
1888          * effects other than emitting vertices.
1889          */
1890         can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
1891                                  LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), "");
1892         ac_build_kill_if_false(&ctx->ac, can_emit);
1893
1894         for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
1895                 unsigned output_usage_mask =
1896                         ctx->shader_info->info.gs.output_usage_mask[i];
1897                 uint8_t output_stream =
1898                         ctx->shader_info->info.gs.output_streams[i];
1899                 LLVMValueRef *out_ptr = &addrs[i * 4];
1900                 int length = util_last_bit(output_usage_mask);
1901
1902                 if (!(ctx->output_mask & (1ull << i)) ||
1903                     output_stream != stream)
1904                         continue;
1905
1906                 for (unsigned j = 0; j < length; j++) {
1907                         if (!(output_usage_mask & (1 << j)))
1908                                 continue;
1909
1910                         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder,
1911                                                              out_ptr[j], "");
1912                         LLVMValueRef voffset =
1913                                 LLVMConstInt(ctx->ac.i32, offset *
1914                                              ctx->gs_max_out_vertices, false);
1915
1916                         offset++;
1917
1918                         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
1919                         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");
1920
1921                         out_val = ac_to_integer(&ctx->ac, out_val);
1922                         out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
1923
1924                         ac_build_buffer_store_dword(&ctx->ac,
1925                                                     ctx->gsvs_ring[stream],
1926                                                     out_val, 1,
1927                                                     voffset, ctx->gs2vs_offset, 0,
1928                                                     ac_glc | ac_slc, true);
1929                 }
1930         }
1931
1932         gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex,
1933                                       ctx->ac.i32_1, "");
1934         LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
1935
1936         ac_build_sendmsg(&ctx->ac,
1937                          AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
1938                          ctx->gs_wave_id);
1939 }
1940
1941 static void
1942 visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
1943 {
1944         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1945
1946         if (ctx->options->key.vs_common_out.as_ngg) {
1947                 LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
1948                 return;
1949         }
1950
1951         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id);
1952 }
1953
1954 static LLVMValueRef
1955 load_tess_coord(struct ac_shader_abi *abi)
1956 {
1957         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1958
1959         LLVMValueRef coord[4] = {
1960                 ctx->tes_u,
1961                 ctx->tes_v,
1962                 ctx->ac.f32_0,
1963                 ctx->ac.f32_0,
1964         };
1965
1966         if (ctx->tes_primitive_mode == GL_TRIANGLES)
1967                 coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
1968                                         LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), "");
1969
1970         return ac_build_gather_values(&ctx->ac, coord, 3);
1971 }
1972
1973 static LLVMValueRef
1974 load_patch_vertices_in(struct ac_shader_abi *abi)
1975 {
1976         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1977         return LLVMConstInt(ctx->ac.i32, ctx->options->key.tcs.input_vertices, false);
1978 }
1979
1980
1981 static LLVMValueRef radv_load_base_vertex(struct ac_shader_abi *abi)
1982 {
1983         return abi->base_vertex;
1984 }
1985
1986 static LLVMValueRef radv_load_ssbo(struct ac_shader_abi *abi,
1987                                    LLVMValueRef buffer_ptr, bool write)
1988 {
1989         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1990         LLVMValueRef result;
1991
1992         LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
1993
1994         result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
1995         LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
1996
1997         return result;
1998 }
1999
2000 static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr)
2001 {
2002         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
2003         LLVMValueRef result;
2004
2005         if (LLVMGetTypeKind(LLVMTypeOf(buffer_ptr)) != LLVMPointerTypeKind) {
2006                 /* Do not load the descriptor for inlined uniform blocks. */
2007                 return buffer_ptr;
2008         }
2009
2010         LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
2011
2012         result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
2013         LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
2014
2015         return result;
2016 }
2017
2018 static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi,
2019                                           unsigned descriptor_set,
2020                                           unsigned base_index,
2021                                           unsigned constant_index,
2022                                           LLVMValueRef index,
2023                                           enum ac_descriptor_type desc_type,
2024                                           bool image, bool write,
2025                                           bool bindless)
2026 {
2027         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
2028         LLVMValueRef list = ctx->descriptor_sets[descriptor_set];
2029         struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
2030         struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
2031         unsigned offset = binding->offset;
2032         unsigned stride = binding->size;
2033         unsigned type_size;
2034         LLVMBuilderRef builder = ctx->ac.builder;
2035         LLVMTypeRef type;
2036
2037         assert(base_index < layout->binding_count);
2038
2039         switch (desc_type) {
2040         case AC_DESC_IMAGE:
2041                 type = ctx->ac.v8i32;
2042                 type_size = 32;
2043                 break;
2044         case AC_DESC_FMASK:
2045                 type = ctx->ac.v8i32;
2046                 offset += 32;
2047                 type_size = 32;
2048                 break;
2049         case AC_DESC_SAMPLER:
2050                 type = ctx->ac.v4i32;
2051                 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
2052                         offset += radv_combined_image_descriptor_sampler_offset(binding);
2053                 }
2054
2055                 type_size = 16;
2056                 break;
2057         case AC_DESC_BUFFER:
2058                 type = ctx->ac.v4i32;
2059                 type_size = 16;
2060                 break;
2061         case AC_DESC_PLANE_0:
2062         case AC_DESC_PLANE_1:
2063         case AC_DESC_PLANE_2:
2064                 type = ctx->ac.v8i32;
2065                 type_size = 32;
2066                 offset += 32 * (desc_type - AC_DESC_PLANE_0);
2067                 break;
2068         default:
2069                 unreachable("invalid desc_type\n");
2070         }
2071
2072         offset += constant_index * stride;
2073
2074         if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset &&
2075             (!index || binding->immutable_samplers_equal)) {
2076                 if (binding->immutable_samplers_equal)
2077                         constant_index = 0;
2078
2079                 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
2080
2081                 LLVMValueRef constants[] = {
2082                         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0),
2083                         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0),
2084                         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0),
2085                         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0),
2086                 };
2087                 return ac_build_gather_values(&ctx->ac, constants, 4);
2088         }
2089
2090         assert(stride % type_size == 0);
2091
2092         LLVMValueRef adjusted_index = index;
2093         if (!adjusted_index)
2094                 adjusted_index = ctx->ac.i32_0;
2095
2096         adjusted_index = LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");
2097
2098         LLVMValueRef val_offset = LLVMConstInt(ctx->ac.i32, offset, 0);
2099         list = LLVMBuildGEP(builder, list, &val_offset, 1, "");
2100         list = LLVMBuildPointerCast(builder, list,
2101                                     ac_array_in_const32_addr_space(type), "");
2102
2103         LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index);
2104
2105         /* 3 plane formats always have same size and format for plane 1 & 2, so
2106          * use the tail from plane 1 so that we can store only the first 16 bytes
2107          * of the last plane. */
2108         if (desc_type == AC_DESC_PLANE_2) {
2109                 LLVMValueRef descriptor2 = radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index, AC_DESC_PLANE_1,image, write, bindless);
2110
2111                 LLVMValueRef components[8];
2112                 for (unsigned i = 0; i < 4; ++i)
2113                         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);
2114
2115                 for (unsigned i = 4; i < 8; ++i)
2116                         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);
2117                 descriptor = ac_build_gather_values(&ctx->ac, components, 8);
2118         }
2119
2120         return descriptor;
2121 }
2122
2123 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
2124  * so we may need to fix it up. */
2125 static LLVMValueRef
2126 adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
2127                           unsigned adjustment,
2128                           LLVMValueRef alpha)
2129 {
2130         if (adjustment == RADV_ALPHA_ADJUST_NONE)
2131                 return alpha;
2132
2133         LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
2134
2135         alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");
2136
2137         if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
2138                 alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
2139         else
2140                 alpha = ac_to_integer(&ctx->ac, alpha);
2141
2142         /* For the integer-like cases, do a natural sign extension.
2143          *
2144          * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
2145          * and happen to contain 0, 1, 2, 3 as the two LSBs of the
2146          * exponent.
2147          */
2148         alpha = LLVMBuildShl(ctx->ac.builder, alpha,
2149                              adjustment == RADV_ALPHA_ADJUST_SNORM ?
2150                              LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
2151         alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, "");
2152
2153         /* Convert back to the right type. */
2154         if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
2155                 LLVMValueRef clamp;
2156                 LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
2157                 alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
2158                 clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, "");
2159                 alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, "");
2160         } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
2161                 alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
2162         }
2163
2164         return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
2165 }
2166
2167 static unsigned
2168 get_num_channels_from_data_format(unsigned data_format)
2169 {
2170         switch (data_format) {
2171         case V_008F0C_BUF_DATA_FORMAT_8:
2172         case V_008F0C_BUF_DATA_FORMAT_16:
2173         case V_008F0C_BUF_DATA_FORMAT_32:
2174                 return 1;
2175         case V_008F0C_BUF_DATA_FORMAT_8_8:
2176         case V_008F0C_BUF_DATA_FORMAT_16_16:
2177         case V_008F0C_BUF_DATA_FORMAT_32_32:
2178                 return 2;
2179         case V_008F0C_BUF_DATA_FORMAT_10_11_11:
2180         case V_008F0C_BUF_DATA_FORMAT_11_11_10:
2181         case V_008F0C_BUF_DATA_FORMAT_32_32_32:
2182                 return 3;
2183         case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
2184         case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
2185         case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
2186         case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
2187         case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
2188                 return 4;
2189         default:
2190                 break;
2191         }
2192
2193         return 4;
2194 }
2195
2196 static LLVMValueRef
2197 radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
2198                                 LLVMValueRef value,
2199                                 unsigned num_channels,
2200                                 bool is_float)
2201 {
2202         LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
2203         LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
2204         LLVMValueRef chan[4];
2205
2206         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
2207                 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
2208
2209                 if (num_channels == 4 && num_channels == vec_size)
2210                         return value;
2211
2212                 num_channels = MIN2(num_channels, vec_size);
2213
2214                 for (unsigned i = 0; i < num_channels; i++)
2215                         chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
2216         } else {
2217                 if (num_channels) {
2218                         assert(num_channels == 1);
2219                         chan[0] = value;
2220                 }
2221         }
2222
2223         for (unsigned i = num_channels; i < 4; i++) {
2224                 chan[i] = i == 3 ? one : zero;
2225                 chan[i] = ac_to_integer(&ctx->ac, chan[i]);
2226         }
2227
2228         return ac_build_gather_values(&ctx->ac, chan, 4);
2229 }
2230
2231 static void
2232 handle_vs_input_decl(struct radv_shader_context *ctx,
2233                      struct nir_variable *variable)
2234 {
2235         LLVMValueRef t_list_ptr = ctx->vertex_buffers;
2236         LLVMValueRef t_offset;
2237         LLVMValueRef t_list;
2238         LLVMValueRef input;
2239         LLVMValueRef buffer_index;
2240         unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
2241         uint8_t input_usage_mask =
2242                 ctx->shader_info->info.vs.input_usage_mask[variable->data.location];
2243         unsigned num_input_channels = util_last_bit(input_usage_mask);
2244
2245         variable->data.driver_location = variable->data.location * 4;
2246
2247         enum glsl_base_type type = glsl_get_base_type(variable->type);
2248         for (unsigned i = 0; i < attrib_count; ++i) {
2249                 LLVMValueRef output[4];
2250                 unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0;
2251                 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[attrib_index];
2252                 unsigned data_format = attrib_format & 0x0f;
2253                 unsigned num_format = (attrib_format >> 4) & 0x07;
2254                 bool is_float = num_format != V_008F0C_BUF_NUM_FORMAT_UINT &&
2255                                 num_format != V_008F0C_BUF_NUM_FORMAT_SINT;
2256
2257                 if (ctx->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
2258                         uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index];
2259
2260                         if (divisor) {
2261                                 buffer_index = ctx->abi.instance_id;
2262
2263                                 if (divisor != 1) {
2264                                         buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
2265                                                                      LLVMConstInt(ctx->ac.i32, divisor, 0), "");
2266                                 }
2267                         } else {
2268                                 buffer_index = ctx->ac.i32_0;
2269                         }
2270
2271                         buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.start_instance, buffer_index, "");
2272                 } else
2273                         buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
2274                                                     ctx->abi.base_vertex, "");
2275
2276                 /* Adjust the number of channels to load based on the vertex
2277                  * attribute format.
2278                  */
2279                 unsigned num_format_channels = get_num_channels_from_data_format(data_format);
2280                 unsigned num_channels = MIN2(num_input_channels, num_format_channels);
2281                 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index];
2282                 unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index];
2283                 unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index];
2284
2285                 if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) {
2286                         /* Always load, at least, 3 channels for formats that
2287                          * need to be shuffled because X<->Z.
2288                          */
2289                         num_channels = MAX2(num_channels, 3);
2290                 }
2291
2292                 if (attrib_stride != 0 && attrib_offset > attrib_stride) {
2293                         LLVMValueRef buffer_offset =
2294                                 LLVMConstInt(ctx->ac.i32,
2295                                              attrib_offset / attrib_stride, false);
2296
2297                         buffer_index = LLVMBuildAdd(ctx->ac.builder,
2298                                                     buffer_index,
2299                                                     buffer_offset, "");
2300
2301                         attrib_offset = attrib_offset % attrib_stride;
2302                 }
2303
2304                 t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false);
2305                 t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
2306
2307                 input = ac_build_struct_tbuffer_load(&ctx->ac, t_list,
2308                                                      buffer_index,
2309                                                      LLVMConstInt(ctx->ac.i32, attrib_offset, false),
2310                                                      ctx->ac.i32_0, ctx->ac.i32_0,
2311                                                      num_channels,
2312                                                      data_format, num_format, 0, true);
2313
2314                 if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) {
2315                         LLVMValueRef c[4];
2316                         c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);
2317                         c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);
2318                         c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);
2319                         c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);
2320
2321                         input = ac_build_gather_values(&ctx->ac, c, 4);
2322                 }
2323
2324                 input = radv_fixup_vertex_input_fetches(ctx, input, num_channels,
2325                                                         is_float);
2326
2327                 for (unsigned chan = 0; chan < 4; chan++) {
2328                         LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
2329                         output[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");
2330                         if (type == GLSL_TYPE_FLOAT16) {
2331                                 output[chan] = LLVMBuildBitCast(ctx->ac.builder, output[chan], ctx->ac.f32, "");
2332                                 output[chan] = LLVMBuildFPTrunc(ctx->ac.builder, output[chan], ctx->ac.f16, "");
2333                         }
2334                 }
2335
2336                 unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3;
2337                 output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]);
2338
2339                 for (unsigned chan = 0; chan < 4; chan++) {
2340                         output[chan] = ac_to_integer(&ctx->ac, output[chan]);
2341                         if (type == GLSL_TYPE_UINT16 || type == GLSL_TYPE_INT16)
2342                                 output[chan] = LLVMBuildTrunc(ctx->ac.builder, output[chan], ctx->ac.i16, "");
2343
2344                         ctx->inputs[ac_llvm_reg_index_soa(variable->data.location + i, chan)] = output[chan];
2345                 }
2346         }
2347 }
2348
2349 static void
2350 handle_vs_inputs(struct radv_shader_context *ctx,
2351                  struct nir_shader *nir) {
2352         nir_foreach_variable(variable, &nir->inputs)
2353                 handle_vs_input_decl(ctx, variable);
2354 }
2355
2356 static void
2357 prepare_interp_optimize(struct radv_shader_context *ctx,
2358                         struct nir_shader *nir)
2359 {
2360         bool uses_center = false;
2361         bool uses_centroid = false;
2362         nir_foreach_variable(variable, &nir->inputs) {
2363                 if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
2364                     variable->data.sample)
2365                         continue;
2366
2367                 if (variable->data.centroid)
2368                         uses_centroid = true;
2369                 else
2370                         uses_center = true;
2371         }
2372
2373         if (uses_center && uses_centroid) {
2374                 LLVMValueRef sel = LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ctx->abi.prim_mask, ctx->ac.i32_0, "");
2375                 ctx->persp_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->persp_center, ctx->persp_centroid, "");
2376                 ctx->linear_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->linear_center, ctx->linear_centroid, "");
2377         }
2378 }
2379
2380 static void
2381 scan_shader_output_decl(struct radv_shader_context *ctx,
2382                         struct nir_variable *variable,
2383                         struct nir_shader *shader,
2384                         gl_shader_stage stage)
2385 {
2386         int idx = variable->data.location + variable->data.index;
2387         unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
2388         uint64_t mask_attribs;
2389
2390         variable->data.driver_location = idx * 4;
2391
2392         /* tess ctrl has it's own load/store paths for outputs */
2393         if (stage == MESA_SHADER_TESS_CTRL)
2394                 return;
2395
2396         if (variable->data.compact) {
2397                 unsigned component_count = variable->data.location_frac +
2398                                            glsl_get_length(variable->type);
2399                 attrib_count = (component_count + 3) / 4;
2400         }
2401
2402         mask_attribs = ((1ull << attrib_count) - 1) << idx;
2403         if (stage == MESA_SHADER_VERTEX ||
2404             stage == MESA_SHADER_TESS_EVAL ||
2405             stage == MESA_SHADER_GEOMETRY) {
2406                 if (idx == VARYING_SLOT_CLIP_DIST0) {
2407                         if (stage == MESA_SHADER_VERTEX) {
2408                                 ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1;
2409                                 ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1;
2410                                 ctx->shader_info->vs.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size;
2411                         }
2412                         if (stage == MESA_SHADER_TESS_EVAL) {
2413                                 ctx->shader_info->tes.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1;
2414                                 ctx->shader_info->tes.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1;
2415                                 ctx->shader_info->tes.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size;
2416                         }
2417                         if (stage == MESA_SHADER_GEOMETRY) {
2418                                 ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1;
2419                                 ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1;
2420                                 ctx->shader_info->vs.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size;
2421                         }
2422                 }
2423         }
2424
2425         ctx->output_mask |= mask_attribs;
2426 }
2427
2428
2429 /* Initialize arguments for the shader export intrinsic */
2430 static void
2431 si_llvm_init_export_args(struct radv_shader_context *ctx,
2432                          LLVMValueRef *values,
2433                          unsigned enabled_channels,
2434                          unsigned target,
2435                          struct ac_export_args *args)
2436 {
2437         /* Specify the channels that are enabled. */
2438         args->enabled_channels = enabled_channels;
2439
2440         /* Specify whether the EXEC mask represents the valid mask */
2441         args->valid_mask = 0;
2442
2443         /* Specify whether this is the last export */
2444         args->done = 0;
2445
2446         /* Specify the target we are exporting */
2447         args->target = target;
2448
2449         args->compr = false;
2450         args->out[0] = LLVMGetUndef(ctx->ac.f32);
2451         args->out[1] = LLVMGetUndef(ctx->ac.f32);
2452         args->out[2] = LLVMGetUndef(ctx->ac.f32);
2453         args->out[3] = LLVMGetUndef(ctx->ac.f32);
2454
2455         if (!values)
2456                 return;
2457
2458         bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
2459         if (ctx->stage == MESA_SHADER_FRAGMENT) {
2460                 unsigned index = target - V_008DFC_SQ_EXP_MRT;
2461                 unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
2462                 bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
2463                 bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
2464                 unsigned chan;
2465
2466                 LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL;
2467                 LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2],
2468                                       unsigned bits, bool hi) = NULL;
2469
2470                 switch(col_format) {
2471                 case V_028714_SPI_SHADER_ZERO:
2472                         args->enabled_channels = 0; /* writemask */
2473                         args->target = V_008DFC_SQ_EXP_NULL;
2474                         break;
2475
2476                 case V_028714_SPI_SHADER_32_R:
2477                         args->enabled_channels = 1;
2478                         args->out[0] = values[0];
2479                         break;
2480
2481                 case V_028714_SPI_SHADER_32_GR:
2482                         args->enabled_channels = 0x3;
2483                         args->out[0] = values[0];
2484                         args->out[1] = values[1];
2485                         break;
2486
2487                 case V_028714_SPI_SHADER_32_AR:
2488                         if (ctx->ac.chip_class >= GFX10) {
2489                                 args->enabled_channels = 0x3;
2490                                 args->out[0] = values[0];
2491                                 args->out[1] = values[3];
2492                         } else {
2493                                 args->enabled_channels = 0x9;
2494                                 args->out[0] = values[0];
2495                                 args->out[3] = values[3];
2496                         }
2497                         break;
2498
2499                 case V_028714_SPI_SHADER_FP16_ABGR:
2500                         args->enabled_channels = 0x5;
2501                         packf = ac_build_cvt_pkrtz_f16;
2502                         if (is_16bit) {
2503                                 for (unsigned chan = 0; chan < 4; chan++)
2504                                         values[chan] = LLVMBuildFPExt(ctx->ac.builder,
2505                                                                       values[chan],
2506                                                                       ctx->ac.f32, "");
2507                         }
2508                         break;
2509
2510                 case V_028714_SPI_SHADER_UNORM16_ABGR:
2511                         args->enabled_channels = 0x5;
2512                         packf = ac_build_cvt_pknorm_u16;
2513                         break;
2514
2515                 case V_028714_SPI_SHADER_SNORM16_ABGR:
2516                         args->enabled_channels = 0x5;
2517                         packf = ac_build_cvt_pknorm_i16;
2518                         break;
2519
2520                 case V_028714_SPI_SHADER_UINT16_ABGR:
2521                         args->enabled_channels = 0x5;
2522                         packi = ac_build_cvt_pk_u16;
2523                         if (is_16bit) {
2524                                 for (unsigned chan = 0; chan < 4; chan++)
2525                                         values[chan] = LLVMBuildZExt(ctx->ac.builder,
2526                                                                       ac_to_integer(&ctx->ac, values[chan]),
2527                                                                       ctx->ac.i32, "");
2528                         }
2529                         break;
2530
2531                 case V_028714_SPI_SHADER_SINT16_ABGR:
2532                         args->enabled_channels = 0x5;
2533                         packi = ac_build_cvt_pk_i16;
2534                         if (is_16bit) {
2535                                 for (unsigned chan = 0; chan < 4; chan++)
2536                                         values[chan] = LLVMBuildSExt(ctx->ac.builder,
2537                                                                       ac_to_integer(&ctx->ac, values[chan]),
2538                                                                       ctx->ac.i32, "");
2539                         }
2540                         break;
2541
2542                 default:
2543                 case V_028714_SPI_SHADER_32_ABGR:
2544                         memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2545                         break;
2546                 }
2547
2548                 /* Pack f16 or norm_i16/u16. */
2549                 if (packf) {
2550                         for (chan = 0; chan < 2; chan++) {
2551                                 LLVMValueRef pack_args[2] = {
2552                                         values[2 * chan],
2553                                         values[2 * chan + 1]
2554                                 };
2555                                 LLVMValueRef packed;
2556
2557                                 packed = packf(&ctx->ac, pack_args);
2558                                 args->out[chan] = ac_to_float(&ctx->ac, packed);
2559                         }
2560                         args->compr = 1; /* COMPR flag */
2561                 }
2562
2563                 /* Pack i16/u16. */
2564                 if (packi) {
2565                         for (chan = 0; chan < 2; chan++) {
2566                                 LLVMValueRef pack_args[2] = {
2567                                         ac_to_integer(&ctx->ac, values[2 * chan]),
2568                                         ac_to_integer(&ctx->ac, values[2 * chan + 1])
2569                                 };
2570                                 LLVMValueRef packed;
2571
2572                                 packed = packi(&ctx->ac, pack_args,
2573                                                is_int8 ? 8 : is_int10 ? 10 : 16,
2574                                                chan == 1);
2575                                 args->out[chan] = ac_to_float(&ctx->ac, packed);
2576                         }
2577                         args->compr = 1; /* COMPR flag */
2578                 }
2579                 return;
2580         }
2581
2582         if (is_16bit) {
2583                 for (unsigned chan = 0; chan < 4; chan++) {
2584                         values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, "");
2585                         args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, "");
2586                 }
2587         } else
2588                 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2589
2590         for (unsigned i = 0; i < 4; ++i)
2591                 args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
2592 }
2593
2594 static void
2595 radv_export_param(struct radv_shader_context *ctx, unsigned index,
2596                   LLVMValueRef *values, unsigned enabled_channels)
2597 {
2598         struct ac_export_args args;
2599
2600         si_llvm_init_export_args(ctx, values, enabled_channels,
2601                                  V_008DFC_SQ_EXP_PARAM + index, &args);
2602         ac_build_export(&ctx->ac, &args);
2603 }
2604
2605 static LLVMValueRef
2606 radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
2607 {
2608         LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)];
2609         return LLVMBuildLoad(ctx->ac.builder, output, "");
2610 }
2611
2612 static void
2613 radv_emit_stream_output(struct radv_shader_context *ctx,
2614                          LLVMValueRef const *so_buffers,
2615                          LLVMValueRef const *so_write_offsets,
2616                          const struct radv_stream_output *output,
2617                          struct radv_shader_output_values *shader_out)
2618 {
2619         unsigned num_comps = util_bitcount(output->component_mask);
2620         unsigned buf = output->buffer;
2621         unsigned offset = output->offset;
2622         unsigned start;
2623         LLVMValueRef out[4];
2624
2625         assert(num_comps && num_comps <= 4);
2626         if (!num_comps || num_comps > 4)
2627                 return;
2628
2629         /* Get the first component. */
2630         start = ffs(output->component_mask) - 1;
2631
2632         /* Load the output as int. */
2633         for (int i = 0; i < num_comps; i++) {
2634                 out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
2635         }
2636
2637         /* Pack the output. */
2638         LLVMValueRef vdata = NULL;
2639
2640         switch (num_comps) {
2641         case 1: /* as i32 */
2642                 vdata = out[0];
2643                 break;
2644         case 2: /* as v2i32 */
2645         case 3: /* as v4i32 (aligned to 4) */
2646                 out[3] = LLVMGetUndef(ctx->ac.i32);
2647                 /* fall through */
2648         case 4: /* as v4i32 */
2649                 vdata = ac_build_gather_values(&ctx->ac, out,
2650                                                !ac_has_vec3_support(ctx->ac.chip_class, false) ?
2651                                                util_next_power_of_two(num_comps) :
2652                                                num_comps);
2653                 break;
2654         }
2655
2656         ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf],
2657                                     vdata, num_comps, so_write_offsets[buf],
2658                                     ctx->ac.i32_0, offset,
2659                                     ac_glc | ac_slc, false);
2660 }
2661
2662 static void
2663 radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
2664 {
2665         struct ac_build_if_state if_ctx;
2666         int i;
2667
2668         /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2669         assert(ctx->streamout_config);
2670         LLVMValueRef so_vtx_count =
2671                 ac_build_bfe(&ctx->ac, ctx->streamout_config,
2672                              LLVMConstInt(ctx->ac.i32, 16, false),
2673                              LLVMConstInt(ctx->ac.i32, 7, false), false);
2674
2675         LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2676
2677         /* can_emit = tid < so_vtx_count; */
2678         LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
2679                                               tid, so_vtx_count, "");
2680
2681         /* Emit the streamout code conditionally. This actually avoids
2682          * out-of-bounds buffer access. The hw tells us via the SGPR
2683          * (so_vtx_count) which threads are allowed to emit streamout data.
2684          */
2685         ac_nir_build_if(&if_ctx, ctx, can_emit);
2686         {
2687                 /* The buffer offset is computed as follows:
2688                  *   ByteOffset = streamout_offset[buffer_id]*4 +
2689                  *                (streamout_write_index + thread_id)*stride[buffer_id] +
2690                  *                attrib_offset
2691                  */
2692                 LLVMValueRef so_write_index = ctx->streamout_write_idx;
2693
2694                 /* Compute (streamout_write_index + thread_id). */
2695                 so_write_index =
2696                         LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");
2697
2698                 /* Load the descriptor and compute the write offset for each
2699                  * enabled buffer.
2700                  */
2701                 LLVMValueRef so_write_offset[4] = {};
2702                 LLVMValueRef so_buffers[4] = {};
2703                 LLVMValueRef buf_ptr = ctx->streamout_buffers;
2704
2705                 for (i = 0; i < 4; i++) {
2706                         uint16_t stride = ctx->shader_info->info.so.strides[i];
2707
2708                         if (!stride)
2709                                 continue;
2710
2711                         LLVMValueRef offset =
2712                                 LLVMConstInt(ctx->ac.i32, i, false);
2713
2714                         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac,
2715                                                               buf_ptr, offset);
2716
2717                         LLVMValueRef so_offset = ctx->streamout_offset[i];
2718
2719                         so_offset = LLVMBuildMul(ctx->ac.builder, so_offset,
2720                                                  LLVMConstInt(ctx->ac.i32, 4, false), "");
2721
2722                         so_write_offset[i] =
2723                                 ac_build_imad(&ctx->ac, so_write_index,
2724                                               LLVMConstInt(ctx->ac.i32,
2725                                                            stride * 4, false),
2726                                               so_offset);
2727                 }
2728
2729                 /* Write streamout data. */
2730                 for (i = 0; i < ctx->shader_info->info.so.num_outputs; i++) {
2731                         struct radv_shader_output_values shader_out = {};
2732                         struct radv_stream_output *output =
2733                                 &ctx->shader_info->info.so.outputs[i];
2734
2735                         if (stream != output->stream)
2736                                 continue;
2737
2738                         for (int j = 0; j < 4; j++) {
2739                                 shader_out.values[j] =
2740                                         radv_load_output(ctx, output->location, j);
2741                         }
2742
2743                         radv_emit_stream_output(ctx, so_buffers,so_write_offset,
2744                                                 output, &shader_out);
2745                 }
2746         }
2747         ac_nir_build_endif(&if_ctx);
2748 }
2749
2750 static void
2751 radv_build_param_exports(struct radv_shader_context *ctx,
2752                          struct radv_shader_output_values *outputs,
2753                          unsigned noutput,
2754                          struct radv_vs_output_info *outinfo,
2755                          bool export_clip_dists)
2756 {
2757         unsigned param_count = 0;
2758
2759         for (unsigned i = 0; i < noutput; i++) {
2760                 unsigned slot_name = outputs[i].slot_name;
2761                 unsigned usage_mask = outputs[i].usage_mask;
2762
2763                 if (slot_name != VARYING_SLOT_LAYER &&
2764                     slot_name != VARYING_SLOT_PRIMITIVE_ID &&
2765                     slot_name != VARYING_SLOT_CLIP_DIST0 &&
2766                     slot_name != VARYING_SLOT_CLIP_DIST1 &&
2767                     slot_name < VARYING_SLOT_VAR0)
2768                         continue;
2769
2770                 if ((slot_name == VARYING_SLOT_CLIP_DIST0 ||
2771                      slot_name == VARYING_SLOT_CLIP_DIST1) && !export_clip_dists)
2772                         continue;
2773
2774                 radv_export_param(ctx, param_count, outputs[i].values, usage_mask);
2775
2776                 assert(i < ARRAY_SIZE(outinfo->vs_output_param_offset));
2777                 outinfo->vs_output_param_offset[slot_name] = param_count++;
2778         }
2779
2780         outinfo->param_exports = param_count;
2781 }
2782
2783 /* Generate export instructions for hardware VS shader stage or NGG GS stage
2784  * (position and parameter data only).
2785  */
2786 static void
2787 radv_llvm_export_vs(struct radv_shader_context *ctx,
2788                     struct radv_shader_output_values *outputs,
2789                     unsigned noutput,
2790                     struct radv_vs_output_info *outinfo,
2791                     bool export_clip_dists)
2792 {
2793         LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL;
2794         struct ac_export_args pos_args[4] = {};
2795         unsigned pos_idx, index;
2796         int i;
2797
2798         /* Build position exports */
2799         for (i = 0; i < noutput; i++) {
2800                 switch (outputs[i].slot_name) {
2801                 case VARYING_SLOT_POS:
2802                         si_llvm_init_export_args(ctx, outputs[i].values, 0xf,
2803                                                  V_008DFC_SQ_EXP_POS, &pos_args[0]);
2804                         break;
2805                 case VARYING_SLOT_PSIZ:
2806                         psize_value = outputs[i].values[0];
2807                         break;
2808                 case VARYING_SLOT_LAYER:
2809                         layer_value = outputs[i].values[0];
2810                         break;
2811                 case VARYING_SLOT_VIEWPORT:
2812                         viewport_value = outputs[i].values[0];
2813                         break;
2814                 case VARYING_SLOT_CLIP_DIST0:
2815                 case VARYING_SLOT_CLIP_DIST1:
2816                         index = 2 + outputs[i].slot_index;
2817                         si_llvm_init_export_args(ctx, outputs[i].values, 0xf,
2818                                                  V_008DFC_SQ_EXP_POS + index,
2819                                                  &pos_args[index]);
2820                         break;
2821                 default:
2822                         break;
2823                 }
2824         }
2825
2826         /* We need to add the position output manually if it's missing. */
2827         if (!pos_args[0].out[0]) {
2828                 pos_args[0].enabled_channels = 0xf; /* writemask */
2829                 pos_args[0].valid_mask = 0; /* EXEC mask */
2830                 pos_args[0].done = 0; /* last export? */
2831                 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2832                 pos_args[0].compr = 0; /* COMPR flag */
2833                 pos_args[0].out[0] = ctx->ac.f32_0; /* X */
2834                 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
2835                 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
2836                 pos_args[0].out[3] = ctx->ac.f32_1;  /* W */
2837         }
2838
2839         if (outinfo->writes_pointsize ||
2840             outinfo->writes_layer ||
2841             outinfo->writes_viewport_index) {
2842                 pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
2843                                                 (outinfo->writes_layer == true ? 4 : 0));
2844                 pos_args[1].valid_mask = 0;
2845                 pos_args[1].done = 0;
2846                 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2847                 pos_args[1].compr = 0;
2848                 pos_args[1].out[0] = ctx->ac.f32_0; /* X */
2849                 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
2850                 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
2851                 pos_args[1].out[3] = ctx->ac.f32_0;  /* W */
2852
2853                 if (outinfo->writes_pointsize == true)
2854                         pos_args[1].out[0] = psize_value;
2855                 if (outinfo->writes_layer == true)
2856                         pos_args[1].out[2] = layer_value;
2857                 if (outinfo->writes_viewport_index == true) {
2858                         if (ctx->options->chip_class >= GFX9) {
2859                                 /* GFX9 has the layer in out.z[10:0] and the viewport
2860                                  * index in out.z[19:16].
2861                                  */
2862                                 LLVMValueRef v = viewport_value;
2863                                 v = ac_to_integer(&ctx->ac, v);
2864                                 v = LLVMBuildShl(ctx->ac.builder, v,
2865                                                  LLVMConstInt(ctx->ac.i32, 16, false),
2866                                                  "");
2867                                 v = LLVMBuildOr(ctx->ac.builder, v,
2868                                                 ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
2869
2870                                 pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
2871                                 pos_args[1].enabled_channels |= 1 << 2;
2872                         } else {
2873                                 pos_args[1].out[3] = viewport_value;
2874                                 pos_args[1].enabled_channels |= 1 << 3;
2875                         }
2876                 }
2877         }
2878
2879         for (i = 0; i < 4; i++) {
2880                 if (pos_args[i].out[0])
2881                         outinfo->pos_exports++;
2882         }
2883
2884         /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
2885          * Setting valid_mask=1 prevents it and has no other effect.
2886          */
2887         if (ctx->ac.family == CHIP_NAVI10 ||
2888             ctx->ac.family == CHIP_NAVI12 ||
2889             ctx->ac.family == CHIP_NAVI14)
2890                 pos_args[0].valid_mask = 1;
2891
2892         pos_idx = 0;
2893         for (i = 0; i < 4; i++) {
2894                 if (!pos_args[i].out[0])
2895                         continue;
2896
2897                 /* Specify the target we are exporting */
2898                 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2899
2900                 if (pos_idx == outinfo->pos_exports)
2901                         /* Specify that this is the last export */
2902                         pos_args[i].done = 1;
2903
2904                 ac_build_export(&ctx->ac, &pos_args[i]);
2905         }
2906
2907         /* Build parameter exports */
2908         radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists);
2909 }
2910
2911 static void
2912 handle_vs_outputs_post(struct radv_shader_context *ctx,
2913                        bool export_prim_id,
2914                        bool export_clip_dists,
2915                        struct radv_vs_output_info *outinfo)
2916 {
2917         struct radv_shader_output_values *outputs;
2918         unsigned noutput = 0;
2919
2920         if (ctx->options->key.has_multiview_view_index) {
2921                 LLVMValueRef* tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
2922                 if(!*tmp_out) {
2923                         for(unsigned i = 0; i < 4; ++i)
2924                                 ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] =
2925                                             ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
2926                 }
2927
2928                 LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, ctx->abi.view_index),  *tmp_out);
2929                 ctx->output_mask |= 1ull << VARYING_SLOT_LAYER;
2930         }
2931
2932         memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
2933                sizeof(outinfo->vs_output_param_offset));
2934         outinfo->pos_exports = 0;
2935
2936         if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) {
2937                 outinfo->writes_pointsize = true;
2938         }
2939
2940         if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) {
2941                 outinfo->writes_layer = true;
2942         }
2943
2944         if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) {
2945                 outinfo->writes_viewport_index = true;
2946         }
2947
2948         if (ctx->shader_info->info.so.num_outputs &&
2949             !ctx->is_gs_copy_shader) {
2950                 /* The GS copy shader emission already emits streamout. */
2951                 radv_emit_streamout(ctx, 0);
2952         }
2953
2954         /* Allocate a temporary array for the output values. */
2955         unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_prim_id;
2956         outputs = malloc(num_outputs * sizeof(outputs[0]));
2957
2958         for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2959                 if (!(ctx->output_mask & (1ull << i)))
2960                         continue;
2961
2962                 outputs[noutput].slot_name = i;
2963                 outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
2964
2965                 if (ctx->stage == MESA_SHADER_VERTEX &&
2966                     !ctx->is_gs_copy_shader) {
2967                         outputs[noutput].usage_mask =
2968                                 ctx->shader_info->info.vs.output_usage_mask[i];
2969                 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
2970                         outputs[noutput].usage_mask =
2971                                 ctx->shader_info->info.tes.output_usage_mask[i];
2972                 } else {
2973                         assert(ctx->is_gs_copy_shader);
2974                         outputs[noutput].usage_mask =
2975                                 ctx->shader_info->info.gs.output_usage_mask[i];
2976                 }
2977
2978                 for (unsigned j = 0; j < 4; j++) {
2979                         outputs[noutput].values[j] =
2980                                 ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
2981                 }
2982
2983                 noutput++;
2984         }
2985
2986         /* Export PrimitiveID. */
2987         if (export_prim_id) {
2988                 outinfo->export_prim_id = true;
2989
2990                 outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID;
2991                 outputs[noutput].slot_index = 0;
2992                 outputs[noutput].usage_mask = 0x1;
2993                 outputs[noutput].values[0] = ctx->vs_prim_id;
2994                 for (unsigned j = 1; j < 4; j++)
2995                         outputs[noutput].values[j] = ctx->ac.f32_0;
2996                 noutput++;
2997         }
2998
2999         radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists);
3000
3001         free(outputs);
3002 }
3003
3004 static void
3005 handle_es_outputs_post(struct radv_shader_context *ctx,
3006                        struct radv_es_output_info *outinfo)
3007 {
3008         int j;
3009         uint64_t max_output_written = 0;
3010         LLVMValueRef lds_base = NULL;
3011
3012         for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
3013                 int param_index;
3014
3015                 if (!(ctx->output_mask & (1ull << i)))
3016                         continue;
3017
3018                 param_index = shader_io_get_unique_index(i);
3019
3020                 max_output_written = MAX2(param_index, max_output_written);
3021         }
3022
3023         outinfo->esgs_itemsize = (max_output_written + 1) * 16;
3024
3025         if (ctx->ac.chip_class  >= GFX9) {
3026                 unsigned itemsize_dw = outinfo->esgs_itemsize / 4;
3027                 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
3028                 LLVMValueRef wave_idx = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4);
3029                 vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
3030                                          LLVMBuildMul(ctx->ac.builder, wave_idx,
3031                                                       LLVMConstInt(ctx->ac.i32, 64, false), ""), "");
3032                 lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
3033                                         LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
3034         }
3035
3036         for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
3037                 LLVMValueRef dw_addr = NULL;
3038                 LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4];
3039                 unsigned output_usage_mask;
3040                 int param_index;
3041
3042                 if (!(ctx->output_mask & (1ull << i)))
3043                         continue;
3044
3045                 if (ctx->stage == MESA_SHADER_VERTEX) {
3046                         output_usage_mask =
3047                                 ctx->shader_info->info.vs.output_usage_mask[i];
3048                 } else {
3049                         assert(ctx->stage == MESA_SHADER_TESS_EVAL);
3050                         output_usage_mask =
3051                                 ctx->shader_info->info.tes.output_usage_mask[i];
3052                 }
3053
3054                 param_index = shader_io_get_unique_index(i);
3055
3056                 if (lds_base) {
3057                         dw_addr = LLVMBuildAdd(ctx->ac.builder, lds_base,
3058                                                LLVMConstInt(ctx->ac.i32, param_index * 4, false),
3059                                                "");
3060                 }
3061
3062                 for (j = 0; j < 4; j++) {
3063                         if (!(output_usage_mask & (1 << j)))
3064                                 continue;
3065
3066                         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
3067                         out_val = ac_to_integer(&ctx->ac, out_val);
3068                         out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
3069
3070                         if (ctx->ac.chip_class  >= GFX9) {
3071                                 LLVMValueRef dw_addr_offset =
3072                                         LLVMBuildAdd(ctx->ac.builder, dw_addr,
3073                                                      LLVMConstInt(ctx->ac.i32,
3074                                                                   j, false), "");
3075
3076                                 ac_lds_store(&ctx->ac, dw_addr_offset, out_val);
3077                         } else {
3078                                 ac_build_buffer_store_dword(&ctx->ac,
3079                                                             ctx->esgs_ring,
3080                                                             out_val, 1,
3081                                                             NULL, ctx->es2gs_offset,
3082                                                             (4 * param_index + j) * 4,
3083                                                             ac_glc | ac_slc, true);
3084                         }
3085                 }
3086         }
3087 }
3088
3089 static void
3090 handle_ls_outputs_post(struct radv_shader_context *ctx)
3091 {
3092         LLVMValueRef vertex_id = ctx->rel_auto_id;
3093         uint32_t num_tcs_inputs = util_last_bit64(ctx->shader_info->info.vs.ls_outputs_written);
3094         LLVMValueRef vertex_dw_stride = LLVMConstInt(ctx->ac.i32, num_tcs_inputs * 4, false);
3095         LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
3096                                                  vertex_dw_stride, "");
3097
3098         for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
3099                 LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4];
3100
3101                 if (!(ctx->output_mask & (1ull << i)))
3102                         continue;
3103
3104                 int param = shader_io_get_unique_index(i);
3105                 LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
3106                                                     LLVMConstInt(ctx->ac.i32, param * 4, false),
3107                                                     "");
3108                 for (unsigned j = 0; j < 4; j++) {
3109                         LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
3110                         value = ac_to_integer(&ctx->ac, value);
3111                         value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
3112                         ac_lds_store(&ctx->ac, dw_addr, value);
3113                         dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, ctx->ac.i32_1, "");
3114                 }
3115         }
3116 }
3117
3118 static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx)
3119 {
3120         return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4);
3121 }
3122
3123 static LLVMValueRef get_tgsize(struct radv_shader_context *ctx)
3124 {
3125         return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 28, 4);
3126 }
3127
3128 static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx)
3129 {
3130         LLVMBuilderRef builder = ctx->ac.builder;
3131         LLVMValueRef tmp;
3132         tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
3133                            LLVMConstInt(ctx->ac.i32, 64, false), "");
3134         return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
3135 }
3136
3137 static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx)
3138 {
3139         return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
3140                             LLVMConstInt(ctx->ac.i32, 12, false),
3141                             LLVMConstInt(ctx->ac.i32, 9, false),
3142                             false);
3143 }
3144
3145 static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx)
3146 {
3147         return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
3148                             LLVMConstInt(ctx->ac.i32, 22, false),
3149                             LLVMConstInt(ctx->ac.i32, 9, false),
3150                             false);
3151 }
3152
3153 static LLVMValueRef
3154 ngg_gs_get_vertex_storage(struct radv_shader_context *ctx)
3155 {
3156         unsigned num_outputs = util_bitcount64(ctx->output_mask);
3157
3158         LLVMTypeRef elements[2] = {
3159                 LLVMArrayType(ctx->ac.i32, 4 * num_outputs),
3160                 LLVMArrayType(ctx->ac.i8, 4),
3161         };
3162         LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
3163         type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
3164         return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
3165 }
3166
3167 /**
3168  * Return a pointer to the LDS storage reserved for the N'th vertex, where N
3169  * is in emit order; that is:
3170  * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
3171  * - during vertex emit, i.e. while the API GS shader invocation is running,
3172  *   N = threadidx * gs_max_out_vertices + emitidx
3173  *
3174  * Goals of the LDS memory layout:
3175  * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
3176  *    in uniform control flow
3177  * 2. Eliminate bank conflicts on read for export if, additionally, there is no
3178  *    culling
3179  * 3. Agnostic to the number of waves (since we don't know it before compiling)
3180  * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
3181  * 5. Avoid wasting memory.
3182  *
3183  * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
3184  * layout, elimination of bank conflicts requires that each vertex occupy an
3185  * odd number of dwords. We use the additional dword to store the output stream
3186  * index as well as a flag to indicate whether this vertex ends a primitive
3187  * for rasterization.
3188  *
3189  * Swizzling is required to satisfy points 1 and 2 simultaneously.
3190  *
3191  * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
3192  * Indices are swizzled in groups of 32, which ensures point 1 without
3193  * disturbing point 2.
3194  *
3195  * \return an LDS pointer to type {[N x i32], [4 x i8]}
3196  */
3197 static LLVMValueRef
3198 ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx)
3199 {
3200         LLVMBuilderRef builder = ctx->ac.builder;
3201         LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
3202
3203         /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
3204         unsigned write_stride_2exp = ffs(ctx->gs_max_out_vertices) - 1;
3205         if (write_stride_2exp) {
3206                 LLVMValueRef row =
3207                         LLVMBuildLShr(builder, vertexidx,
3208                                       LLVMConstInt(ctx->ac.i32, 5, false), "");
3209                 LLVMValueRef swizzle =
3210                         LLVMBuildAnd(builder, row,
3211                                      LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1,
3212                                                   false), "");
3213                 vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
3214         }
3215
3216         return ac_build_gep0(&ctx->ac, storage, vertexidx);
3217 }
3218
3219 static LLVMValueRef
3220 ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread,
3221                        LLVMValueRef emitidx)
3222 {
3223         LLVMBuilderRef builder = ctx->ac.builder;
3224         LLVMValueRef tmp;
3225
3226         tmp = LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false);
3227         tmp = LLVMBuildMul(builder, tmp, gsthread, "");
3228         const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
3229         return ngg_gs_vertex_ptr(ctx, vertexidx);
3230 }
3231
3232 /* Send GS Alloc Req message from the first wave of the group to SPI.
3233  * Message payload is:
3234  * - bits 0..10: vertices in group
3235  * - bits 12..22: primitives in group
3236  */
3237 static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx,
3238                                        LLVMValueRef vtx_cnt,
3239                                        LLVMValueRef prim_cnt)
3240 {
3241         LLVMBuilderRef builder = ctx->ac.builder;
3242         LLVMValueRef tmp;
3243
3244         tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
3245         ac_build_ifcc(&ctx->ac, tmp, 5020);
3246
3247         tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),"");
3248         tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
3249         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp);
3250
3251         ac_build_endif(&ctx->ac, 5020);
3252 }
3253
3254 struct ngg_prim {
3255         unsigned num_vertices;
3256         LLVMValueRef isnull;
3257         LLVMValueRef index[3];
3258         LLVMValueRef edgeflag[3];
3259 };
3260
3261 static void build_export_prim(struct radv_shader_context *ctx,
3262                               const struct ngg_prim *prim)
3263 {
3264         LLVMBuilderRef builder = ctx->ac.builder;
3265         struct ac_export_args args;
3266         LLVMValueRef tmp;
3267
3268         tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, "");
3269         args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), "");
3270
3271         for (unsigned i = 0; i < prim->num_vertices; ++i) {
3272                 tmp = LLVMBuildShl(builder, prim->index[i],
3273                                    LLVMConstInt(ctx->ac.i32, 10 * i, false), "");
3274                 args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
3275                 tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, "");
3276                 tmp = LLVMBuildShl(builder, tmp,
3277                                    LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), "");
3278                 args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
3279         }
3280
3281         args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, "");
3282         args.out[1] = LLVMGetUndef(ctx->ac.f32);
3283         args.out[2] = LLVMGetUndef(ctx->ac.f32);
3284         args.out[3] = LLVMGetUndef(ctx->ac.f32);
3285
3286         args.target = V_008DFC_SQ_EXP_PRIM;
3287         args.enabled_channels = 1;
3288         args.done = true;
3289         args.valid_mask = false;
3290         args.compr = false;
3291
3292         ac_build_export(&ctx->ac, &args);
3293 }
3294
3295 static void
3296 handle_ngg_outputs_post(struct radv_shader_context *ctx)
3297 {
3298         LLVMBuilderRef builder = ctx->ac.builder;
3299         struct ac_build_if_state if_state;
3300         unsigned num_vertices = 3;
3301         LLVMValueRef tmp;
3302
3303         assert((ctx->stage == MESA_SHADER_VERTEX ||
3304                 ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->is_gs_copy_shader);
3305
3306         LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8);
3307         LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 0, 8);
3308         LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT,
3309                                                   ac_get_thread_id(&ctx->ac), prims_in_wave, "");
3310         LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT,
3311                                                   ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
3312         LLVMValueRef vtxindex[] = {
3313                 ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 0, 16),
3314                 ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 16, 16),
3315                 ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16),
3316         };
3317
3318         /* TODO: streamout */
3319
3320         /* Copy Primitive IDs from GS threads to the LDS address corresponding
3321          * to the ES thread of the provoking vertex.
3322          */
3323         if (ctx->stage == MESA_SHADER_VERTEX &&
3324             ctx->options->key.vs_common_out.export_prim_id) {
3325                 /* TODO: streamout */
3326
3327                 ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
3328                 /* Extract the PROVOKING_VTX_INDEX field. */
3329                 LLVMValueRef provoking_vtx_in_prim =
3330                         LLVMConstInt(ctx->ac.i32, 0, false);
3331
3332                 /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
3333                 LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
3334                 LLVMValueRef provoking_vtx_index =
3335                         LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
3336
3337                 LLVMBuildStore(builder, ctx->abi.gs_prim_id,
3338                                ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index));
3339                 ac_build_endif(&ctx->ac, 5400);
3340         }
3341
3342         /* TODO: primitive culling */
3343
3344         build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
3345
3346         /* TODO: streamout queries */
3347         /* Export primitive data to the index buffer. Format is:
3348          *  - bits 0..8: index 0
3349          *  - bit 9: edge flag 0
3350          *  - bits 10..18: index 1
3351          *  - bit 19: edge flag 1
3352          *  - bits 20..28: index 2
3353          *  - bit 29: edge flag 2
3354          *  - bit 31: null primitive (skip)
3355          *
3356          * For the first version, we will always build up all three indices
3357          * independent of the primitive type. The additional garbage data
3358          * shouldn't hurt.
3359          *
3360          * TODO: culling depends on the primitive type, so can have some
3361          * interaction here.
3362          */
3363         ac_nir_build_if(&if_state, ctx, is_gs_thread);
3364         {
3365                 struct ngg_prim prim = {};
3366
3367                 prim.num_vertices = num_vertices;
3368                 prim.isnull = ctx->ac.i1false;
3369                 memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
3370
3371                 for (unsigned i = 0; i < num_vertices; ++i) {
3372                         tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id,
3373                                             LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
3374                         prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
3375                 }
3376
3377                 build_export_prim(ctx, &prim);
3378         }
3379         ac_nir_build_endif(&if_state);
3380
3381         /* Export per-vertex data (positions and parameters). */
3382         ac_nir_build_if(&if_state, ctx, is_es_thread);
3383         {
3384                 struct radv_vs_output_info *outinfo =
3385                         ctx->stage == MESA_SHADER_TESS_EVAL ? &ctx->shader_info->tes.outinfo : &ctx->shader_info->vs.outinfo;
3386
3387                 /* Exporting the primitive ID is handled below. */
3388                 /* TODO: use the new VS export path */
3389                 handle_vs_outputs_post(ctx, false,
3390                                        ctx->options->key.vs_common_out.export_clip_dists,
3391                                        outinfo);
3392
3393                 if (ctx->options->key.vs_common_out.export_prim_id) {
3394                         unsigned param_count = outinfo->param_exports;
3395                         LLVMValueRef values[4];
3396
3397                         if (ctx->stage == MESA_SHADER_VERTEX) {
3398                                 /* Wait for GS stores to finish. */
3399                                 ac_build_s_barrier(&ctx->ac);
3400
3401                                 tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring,
3402                                                     get_thread_id_in_tg(ctx));
3403                                 values[0] = LLVMBuildLoad(builder, tmp, "");
3404                         } else {
3405                                 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
3406                                 values[0] = ctx->abi.tes_patch_id;
3407                         }
3408
3409                         values[0] = ac_to_float(&ctx->ac, values[0]);
3410                         for (unsigned j = 1; j < 4; j++)
3411                                 values[j] = ctx->ac.f32_0;
3412
3413                         radv_export_param(ctx, param_count, values, 0x1);
3414
3415                         outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count++;
3416                         outinfo->export_prim_id = true;
3417                         outinfo->param_exports = param_count;
3418                 }
3419         }
3420         ac_nir_build_endif(&if_state);
3421 }
3422
3423 static void gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
3424 {
3425         /* Zero out the part of LDS scratch that is used to accumulate the
3426          * per-stream generated primitive count.
3427          */
3428         LLVMBuilderRef builder = ctx->ac.builder;
3429         LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
3430         LLVMValueRef tid = get_thread_id_in_tg(ctx);
3431         LLVMBasicBlockRef merge_block;
3432         LLVMValueRef cond;
3433
3434         LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder));
3435         LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
3436         merge_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
3437
3438         cond = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
3439         LLVMBuildCondBr(ctx->ac.builder, cond, then_block, merge_block);
3440         LLVMPositionBuilderAtEnd(ctx->ac.builder, then_block);
3441
3442         LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
3443         LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
3444
3445         LLVMBuildBr(ctx->ac.builder, merge_block);
3446         LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);
3447
3448         ac_build_s_barrier(&ctx->ac);
3449 }
3450
3451 static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx)
3452 {
3453         LLVMBuilderRef builder = ctx->ac.builder;
3454         LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
3455         LLVMValueRef tmp;
3456
3457         /* Zero out remaining (non-emitted) primitive flags.
3458          *
3459          * Note: Alternatively, we could pass the relevant gs_next_vertex to
3460          *       the emit threads via LDS. This is likely worse in the expected
3461          *       typical case where each GS thread emits the full set of
3462          *       vertices.
3463          */
3464         for (unsigned stream = 0; stream < 4; ++stream) {
3465                 unsigned num_components;
3466
3467                 num_components =
3468                         ctx->shader_info->info.gs.num_stream_output_components[stream];
3469                 if (!num_components)
3470                         continue;
3471
3472                 const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
3473
3474                 ac_build_bgnloop(&ctx->ac, 5100);
3475
3476                 const LLVMValueRef vertexidx =
3477                         LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
3478                 tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
3479                         LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), "");
3480                 ac_build_ifcc(&ctx->ac, tmp, 5101);
3481                 ac_build_break(&ctx->ac);
3482                 ac_build_endif(&ctx->ac, 5101);
3483
3484                 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
3485                 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
3486
3487                 tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
3488                 LLVMValueRef gep_idx[3] = {
3489                         ctx->ac.i32_0, /* implied C-style array */
3490                         ctx->ac.i32_1, /* second entry of struct */
3491                         LLVMConstInt(ctx->ac.i32, stream, false),
3492                 };
3493                 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
3494                 LLVMBuildStore(builder, i8_0, tmp);
3495
3496                 ac_build_endloop(&ctx->ac, 5100);
3497         }
3498 }
3499
3500 static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
3501 {
3502         const unsigned verts_per_prim = si_conv_gl_prim_to_vertices(ctx->gs_output_prim);
3503         LLVMBuilderRef builder = ctx->ac.builder;
3504         LLVMValueRef tmp, tmp2;
3505
3506         ac_build_s_barrier(&ctx->ac);
3507
3508         const LLVMValueRef tid = get_thread_id_in_tg(ctx);
3509         LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
3510
3511         /* TODO: streamout */
3512
3513         /* TODO: culling */
3514
3515         /* Determine vertex liveness. */
3516         LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
3517
3518         tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
3519         ac_build_ifcc(&ctx->ac, tmp, 5120);
3520         {
3521                 for (unsigned i = 0; i < verts_per_prim; ++i) {
3522                         const LLVMValueRef primidx =
3523                                 LLVMBuildAdd(builder, tid,
3524                                              LLVMConstInt(ctx->ac.i32, i, false), "");
3525
3526                         if (i > 0) {
3527                                 tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
3528                                 ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
3529                         }
3530
3531                         /* Load primitive liveness */
3532                         tmp = ngg_gs_vertex_ptr(ctx, primidx);
3533                         LLVMValueRef gep_idx[3] = {
3534                                 ctx->ac.i32_0, /* implicit C-style array */
3535                                 ctx->ac.i32_1, /* second value of struct */
3536                                 ctx->ac.i32_0, /* stream 0 */
3537                         };
3538                         tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
3539                         tmp = LLVMBuildLoad(builder, tmp, "");
3540                         const LLVMValueRef primlive =
3541                                 LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
3542
3543                         tmp = LLVMBuildLoad(builder, vertliveptr, "");
3544                         tmp = LLVMBuildOr(builder, tmp, primlive, ""),
3545                         LLVMBuildStore(builder, tmp, vertliveptr);
3546
3547                         if (i > 0)
3548                                 ac_build_endif(&ctx->ac, 5121 + i);
3549                 }
3550         }
3551         ac_build_endif(&ctx->ac, 5120);
3552
3553         /* Inclusive scan addition across the current wave. */
3554         LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
3555         struct ac_wg_scan vertlive_scan = {};
3556         vertlive_scan.op = nir_op_iadd;
3557         vertlive_scan.enable_reduce = true;
3558         vertlive_scan.enable_exclusive = true;
3559         vertlive_scan.src = vertlive;
3560         vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
3561         vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
3562         vertlive_scan.numwaves = get_tgsize(ctx);
3563         vertlive_scan.maxwaves = 8;
3564
3565         ac_build_wg_scan(&ctx->ac, &vertlive_scan);
3566
3567         /* Skip all exports (including index exports) when possible. At least on
3568          * early gfx10 revisions this is also to avoid hangs.
3569          */
3570         LLVMValueRef have_exports =
3571                 LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
3572         num_emit_threads =
3573                 LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
3574
3575         /* Allocate export space. Send this message as early as possible, to
3576          * hide the latency of the SQ <-> SPI roundtrip.
3577          *
3578          * Note: We could consider compacting primitives for export as well.
3579          *       PA processes 1 non-null prim / clock, but it fetches 4 DW of
3580          *       prim data per clock and skips null primitives at no additional
3581          *       cost. So compacting primitives can only be beneficial when
3582          *       there are 4 or more contiguous null primitives in the export
3583          *       (in the common case of single-dword prim exports).
3584          */
3585         build_sendmsg_gs_alloc_req(ctx, vertlive_scan.result_reduce, num_emit_threads);
3586
3587         /* Setup the reverse vertex compaction permutation. We re-use stream 1
3588          * of the primitive liveness flags, relying on the fact that each
3589          * threadgroup can have at most 256 threads. */
3590         ac_build_ifcc(&ctx->ac, vertlive, 5130);
3591         {
3592                 tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
3593                 LLVMValueRef gep_idx[3] = {
3594                         ctx->ac.i32_0, /* implicit C-style array */
3595                         ctx->ac.i32_1, /* second value of struct */
3596                         ctx->ac.i32_1, /* stream 1 */
3597                 };
3598                 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
3599                 tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
3600                 LLVMBuildStore(builder, tmp2, tmp);
3601         }
3602         ac_build_endif(&ctx->ac, 5130);
3603
3604         ac_build_s_barrier(&ctx->ac);
3605
3606         /* Export primitive data */
3607         tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
3608         ac_build_ifcc(&ctx->ac, tmp, 5140);
3609         {
3610                 struct ngg_prim prim = {};
3611                 prim.num_vertices = verts_per_prim;
3612
3613                 tmp = ngg_gs_vertex_ptr(ctx, tid);
3614                 LLVMValueRef gep_idx[3] = {
3615                         ctx->ac.i32_0, /* implicit C-style array */
3616                         ctx->ac.i32_1, /* second value of struct */
3617                         ctx->ac.i32_0, /* primflag */
3618                 };
3619                 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
3620                 tmp = LLVMBuildLoad(builder, tmp, "");
3621                 prim.isnull = LLVMBuildICmp(builder, LLVMIntEQ, tmp,
3622                                             LLVMConstInt(ctx->ac.i8, 0, false), "");
3623
3624                 for (unsigned i = 0; i < verts_per_prim; ++i) {
3625                         prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
3626                                 LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
3627                         prim.edgeflag[i] = ctx->ac.i1false;
3628                 }
3629
3630                 build_export_prim(ctx, &prim);
3631         }
3632         ac_build_endif(&ctx->ac, 5140);
3633
3634         /* Export position and parameter data */
3635         tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
3636         ac_build_ifcc(&ctx->ac, tmp, 5145);
3637         {
3638                 struct radv_vs_output_info *outinfo = &ctx->shader_info->vs.outinfo;
3639                 bool export_view_index = ctx->options->key.has_multiview_view_index;
3640                 struct radv_shader_output_values *outputs;
3641                 unsigned noutput = 0;
3642
3643                 /* Allocate a temporary array for the output values. */
3644                 unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index;
3645                 outputs = calloc(num_outputs, sizeof(outputs[0]));
3646
3647                 memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
3648                        sizeof(outinfo->vs_output_param_offset));
3649                 outinfo->pos_exports = 0;
3650
3651                 tmp = ngg_gs_vertex_ptr(ctx, tid);
3652                 LLVMValueRef gep_idx[3] = {
3653                         ctx->ac.i32_0, /* implicit C-style array */
3654                         ctx->ac.i32_1, /* second value of struct */
3655                         ctx->ac.i32_1, /* stream 1: source data index */
3656                 };
3657                 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
3658                 tmp = LLVMBuildLoad(builder, tmp, "");
3659                 tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
3660                 const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
3661
3662                 if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) {
3663                         outinfo->writes_pointsize = true;
3664                 }
3665
3666                 if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) {
3667                         outinfo->writes_layer = true;
3668                 }
3669
3670                 if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) {
3671                         outinfo->writes_viewport_index = true;
3672                 }
3673
3674                 unsigned out_idx = 0;
3675                 gep_idx[1] = ctx->ac.i32_0;
3676                 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
3677                         if (!(ctx->output_mask & (1ull << i)))
3678                                 continue;
3679
3680                         outputs[noutput].slot_name = i;
3681                         outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
3682
3683                         outputs[noutput].usage_mask = ctx->shader_info->info.gs.output_usage_mask[i];
3684                         int length = util_last_bit(outputs[noutput].usage_mask);
3685
3686                         for (unsigned j = 0; j < length; j++, out_idx++) {
3687                                 gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false);
3688                                 tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");
3689                                 tmp = LLVMBuildLoad(builder, tmp, "");
3690
3691                                 LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
3692                                 if (ac_get_type_size(type) == 2) {
3693                                         tmp = ac_to_integer(&ctx->ac, tmp);
3694                                         tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i16, "");
3695                                 }
3696
3697                                 outputs[noutput].values[j] = ac_to_float(&ctx->ac, tmp);
3698                         }
3699
3700                         for (unsigned j = length; j < 4; j++)
3701                                 outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);
3702
3703                         noutput++;
3704                 }
3705
3706                 /* Export ViewIndex. */
3707                 if (export_view_index) {
3708                         outinfo->writes_layer = true;
3709
3710                         outputs[noutput].slot_name = VARYING_SLOT_LAYER;
3711                         outputs[noutput].slot_index = 0;
3712                         outputs[noutput].usage_mask = 0x1;
3713                         outputs[noutput].values[0] = ac_to_float(&ctx->ac, ctx->abi.view_index);
3714                         for (unsigned j = 1; j < 4; j++)
3715                                 outputs[noutput].values[j] = ctx->ac.f32_0;
3716                         noutput++;
3717                 }
3718
3719                 radv_llvm_export_vs(ctx, outputs, noutput, outinfo,
3720                                     ctx->options->key.vs_common_out.export_clip_dists);
3721                 FREE(outputs);
3722         }
3723         ac_build_endif(&ctx->ac, 5145);
3724 }
3725
3726 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx,
3727                                      unsigned stream,
3728                                      LLVMValueRef *addrs)
3729 {
3730         LLVMBuilderRef builder = ctx->ac.builder;
3731         LLVMValueRef tmp;
3732         const LLVMValueRef vertexidx =
3733                 LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
3734
3735         /* If this thread has already emitted the declared maximum number of
3736          * vertices, skip the write: excessive vertex emissions are not
3737          * supposed to have any effect.
3738          */
3739         const LLVMValueRef can_emit =
3740                 LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
3741                               LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), "");
3742         ac_build_kill_if_false(&ctx->ac, can_emit);
3743
3744         tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
3745         tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
3746         LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
3747
3748         const LLVMValueRef vertexptr =
3749                 ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
3750         unsigned out_idx = 0;
3751         for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
3752                 unsigned output_usage_mask =
3753                         ctx->shader_info->info.gs.output_usage_mask[i];
3754                 uint8_t output_stream =
3755                         ctx->shader_info->info.gs.output_streams[i];
3756                 LLVMValueRef *out_ptr = &addrs[i * 4];
3757                 int length = util_last_bit(output_usage_mask);
3758
3759                 if (!(ctx->output_mask & (1ull << i)) ||
3760                     output_stream != stream)
3761                         continue;
3762
3763                 for (unsigned j = 0; j < length; j++, out_idx++) {
3764                         if (!(output_usage_mask & (1 << j)))
3765                                 continue;
3766
3767                         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder,
3768                                                              out_ptr[j], "");
3769                         LLVMValueRef gep_idx[3] = {
3770                                 ctx->ac.i32_0, /* implied C-style array */
3771                                 ctx->ac.i32_0, /* first entry of struct */
3772                                 LLVMConstInt(ctx->ac.i32, out_idx, false),
3773                         };
3774                         LLVMValueRef ptr = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");
3775
3776                         out_val = ac_to_integer(&ctx->ac, out_val);
3777                         out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
3778
3779                         LLVMBuildStore(builder, out_val, ptr);
3780                 }
3781         }
3782         assert(out_idx * 4 <= ctx->gsvs_vertex_size);
3783
3784         /* Determine and store whether this vertex completed a primitive. */
3785         const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
3786
3787         tmp = LLVMConstInt(ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->gs_output_prim) - 1, false);
3788         const LLVMValueRef iscompleteprim =
3789                 LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
3790
3791         tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
3792         LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
3793
3794         LLVMValueRef gep_idx[3] = {
3795                 ctx->ac.i32_0, /* implied C-style array */
3796                 ctx->ac.i32_1, /* second struct entry */
3797                 LLVMConstInt(ctx->ac.i32, stream, false),
3798         };
3799         const LLVMValueRef primflagptr =
3800                 LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");
3801
3802         tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
3803         LLVMBuildStore(builder, tmp, primflagptr);
3804
3805         tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
3806         tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
3807         LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
3808 }
3809
3810 static void
3811 write_tess_factors(struct radv_shader_context *ctx)
3812 {
3813         unsigned stride, outer_comps, inner_comps;
3814         struct ac_build_if_state if_ctx, inner_if_ctx;
3815         LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 8, 5);
3816         LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8);
3817         unsigned tess_inner_index = 0, tess_outer_index;
3818         LLVMValueRef lds_base, lds_inner = NULL, lds_outer, byteoffset, buffer;
3819         LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
3820         int i;
3821         ac_emit_barrier(&ctx->ac, ctx->stage);
3822
3823         switch (ctx->options->key.tcs.primitive_mode) {
3824         case GL_ISOLINES:
3825                 stride = 2;
3826                 outer_comps = 2;
3827                 inner_comps = 0;
3828                 break;
3829         case GL_TRIANGLES:
3830                 stride = 4;
3831                 outer_comps = 3;
3832                 inner_comps = 1;
3833                 break;
3834         case GL_QUADS:
3835                 stride = 6;
3836                 outer_comps = 4;
3837                 inner_comps = 2;
3838                 break;
3839         default:
3840                 return;
3841         }
3842
3843         ac_nir_build_if(&if_ctx, ctx,
3844                         LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3845                                       invocation_id, ctx->ac.i32_0, ""));
3846
3847         lds_base = get_tcs_out_current_patch_data_offset(ctx);
3848
3849         if (inner_comps) {
3850                 tess_inner_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
3851                 lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
3852                                          LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, false), "");
3853         }
3854
3855         tess_outer_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
3856         lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
3857                                  LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, false), "");
3858
3859         for (i = 0; i < 4; i++) {
3860                 inner[i] = LLVMGetUndef(ctx->ac.i32);
3861                 outer[i] = LLVMGetUndef(ctx->ac.i32);
3862         }
3863
3864         // LINES reversal
3865         if (ctx->options->key.tcs.primitive_mode == GL_ISOLINES) {
3866                 outer[0] = out[1] = ac_lds_load(&ctx->ac, lds_outer);
3867                 lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer,
3868                                          ctx->ac.i32_1, "");
3869                 outer[1] = out[0] = ac_lds_load(&ctx->ac, lds_outer);
3870         } else {
3871                 for (i = 0; i < outer_comps; i++) {
3872                         outer[i] = out[i] =
3873                                 ac_lds_load(&ctx->ac, lds_outer);
3874                         lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer,
3875                                                  ctx->ac.i32_1, "");
3876                 }
3877                 for (i = 0; i < inner_comps; i++) {
3878                         inner[i] = out[outer_comps+i] =
3879                                 ac_lds_load(&ctx->ac, lds_inner);
3880                         lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_inner,
3881                                                  ctx->ac.i32_1, "");
3882                 }
3883         }
3884
3885         /* Convert the outputs to vectors for stores. */
3886         vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
3887         vec1 = NULL;
3888
3889         if (stride > 4)
3890                 vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);
3891
3892
3893         buffer = ctx->hs_ring_tess_factor;
3894         tf_base = ctx->tess_factor_offset;
3895         byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
3896                                   LLVMConstInt(ctx->ac.i32, 4 * stride, false), "");
3897         unsigned tf_offset = 0;
3898
3899         if (ctx->options->chip_class <= GFX8) {
3900                 ac_nir_build_if(&inner_if_ctx, ctx,
3901                                 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3902                                               rel_patch_id, ctx->ac.i32_0, ""));
3903
3904                 /* Store the dynamic HS control word. */
3905                 ac_build_buffer_store_dword(&ctx->ac, buffer,
3906                                             LLVMConstInt(ctx->ac.i32, 0x80000000, false),
3907                                             1, ctx->ac.i32_0, tf_base,
3908                                             0, ac_glc, false);
3909                 tf_offset += 4;
3910
3911                 ac_nir_build_endif(&inner_if_ctx);
3912         }
3913
3914         /* Store the tessellation factors. */
3915         ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
3916                                     MIN2(stride, 4), byteoffset, tf_base,
3917                                     tf_offset, ac_glc, false);
3918         if (vec1)
3919                 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
3920                                             stride - 4, byteoffset, tf_base,
3921                                             16 + tf_offset, ac_glc, false);
3922
3923         //store to offchip for TES to read - only if TES reads them
3924         if (ctx->options->key.tcs.tes_reads_tess_factors) {
3925                 LLVMValueRef inner_vec, outer_vec, tf_outer_offset;
3926                 LLVMValueRef tf_inner_offset;
3927                 unsigned param_outer, param_inner;
3928
3929                 param_outer = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
3930                 tf_outer_offset = get_tcs_tes_buffer_address(ctx, NULL,
3931                                                              LLVMConstInt(ctx->ac.i32, param_outer, 0));
3932
3933                 outer_vec = ac_build_gather_values(&ctx->ac, outer,
3934                                                    util_next_power_of_two(outer_comps));
3935
3936                 ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, outer_vec,
3937                                             outer_comps, tf_outer_offset,
3938                                             ctx->oc_lds, 0, ac_glc, false);
3939                 if (inner_comps) {
3940                         param_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
3941                         tf_inner_offset = get_tcs_tes_buffer_address(ctx, NULL,
3942                                                                      LLVMConstInt(ctx->ac.i32, param_inner, 0));
3943
3944                         inner_vec = inner_comps == 1 ? inner[0] :
3945                                 ac_build_gather_values(&ctx->ac, inner, inner_comps);
3946                         ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, inner_vec,
3947                                                     inner_comps, tf_inner_offset,
3948                                                     ctx->oc_lds, 0, ac_glc, false);
3949                 }
3950         }
3951         ac_nir_build_endif(&if_ctx);
3952 }
3953
/* Tessellation control shader epilogue: the only work done here is
 * writing the tessellation factors.
 */
static void
handle_tcs_outputs_post(struct radv_shader_context *ctx)
{
        write_tess_factors(ctx);
}
3959
3960 static bool
3961 si_export_mrt_color(struct radv_shader_context *ctx,
3962                     LLVMValueRef *color, unsigned index,
3963                     struct ac_export_args *args)
3964 {
3965         /* Export */
3966         si_llvm_init_export_args(ctx, color, 0xf,
3967                                  V_008DFC_SQ_EXP_MRT + index, args);
3968         if (!args->enabled_channels)
3969                 return false; /* unnecessary NULL export */
3970
3971         return true;
3972 }
3973
3974 static void
3975 radv_export_mrt_z(struct radv_shader_context *ctx,
3976                   LLVMValueRef depth, LLVMValueRef stencil,
3977                   LLVMValueRef samplemask)
3978 {
3979         struct ac_export_args args;
3980
3981         ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
3982
3983         ac_build_export(&ctx->ac, &args);
3984 }
3985
3986 static void
3987 handle_fs_outputs_post(struct radv_shader_context *ctx)
3988 {
3989         unsigned index = 0;
3990         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3991         struct ac_export_args color_args[8];
3992
3993         for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
3994                 LLVMValueRef values[4];
3995
3996                 if (!(ctx->output_mask & (1ull << i)))
3997                         continue;
3998
3999                 if (i < FRAG_RESULT_DATA0)
4000                         continue;
4001
4002                 for (unsigned j = 0; j < 4; j++)
4003                         values[j] = ac_to_float(&ctx->ac,
4004                                                 radv_load_output(ctx, i, j));
4005
4006                 bool ret = si_export_mrt_color(ctx, values,
4007                                                i - FRAG_RESULT_DATA0,
4008                                                &color_args[index]);
4009                 if (ret)
4010                         index++;
4011         }
4012
4013         /* Process depth, stencil, samplemask. */
4014         if (ctx->shader_info->info.ps.writes_z) {
4015                 depth = ac_to_float(&ctx->ac,
4016                                     radv_load_output(ctx, FRAG_RESULT_DEPTH, 0));
4017         }
4018         if (ctx->shader_info->info.ps.writes_stencil) {
4019                 stencil = ac_to_float(&ctx->ac,
4020                                       radv_load_output(ctx, FRAG_RESULT_STENCIL, 0));
4021         }
4022         if (ctx->shader_info->info.ps.writes_sample_mask) {
4023                 samplemask = ac_to_float(&ctx->ac,
4024                                          radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0));
4025         }
4026
4027         /* Set the DONE bit on last non-null color export only if Z isn't
4028          * exported.
4029          */
4030         if (index > 0 &&
4031             !ctx->shader_info->info.ps.writes_z &&
4032             !ctx->shader_info->info.ps.writes_stencil &&
4033             !ctx->shader_info->info.ps.writes_sample_mask) {
4034                 unsigned last = index - 1;
4035
4036                color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */
4037                color_args[last].done = 1; /* DONE bit */
4038         }
4039
4040         /* Export PS outputs. */
4041         for (unsigned i = 0; i < index; i++)
4042                 ac_build_export(&ctx->ac, &color_args[i]);
4043
4044         if (depth || stencil || samplemask)
4045                 radv_export_mrt_z(ctx, depth, stencil, samplemask);
4046         else if (!index)
4047                 ac_build_export_null(&ctx->ac);
4048 }
4049
4050 static void
4051 emit_gs_epilogue(struct radv_shader_context *ctx)
4052 {
4053         if (ctx->options->key.vs_common_out.as_ngg) {
4054                 gfx10_ngg_gs_emit_epilogue_1(ctx);
4055                 return;
4056         }
4057
4058         if (ctx->ac.chip_class >= GFX10)
4059                 LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
4060
4061         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
4062 }
4063
4064 static void
4065 handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
4066                            LLVMValueRef *addrs)
4067 {
4068         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
4069
4070         switch (ctx->stage) {
4071         case MESA_SHADER_VERTEX:
4072                 if (ctx->options->key.vs_common_out.as_ls)
4073                         handle_ls_outputs_post(ctx);
4074                 else if (ctx->options->key.vs_common_out.as_es)
4075                         handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
4076                 else if (ctx->options->key.vs_common_out.as_ngg)
4077                         break; /* handled outside of the shader body */
4078                 else
4079                         handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id,
4080                                                ctx->options->key.vs_common_out.export_clip_dists,
4081                                                &ctx->shader_info->vs.outinfo);
4082                 break;
4083         case MESA_SHADER_FRAGMENT:
4084                 handle_fs_outputs_post(ctx);
4085                 break;
4086         case MESA_SHADER_GEOMETRY:
4087                 emit_gs_epilogue(ctx);
4088                 break;
4089         case MESA_SHADER_TESS_CTRL:
4090                 handle_tcs_outputs_post(ctx);
4091                 break;
4092         case MESA_SHADER_TESS_EVAL:
4093                 if (ctx->options->key.vs_common_out.as_es)
4094                         handle_es_outputs_post(ctx, &ctx->shader_info->tes.es_info);
4095                 else if (ctx->options->key.vs_common_out.as_ngg)
4096                         break; /* handled outside of the shader body */
4097                 else
4098                         handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id,
4099                                                ctx->options->key.vs_common_out.export_clip_dists,
4100                                                &ctx->shader_info->tes.outinfo);
4101                 break;
4102         default:
4103                 break;
4104         }
4105 }
4106
4107 static void ac_llvm_finalize_module(struct radv_shader_context *ctx,
4108                                     LLVMPassManagerRef passmgr,
4109                                     const struct radv_nir_compiler_options *options)
4110 {
4111         LLVMRunPassManager(passmgr, ctx->ac.module);
4112         LLVMDisposeBuilder(ctx->ac.builder);
4113
4114         ac_llvm_context_dispose(&ctx->ac);
4115 }
4116
4117 static void
4118 ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)
4119 {
4120         struct radv_vs_output_info *outinfo;
4121
4122         switch (ctx->stage) {
4123         case MESA_SHADER_FRAGMENT:
4124         case MESA_SHADER_COMPUTE:
4125         case MESA_SHADER_TESS_CTRL:
4126         case MESA_SHADER_GEOMETRY:
4127                 return;
4128         case MESA_SHADER_VERTEX:
4129                 if (ctx->options->key.vs_common_out.as_ls ||
4130                     ctx->options->key.vs_common_out.as_es)
4131                         return;
4132                 outinfo = &ctx->shader_info->vs.outinfo;
4133                 break;
4134         case MESA_SHADER_TESS_EVAL:
4135                 if (ctx->options->key.vs_common_out.as_es)
4136                         return;
4137                 outinfo = &ctx->shader_info->tes.outinfo;
4138                 break;
4139         default:
4140                 unreachable("Unhandled shader type");
4141         }
4142
4143         ac_optimize_vs_outputs(&ctx->ac,
4144                                ctx->main_function,
4145                                outinfo->vs_output_param_offset,
4146                                VARYING_SLOT_MAX,
4147                                &outinfo->param_exports);
4148 }
4149
4150 static void
4151 ac_setup_rings(struct radv_shader_context *ctx)
4152 {
4153         if (ctx->options->chip_class <= GFX8 &&
4154             (ctx->stage == MESA_SHADER_GEOMETRY ||
4155              ctx->options->key.vs_common_out.as_es || ctx->options->key.vs_common_out.as_es)) {
4156                 unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS
4157                                                                    : RING_ESGS_VS;
4158                 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);
4159
4160                 ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac,
4161                                                        ctx->ring_offsets,
4162                                                        offset);
4163         }
4164
4165         if (ctx->is_gs_copy_shader) {
4166                 ctx->gsvs_ring[0] =
4167                         ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
4168                                               LLVMConstInt(ctx->ac.i32,
4169                                                            RING_GSVS_VS, false));
4170         }
4171
4172         if (ctx->stage == MESA_SHADER_GEOMETRY) {
4173                 /* The conceptual layout of the GSVS ring is
4174                  *   v0c0 .. vLv0 v0c1 .. vLc1 ..
4175                  * but the real memory layout is swizzled across
4176                  * threads:
4177                  *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
4178                  *   t16v0c0 ..
4179                  * Override the buffer descriptor accordingly.
4180                  */
4181                 LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
4182                 uint64_t stream_offset = 0;
4183                 unsigned num_records = 64;
4184                 LLVMValueRef base_ring;
4185
4186                 base_ring =
4187                         ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
4188                                               LLVMConstInt(ctx->ac.i32,
4189                                                            RING_GSVS_GS, false));
4190
4191                 for (unsigned stream = 0; stream < 4; stream++) {
4192                         unsigned num_components, stride;
4193                         LLVMValueRef ring, tmp;
4194
4195                         num_components =
4196                                 ctx->shader_info->info.gs.num_stream_output_components[stream];
4197
4198                         if (!num_components)
4199                                 continue;
4200
4201                         stride = 4 * num_components * ctx->gs_max_out_vertices;
4202
4203                         /* Limit on the stride field for <= GFX7. */
4204                         assert(stride < (1 << 14));
4205
4206                         ring = LLVMBuildBitCast(ctx->ac.builder,
4207                                                 base_ring, v2i64, "");
4208                         tmp = LLVMBuildExtractElement(ctx->ac.builder,
4209                                                       ring, ctx->ac.i32_0, "");
4210                         tmp = LLVMBuildAdd(ctx->ac.builder, tmp,
4211                                            LLVMConstInt(ctx->ac.i64,
4212                                                         stream_offset, 0), "");
4213                         ring = LLVMBuildInsertElement(ctx->ac.builder,
4214                                                       ring, tmp, ctx->ac.i32_0, "");
4215
4216                         stream_offset += stride * 64;
4217
4218                         ring = LLVMBuildBitCast(ctx->ac.builder, ring,
4219                                                 ctx->ac.v4i32, "");
4220
4221                         tmp = LLVMBuildExtractElement(ctx->ac.builder, ring,
4222                                                       ctx->ac.i32_1, "");
4223                         tmp = LLVMBuildOr(ctx->ac.builder, tmp,
4224                                           LLVMConstInt(ctx->ac.i32,
4225                                                        S_008F04_STRIDE(stride), false), "");
4226                         ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp,
4227                                                       ctx->ac.i32_1, "");
4228
4229                         ring = LLVMBuildInsertElement(ctx->ac.builder, ring,
4230                                                       LLVMConstInt(ctx->ac.i32,
4231                                                                    num_records, false),
4232                                                       LLVMConstInt(ctx->ac.i32, 2, false), "");
4233
4234                         ctx->gsvs_ring[stream] = ring;
4235                 }
4236         }
4237
4238         if (ctx->stage == MESA_SHADER_TESS_CTRL ||
4239             ctx->stage == MESA_SHADER_TESS_EVAL) {
4240                 ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));
4241                 ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));
4242         }
4243 }
4244
4245 unsigned
4246 radv_nir_get_max_workgroup_size(enum chip_class chip_class,
4247                                 gl_shader_stage stage,
4248                                 const struct nir_shader *nir)
4249 {
4250         switch (stage) {
4251         case MESA_SHADER_TESS_CTRL:
4252                 return chip_class >= GFX7 ? 128 : 64;
4253         case MESA_SHADER_GEOMETRY:
4254                 return chip_class >= GFX9 ? 128 : 64;
4255         case MESA_SHADER_COMPUTE:
4256                 break;
4257         default:
4258                 return 0;
4259         }
4260
4261         if (!nir)
4262                 return chip_class >= GFX9 ? 128 : 64;
4263         unsigned max_workgroup_size = nir->info.cs.local_size[0] *
4264                 nir->info.cs.local_size[1] *
4265                 nir->info.cs.local_size[2];
4266         return max_workgroup_size;
4267 }
4268
4269 /* Fixup the HW not emitting the TCS regs if there are no HS threads. */
4270 static void ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
4271 {
4272         LLVMValueRef count = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8);
4273         LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count,
4274                                               ctx->ac.i32_0, "");
4275         ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->rel_auto_id, ctx->abi.instance_id, "");
4276         ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_rel_ids, ctx->rel_auto_id, "");
4277         ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_patch_id, ctx->abi.vertex_id, "");
4278 }
4279
4280 static void prepare_gs_input_vgprs(struct radv_shader_context *ctx)
4281 {
4282         for(int i = 5; i >= 0; --i) {
4283                 ctx->gs_vtx_offset[i] = ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[i & ~1],
4284                                                         (i & 1) * 16, 16);
4285         }
4286
4287         ctx->gs_wave_id = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 16, 8);
4288 }
4289
4290 /* Ensure that the esgs ring is declared.
4291  *
4292  * We declare it with 64KB alignment as a hint that the
4293  * pointer value will always be 0.
4294  */
4295 static void declare_esgs_ring(struct radv_shader_context *ctx)
4296 {
4297         if (ctx->esgs_ring)
4298                 return;
4299
4300         assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
4301
4302         ctx->esgs_ring = LLVMAddGlobalInAddressSpace(
4303                 ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
4304                 "esgs_ring",
4305                 AC_ADDR_SPACE_LDS);
4306         LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
4307         LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
4308 }
4309
/* Build the LLVM module for a (possibly merged) RADV shader.
 *
 * ac_llvm       compiler handle; its pass manager is run on the finished module
 * shaders       array of shader_count NIR shaders, translated in array order
 *               into one function
 * shader_count  number of entries in shaders
 * shader_info   output; initialised by the info pass and filled in below
 * options       compiler options and shader key
 *
 * When shader_count >= 2, or the first stage is an NGG pre-GS stage, every
 * shader body is wrapped in a branch so that only threads whose id is below
 * the thread count unpacked from merged_wave_info execute it.
 *
 * Returns ctx.ac.module. The module is not disposed here.
 */
static
LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                                       struct nir_shader *const *shaders,
                                       int shader_count,
                                       struct radv_shader_variant_info *shader_info,
                                       const struct radv_nir_compiler_options *options)
{
        struct radv_shader_context ctx = {0};
        /* NOTE(review): several loops below declare their own `int i`,
         * which shadows this variable. */
        unsigned i;
        ctx.options = options;
        ctx.shader_info = shader_info;

        enum ac_float_mode float_mode =
                options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
                                       AC_FLOAT_MODE_DEFAULT;

        ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
                             options->family, float_mode, 64);
        ctx.context = ctx.ac.context;

        radv_nir_shader_info_init(&shader_info->info);

        /* Gather shader info from every stage into the same info struct. */
        for(int i = 0; i < shader_count; ++i)
                radv_nir_shader_info_pass(shaders[i], options, &shader_info->info);

        /* Mark every user SGPR location as unassigned. */
        for (i = 0; i < RADV_UD_MAX_SETS; i++)
                shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
        for (i = 0; i < AC_UD_MAX_UD; i++)
                shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;

        /* Use the largest workgroup size requested by any of the stages. */
        ctx.max_workgroup_size = 0;
        for (int i = 0; i < shader_count; ++i) {
                ctx.max_workgroup_size = MAX2(ctx.max_workgroup_size,
                                              radv_nir_get_max_workgroup_size(ctx.options->chip_class,
                                                                              shaders[i]->info.stage,
                                                                              shaders[i]));
        }

        /* NGG pre-GS stages on GFX10+ override the size with 128. */
        if (ctx.ac.chip_class >= GFX10) {
                if (is_pre_gs_stage(shaders[0]->info.stage) &&
                    options->key.vs_common_out.as_ngg) {
                        ctx.max_workgroup_size = 128;
                }
        }

        /* The function is created for the last stage; for merged shaders the
         * stage before it is passed as well. */
        create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2,
                        shader_count >= 2 ? shaders[shader_count - 2]->info.stage  : MESA_SHADER_VERTEX);

        /* Install the stage-independent ABI callbacks. */
        ctx.abi.inputs = &ctx.inputs[0];
        ctx.abi.emit_outputs = handle_shader_outputs_post;
        ctx.abi.emit_vertex = visit_emit_vertex;
        ctx.abi.load_ubo = radv_load_ubo;
        ctx.abi.load_ssbo = radv_load_ssbo;
        ctx.abi.load_sampler_desc = radv_get_sampler_desc;
        ctx.abi.load_resource = radv_load_resource;
        ctx.abi.clamp_shadow_reference = false;
        ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x800;

        /* Because the new raw/struct atomic intrinsics are buggy with LLVM 8,
         * we fallback to the old intrinsics for atomic buffer image operations
         * and thus we need to apply the indexing workaround...
         */
        ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900;

        bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) &&  ctx.options->key.vs_common_out.as_ngg;
        if (shader_count >= 2 || is_ngg)
                ac_init_exec_full_mask(&ctx.ac);

        /* Only applied on Vega10 and Raven when the last stage is TCS. */
        if ((ctx.ac.family == CHIP_VEGA10 ||
             ctx.ac.family == CHIP_RAVEN) &&
            shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
                ac_nir_fixup_ls_hs_input_vgprs(&ctx);

        /* Translate every stage in order into the same function. */
        for(int i = 0; i < shader_count; ++i) {
                ctx.stage = shaders[i]->info.stage;
                ctx.output_mask = 0;

                /* Per-stage context and ABI callback setup. */
                if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
                        /* The inner `i` shadows the shader index; it only
                         * counts the four vertex streams. */
                        for (int i = 0; i < 4; i++) {
                                ctx.gs_next_vertex[i] =
                                        ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
                        }
                        if (ctx.options->key.vs_common_out.as_ngg) {
                                for (unsigned i = 0; i < 4; ++i) {
                                        ctx.gs_curprim_verts[i] =
                                                ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
                                        ctx.gs_generated_prims[i] =
                                                ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
                                }

                                /* TODO: streamout */

                                LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8);
                                ctx.gs_ngg_scratch =
                                        LLVMAddGlobalInAddressSpace(ctx.ac.module,
                                                                    ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
                                LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32));
                                LLVMSetAlignment(ctx.gs_ngg_scratch, 4);

                                /* gs_ngg_emit is the constant 0 cast to an LDS
                                 * pointer to a zero-length i32 array. */
                                ctx.gs_ngg_emit = LLVMBuildIntToPtr(ctx.ac.builder, ctx.ac.i32_0,
                                        LLVMPointerType(LLVMArrayType(ctx.ac.i32, 0), AC_ADDR_SPACE_LDS),
                                        "ngg_emit");
                        }

                        ctx.gs_max_out_vertices = shaders[i]->info.gs.vertices_out;
                        ctx.gs_output_prim = shaders[i]->info.gs.output_primitive;
                        ctx.abi.load_inputs = load_gs_input;
                        ctx.abi.emit_primitive = visit_end_primitive;
                } else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
                        ctx.tcs_outputs_read = shaders[i]->info.outputs_read;
                        ctx.tcs_patch_outputs_read = shaders[i]->info.patch_outputs_read;
                        ctx.abi.load_tess_varyings = load_tcs_varyings;
                        ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
                        ctx.abi.store_tcs_outputs = store_tcs_output;
                        ctx.tcs_vertices_per_patch = shaders[i]->info.tess.tcs_vertices_out;
                        /* A standalone TCS takes its input count from the key;
                         * otherwise it is derived from the LS outputs
                         * recorded by the info pass. */
                        if (shader_count == 1)
                                ctx.tcs_num_inputs = ctx.options->key.tcs.num_inputs;
                        else
                                ctx.tcs_num_inputs = util_last_bit64(shader_info->info.vs.ls_outputs_written);
                        ctx.tcs_num_patches = get_tcs_num_patches(&ctx);
                } else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) {
                        ctx.tes_primitive_mode = shaders[i]->info.tess.primitive_mode;
                        ctx.abi.load_tess_varyings = load_tes_input;
                        ctx.abi.load_tess_coord = load_tess_coord;
                        ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
                        ctx.tcs_vertices_per_patch = shaders[i]->info.tess.tcs_vertices_out;
                        ctx.tcs_num_patches = ctx.options->key.tes.num_patches;
                } else if (shaders[i]->info.stage == MESA_SHADER_VERTEX) {
                        ctx.abi.load_base_vertex = radv_load_base_vertex;
                } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {
                        shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard;
                        ctx.abi.lookup_interp_param = lookup_interp_param;
                        ctx.abi.load_sample_position = load_sample_position;
                        ctx.abi.load_sample_mask_in = load_sample_mask_in;
                        ctx.abi.emit_kill = radv_emit_kill;
                }

                if (shaders[i]->info.stage == MESA_SHADER_VERTEX &&
                    ctx.options->key.vs_common_out.as_ngg &&
                    ctx.options->key.vs_common_out.export_prim_id) {
                        declare_esgs_ring(&ctx);
                }

                /* A barrier is emitted before every stage except the first,
                 * unless that stage is an NGG geometry shader. */
                bool nested_barrier = false;

                if (i) {
                        if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY &&
                            ctx.options->key.vs_common_out.as_ngg) {
                                nested_barrier = false;
                        } else {
                                nested_barrier = true;
                        }
                }

                if (nested_barrier) {
                        /* Execute a barrier before the second shader in
                         * a merged shader.
                         *
                         * Execute the barrier inside the conditional block,
                         * so that empty waves can jump directly to s_endpgm,
                         * which will also signal the barrier.
                         *
                         * This is possible in gfx9, because an empty wave
                         * for the second shader does not participate in
                         * the epilogue. With NGG, empty waves may still
                         * be required to export data (e.g. GS output vertices),
                         * so we cannot let them exit early.
                         *
                         * If the shader is TCS and the TCS epilog is present
                         * and contains a barrier, it will wait there and then
                         * reach s_endpgm.
                        */
                        ac_emit_barrier(&ctx.ac, ctx.stage);
                }

                nir_foreach_variable(variable, &shaders[i]->outputs)
                        scan_shader_output_decl(&ctx, variable, shaders[i], shaders[i]->info.stage);

                if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
                        /* addclip is 1 when the clip and cull distances
                         * together exceed 4 entries, 0 otherwise. Each counted
                         * output takes 16 bytes. */
                        unsigned addclip = shaders[i]->info.clip_distance_array_size +
                                        shaders[i]->info.cull_distance_array_size > 4;
                        ctx.gsvs_vertex_size = (util_bitcount64(ctx.output_mask) + addclip) * 16;
                        ctx.max_gsvs_emit_size = ctx.gsvs_vertex_size *
                                shaders[i]->info.gs.vertices_out;
                }

                ac_setup_rings(&ctx);

                /* merge_block is only assigned, and only read, when
                 * (shader_count >= 2 || is_ngg). */
                LLVMBasicBlockRef merge_block;
                if (shader_count >= 2 || is_ngg) {

                        if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY &&
                            ctx.options->key.vs_common_out.as_ngg) {
                                gfx10_ngg_gs_emit_prologue(&ctx);
                        }

                        LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
                        LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
                        merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");

                        /* Stage i's thread count is the 8-bit field at bit
                         * offset 8 * i of merged_wave_info; only threads below
                         * it enter the shader body. */
                        LLVMValueRef count = ac_unpack_param(&ctx.ac, ctx.merged_wave_info, 8 * i, 8);
                        LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
                        LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT,
                                                          thread_id, count, "");
                        LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);

                        LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);
                }

                if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT)
                        prepare_interp_optimize(&ctx, shaders[i]);
                else if(shaders[i]->info.stage == MESA_SHADER_VERTEX)
                        handle_vs_inputs(&ctx, shaders[i]);
                else if(shader_count >= 2 && shaders[i]->info.stage == MESA_SHADER_GEOMETRY)
                        prepare_gs_input_vgprs(&ctx);

                ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i]);

                /* Close the conditional block and continue in the merge block. */
                if (shader_count >= 2 || is_ngg) {
                        LLVMBuildBr(ctx.ac.builder, merge_block);
                        LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
                }

                /* This needs to be outside the if wrapping the shader body, as sometimes
                 * the HW generates waves with 0 es/vs threads. */
                if (is_pre_gs_stage(shaders[i]->info.stage) &&
                    ctx.options->key.vs_common_out.as_ngg &&
                    i == shader_count - 1) {
                        handle_ngg_outputs_post(&ctx);
                } else if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY &&
                           ctx.options->key.vs_common_out.as_ngg) {
                        gfx10_ngg_gs_emit_epilogue_2(&ctx);
                }

                /* Copy the values computed during translation to shader_info. */
                if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
                        shader_info->gs.gsvs_vertex_size = ctx.gsvs_vertex_size;
                        shader_info->gs.max_gsvs_emit_size = ctx.max_gsvs_emit_size;
                } else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
                        shader_info->tcs.num_patches = ctx.tcs_num_patches;
                        shader_info->tcs.lds_size = calculate_tess_lds_size(&ctx);
                }
        }

        LLVMBuildRetVoid(ctx.ac.builder);

        if (options->dump_preoptir) {
                fprintf(stderr, "%s LLVM IR:\n\n",
                        radv_get_shader_name(shader_info,
                                             shaders[shader_count - 1]->info.stage));
                ac_dump_module(ctx.ac.module);
                fprintf(stderr, "\n");
        }

        /* Runs the passes and disposes the builder and the ac context. */
        ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options);

        /* NOTE(review): this and the scratch count below still use ctx.ac and
         * ctx.main_function after ac_llvm_finalize_module(); presumably
         * neither needs the disposed builder - confirm. */
        if (shader_count == 1)
                ac_nir_eliminate_const_vs_outputs(&ctx);

        /* private_mem_vgprs is only computed when shader dumping is enabled. */
        if (options->dump_shader) {
                ctx.shader_info->private_mem_vgprs =
                        ac_count_scratch_private_memory(ctx.main_function);
        }

        return ctx.ac.module;
}
4575
4576 static void ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
4577 {
4578         unsigned *retval = (unsigned *)context;
4579         LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
4580         char *description = LLVMGetDiagInfoDescription(di);
4581
4582         if (severity == LLVMDSError) {
4583                 *retval = 1;
4584                 fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n",
4585                         description);
4586         }
4587
4588         LLVMDisposeMessage(description);
4589 }
4590
4591 static unsigned radv_llvm_compile(LLVMModuleRef M,
4592                                   char **pelf_buffer, size_t *pelf_size,
4593                                   struct ac_llvm_compiler *ac_llvm)
4594 {
4595         unsigned retval = 0;
4596         LLVMContextRef llvm_ctx;
4597
4598         /* Setup Diagnostic Handler*/
4599         llvm_ctx = LLVMGetModuleContext(M);
4600
4601         LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler,
4602                                         &retval);
4603
4604         /* Compile IR*/
4605         if (!radv_compile_to_elf(ac_llvm, M, pelf_buffer, pelf_size))
4606                 retval = 1;
4607         return retval;
4608 }
4609
4610 static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm,
4611                                    LLVMModuleRef llvm_module,
4612                                    struct radv_shader_binary **rbinary,
4613                                    struct radv_shader_variant_info *shader_info,
4614                                    gl_shader_stage stage,
4615                                    const char *name,
4616                                    const struct radv_nir_compiler_options *options)
4617 {
4618         char *elf_buffer = NULL;
4619         size_t elf_size = 0;
4620         char *llvm_ir_string = NULL;
4621
4622         if (options->dump_shader) {
4623                 fprintf(stderr, "%s LLVM IR:\n\n", name);
4624                 ac_dump_module(llvm_module);
4625                 fprintf(stderr, "\n");
4626         }
4627
4628         if (options->record_llvm_ir) {
4629                 char *llvm_ir = LLVMPrintModuleToString(llvm_module);
4630                 llvm_ir_string = strdup(llvm_ir);
4631                 LLVMDisposeMessage(llvm_ir);
4632         }
4633
4634         int v = radv_llvm_compile(llvm_module, &elf_buffer, &elf_size, ac_llvm);
4635         if (v) {
4636                 fprintf(stderr, "compile failed\n");
4637         }
4638
4639         LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
4640         LLVMDisposeModule(llvm_module);
4641         LLVMContextDispose(ctx);
4642
4643         size_t llvm_ir_size = llvm_ir_string ? strlen(llvm_ir_string) : 0;
4644         size_t alloc_size = sizeof(struct radv_shader_binary_rtld) + elf_size + llvm_ir_size + 1;
4645         struct radv_shader_binary_rtld *rbin = calloc(1, alloc_size);
4646         memcpy(rbin->data,  elf_buffer, elf_size);
4647         if (llvm_ir_string)
4648                 memcpy(rbin->data + elf_size, llvm_ir_string, llvm_ir_size + 1);
4649
4650         rbin->base.type = RADV_BINARY_TYPE_RTLD;
4651         rbin->base.stage = stage;
4652         rbin->base.total_size = alloc_size;
4653         rbin->elf_size = elf_size;
4654         rbin->llvm_ir_size = llvm_ir_size;
4655         *rbinary = &rbin->base;
4656
4657         free(llvm_ir_string);
4658         free(elf_buffer);
4659 }
4660
4661 static void
4662 ac_fill_shader_info(struct radv_shader_variant_info *shader_info, struct nir_shader *nir, const struct radv_nir_compiler_options *options)
4663 {
4664         switch (nir->info.stage) {
4665         case MESA_SHADER_COMPUTE:
4666                 for (int i = 0; i < 3; ++i)
4667                         shader_info->cs.block_size[i] = nir->info.cs.local_size[i];
4668                 break;
4669         case MESA_SHADER_FRAGMENT:
4670                 shader_info->fs.early_fragment_test = nir->info.fs.early_fragment_tests;
4671                 shader_info->fs.post_depth_coverage = nir->info.fs.post_depth_coverage;
4672                 break;
4673         case MESA_SHADER_GEOMETRY:
4674                 shader_info->gs.vertices_in = nir->info.gs.vertices_in;
4675                 shader_info->gs.vertices_out = nir->info.gs.vertices_out;
4676                 shader_info->gs.output_prim = nir->info.gs.output_primitive;
4677                 shader_info->gs.invocations = nir->info.gs.invocations;
4678                 break;
4679         case MESA_SHADER_TESS_EVAL:
4680                 shader_info->tes.primitive_mode = nir->info.tess.primitive_mode;
4681                 shader_info->tes.spacing = nir->info.tess.spacing;
4682                 shader_info->tes.ccw = nir->info.tess.ccw;
4683                 shader_info->tes.point_mode = nir->info.tess.point_mode;
4684                 shader_info->tes.as_es = options->key.vs_common_out.as_es;
4685                 shader_info->tes.export_prim_id = options->key.vs_common_out.export_prim_id;
4686                 shader_info->is_ngg = options->key.vs_common_out.as_ngg;
4687                 break;
4688         case MESA_SHADER_TESS_CTRL:
4689                 shader_info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out;
4690                 break;
4691         case MESA_SHADER_VERTEX:
4692                 shader_info->vs.as_es = options->key.vs_common_out.as_es;
4693                 shader_info->vs.as_ls = options->key.vs_common_out.as_ls;
4694                 shader_info->vs.export_prim_id = options->key.vs_common_out.export_prim_id;
4695                 shader_info->is_ngg = options->key.vs_common_out.as_ngg;
4696                 break;
4697         default:
4698                 break;
4699         }
4700 }
4701
4702 void
4703 radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm,
4704                         struct radv_shader_binary **rbinary,
4705                         struct radv_shader_variant_info *shader_info,
4706                         struct nir_shader *const *nir,
4707                         int nir_count,
4708                         const struct radv_nir_compiler_options *options)
4709 {
4710
4711         LLVMModuleRef llvm_module;
4712
4713         llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, shader_info,
4714                                                options);
4715
4716         ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, shader_info,
4717                                nir[nir_count - 1]->info.stage,
4718                                radv_get_shader_name(shader_info,
4719                                                     nir[nir_count - 1]->info.stage),
4720                                options);
4721
4722         for (int i = 0; i < nir_count; ++i)
4723                 ac_fill_shader_info(shader_info, nir[i], options);
4724
4725         /* Determine the ES type (VS or TES) for the GS on GFX9. */
4726         if (options->chip_class >= GFX9) {
4727                 if (nir_count == 2 &&
4728                     nir[1]->info.stage == MESA_SHADER_GEOMETRY) {
4729                         shader_info->gs.es_type = nir[0]->info.stage;
4730                 }
4731         }
4732 }
4733
4734 static void
4735 ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
4736 {
4737         LLVMValueRef vtx_offset =
4738                 LLVMBuildMul(ctx->ac.builder, ctx->abi.vertex_id,
4739                              LLVMConstInt(ctx->ac.i32, 4, false), "");
4740         LLVMValueRef stream_id;
4741
4742         /* Fetch the vertex stream ID. */
4743         if (ctx->shader_info->info.so.num_outputs) {
4744                 stream_id =
4745                         ac_unpack_param(&ctx->ac, ctx->streamout_config, 24, 2);
4746         } else {
4747                 stream_id = ctx->ac.i32_0;
4748         }
4749
4750         LLVMBasicBlockRef end_bb;
4751         LLVMValueRef switch_inst;
4752
4753         end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context,
4754                                                ctx->main_function, "end");
4755         switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4);
4756
4757         for (unsigned stream = 0; stream < 4; stream++) {
4758                 unsigned num_components =
4759                         ctx->shader_info->info.gs.num_stream_output_components[stream];
4760                 LLVMBasicBlockRef bb;
4761                 unsigned offset;
4762
4763                 if (!num_components)
4764                         continue;
4765
4766                 if (stream > 0 && !ctx->shader_info->info.so.num_outputs)
4767                         continue;
4768
4769                 bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out");
4770                 LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb);
4771                 LLVMPositionBuilderAtEnd(ctx->ac.builder, bb);
4772
4773                 offset = 0;
4774                 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
4775                         unsigned output_usage_mask =
4776                                 ctx->shader_info->info.gs.output_usage_mask[i];
4777                         unsigned output_stream =
4778                                 ctx->shader_info->info.gs.output_streams[i];
4779                         int length = util_last_bit(output_usage_mask);
4780
4781                         if (!(ctx->output_mask & (1ull << i)) ||
4782                             output_stream != stream)
4783                                 continue;
4784
4785                         for (unsigned j = 0; j < length; j++) {
4786                                 LLVMValueRef value, soffset;
4787
4788                                 if (!(output_usage_mask & (1 << j)))
4789                                         continue;
4790
4791                                 soffset = LLVMConstInt(ctx->ac.i32,
4792                                                        offset *
4793                                                        ctx->gs_max_out_vertices * 16 * 4, false);
4794
4795                                 offset++;
4796
4797                                 value = ac_build_buffer_load(&ctx->ac,
4798                                                              ctx->gsvs_ring[0],
4799                                                              1, ctx->ac.i32_0,
4800                                                              vtx_offset, soffset,
4801                                                              0, ac_glc | ac_slc, true, false);
4802
4803                                 LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
4804                                 if (ac_get_type_size(type) == 2) {
4805                                         value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
4806                                         value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");
4807                                 }
4808
4809                                 LLVMBuildStore(ctx->ac.builder,
4810                                                ac_to_float(&ctx->ac, value), ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
4811                         }
4812                 }
4813
4814                 if (ctx->shader_info->info.so.num_outputs)
4815                         radv_emit_streamout(ctx, stream);
4816
4817                 if (stream == 0) {
4818                         handle_vs_outputs_post(ctx, false, true,
4819                                                &ctx->shader_info->vs.outinfo);
4820                 }
4821
4822                 LLVMBuildBr(ctx->ac.builder, end_bb);
4823         }
4824
4825         LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);
4826 }
4827
4828 void
4829 radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
4830                             struct nir_shader *geom_shader,
4831                             struct radv_shader_binary **rbinary,
4832                             struct radv_shader_variant_info *shader_info,
4833                             const struct radv_nir_compiler_options *options)
4834 {
4835         struct radv_shader_context ctx = {0};
4836         ctx.options = options;
4837         ctx.shader_info = shader_info;
4838
4839         enum ac_float_mode float_mode =
4840                 options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
4841                                        AC_FLOAT_MODE_DEFAULT;
4842
4843         ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
4844                              options->family, float_mode, 64);
4845         ctx.context = ctx.ac.context;
4846
4847         ctx.is_gs_copy_shader = true;
4848         ctx.stage = MESA_SHADER_VERTEX;
4849
4850         radv_nir_shader_info_pass(geom_shader, options, &shader_info->info);
4851
4852         create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX);
4853
4854         ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
4855         ac_setup_rings(&ctx);
4856
4857         nir_foreach_variable(variable, &geom_shader->outputs) {
4858                 scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
4859                 ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader,
4860                                              variable, MESA_SHADER_VERTEX);
4861         }
4862
4863         ac_gs_copy_shader_emit(&ctx);
4864
4865         LLVMBuildRetVoid(ctx.ac.builder);
4866
4867         ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options);
4868
4869         ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, shader_info,
4870                                MESA_SHADER_VERTEX, "GS Copy Shader", options);
4871         (*rbinary)->is_gs_copy_shader = true;
4872
4873 }