src/amd/compiler/aco_instruction_selection_setup.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include <array>
26 #include <unordered_map>
27 #include "aco_ir.h"
28 #include "nir.h"
29 #include "vulkan/radv_shader.h"
30 #include "vulkan/radv_descriptor_set.h"
31 #include "sid.h"
32 #include "ac_exp_param.h"
33 #include "ac_shader_util.h"
34
35 #include "util/u_math.h"
36
37 #define MAX_INLINE_PUSH_CONSTS 8
38
39 namespace aco {
40
41 enum fs_input {
42 persp_sample_p1,
43 persp_sample_p2,
44 persp_center_p1,
45 persp_center_p2,
46 persp_centroid_p1,
47 persp_centroid_p2,
48 persp_pull_model,
49 linear_sample_p1,
50 linear_sample_p2,
51 linear_center_p1,
52 linear_center_p2,
53 linear_centroid_p1,
54 linear_centroid_p2,
55 line_stipple,
56 frag_pos_0,
57 frag_pos_1,
58 frag_pos_2,
59 frag_pos_3,
60 front_face,
61 ancillary,
62 sample_coverage,
63 fixed_pt,
64 max_inputs,
65 };
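/* A rough sketch of how these slots are used (based on the code below, not on
 * the original comments): each *_p1/*_p2 pair holds the two VGPRs of one
 * barycentric (i, j) coordinate set (see the enable_next handling in
 * add_fs_arg()), frag_pos_0..3 are the POS_{X,Y,Z,W}_FLOAT position values,
 * and fixed_pt is the fixed-point position. isel_context::fs_vgpr_args[]
 * records which of these slots the fragment shader actually needs.
 */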
66
67 struct vs_output_state {
68 uint8_t mask[VARYING_SLOT_VAR31 + 1];
69 Temp outputs[VARYING_SLOT_VAR31 + 1][4];
70 };
71
72 struct isel_context {
73 const struct radv_nir_compiler_options *options;
74 Program *program;
75 nir_shader *shader;
76 uint32_t constant_data_offset;
77 Block *block;
78 bool *divergent_vals;
79 std::unique_ptr<Temp[]> allocated;
80 std::unordered_map<unsigned, std::array<Temp,4>> allocated_vec;
 81           Stage stage; /* combined software/hardware stage bits */
82 bool has_gfx10_wave64_bpermute = false;
83 struct {
84 bool has_branch;
85 uint16_t loop_nest_depth = 0;
86 struct {
87 unsigned header_idx;
88 Block* exit;
89 bool has_divergent_continue = false;
90 bool has_divergent_branch = false;
91 } parent_loop;
92 struct {
93 bool is_divergent = false;
94 } parent_if;
95 bool exec_potentially_empty = false;
96 } cf_info;
97
98 /* inputs common for merged stages */
99 Temp merged_wave_info = Temp(0, s1);
100
101 /* FS inputs */
102 bool fs_vgpr_args[fs_input::max_inputs];
103 Temp fs_inputs[fs_input::max_inputs];
104 Temp prim_mask = Temp(0, s1);
105 Temp descriptor_sets[MAX_SETS];
106 Temp push_constants = Temp(0, s1);
107 Temp inline_push_consts[MAX_INLINE_PUSH_CONSTS];
108 unsigned num_inline_push_consts = 0;
109 unsigned base_inline_push_consts = 0;
110
111 /* VS inputs */
112 Temp vertex_buffers = Temp(0, s1);
113 Temp base_vertex = Temp(0, s1);
114 Temp start_instance = Temp(0, s1);
115 Temp draw_id = Temp(0, s1);
116 Temp view_index = Temp(0, s1);
117 Temp es2gs_offset = Temp(0, s1);
118 Temp vertex_id = Temp(0, v1);
119 Temp rel_auto_id = Temp(0, v1);
120 Temp instance_id = Temp(0, v1);
121 Temp vs_prim_id = Temp(0, v1);
122 bool needs_instance_id;
123
124 /* CS inputs */
125 Temp num_workgroups = Temp(0, s3);
126 Temp workgroup_ids[3] = {Temp(0, s1), Temp(0, s1), Temp(0, s1)};
127 Temp tg_size = Temp(0, s1);
128 Temp local_invocation_ids = Temp(0, v3);
129
130 /* VS output information */
131 unsigned num_clip_distances;
132 unsigned num_cull_distances;
133 vs_output_state vs_output;
134
135 /* Streamout */
136 Temp streamout_buffers = Temp(0, s1);
137 Temp streamout_write_idx = Temp(0, s1);
138 Temp streamout_config = Temp(0, s1);
139 Temp streamout_offset[4] = {Temp(0, s1), Temp(0, s1), Temp(0, s1), Temp(0, s1)};
140 };
141
142 fs_input get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp)
143 {
144 switch (interp) {
145 case INTERP_MODE_SMOOTH:
146 case INTERP_MODE_NONE:
147 if (intrin == nir_intrinsic_load_barycentric_pixel ||
148 intrin == nir_intrinsic_load_barycentric_at_sample ||
149 intrin == nir_intrinsic_load_barycentric_at_offset)
150 return fs_input::persp_center_p1;
151 else if (intrin == nir_intrinsic_load_barycentric_centroid)
152 return fs_input::persp_centroid_p1;
153 else if (intrin == nir_intrinsic_load_barycentric_sample)
154 return fs_input::persp_sample_p1;
155 break;
156 case INTERP_MODE_NOPERSPECTIVE:
157 if (intrin == nir_intrinsic_load_barycentric_pixel)
158 return fs_input::linear_center_p1;
159 else if (intrin == nir_intrinsic_load_barycentric_centroid)
160 return fs_input::linear_centroid_p1;
161 else if (intrin == nir_intrinsic_load_barycentric_sample)
162 return fs_input::linear_sample_p1;
163 break;
164 default:
165 break;
166 }
167 return fs_input::max_inputs;
168 }
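/* Illustrative usage sketch: for a noperspective centroid barycentric load the
 * function picks the first VGPR of the matching pair, and max_inputs acts as
 * the "no interpolated FS input" sentinel:
 *
 *   fs_input in = get_interp_input(nir_intrinsic_load_barycentric_centroid,
 *                                  INTERP_MODE_NOPERSPECTIVE);
 *   // in == fs_input::linear_centroid_p1
 *
 * The intrinsic scan in init_context() uses the result to set
 * ctx->fs_vgpr_args[in].
 */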
169
170 void init_context(isel_context *ctx, nir_shader *shader)
171 {
172 nir_function_impl *impl = nir_shader_get_entrypoint(shader);
173
174 ctx->shader = shader;
175 ctx->divergent_vals = nir_divergence_analysis(shader, nir_divergence_view_index_uniform);
176
177 std::unique_ptr<Temp[]> allocated{new Temp[impl->ssa_alloc]()};
 178    memset(&ctx->fs_vgpr_args, 0, sizeof(ctx->fs_vgpr_args));
179
180 bool done = false;
181 while (!done) {
182 done = true;
183 nir_foreach_block(block, impl) {
184 nir_foreach_instr(instr, block) {
185 switch(instr->type) {
186 case nir_instr_type_alu: {
187 nir_alu_instr *alu_instr = nir_instr_as_alu(instr);
188 unsigned size = alu_instr->dest.dest.ssa.num_components;
189 if (alu_instr->dest.dest.ssa.bit_size == 64)
190 size *= 2;
191 RegType type = RegType::sgpr;
192 switch(alu_instr->op) {
193 case nir_op_fmul:
194 case nir_op_fadd:
195 case nir_op_fsub:
196 case nir_op_fmax:
197 case nir_op_fmin:
198 case nir_op_fmax3:
199 case nir_op_fmin3:
200 case nir_op_fmed3:
201 case nir_op_fneg:
202 case nir_op_fabs:
203 case nir_op_fsat:
204 case nir_op_fsign:
205 case nir_op_frcp:
206 case nir_op_frsq:
207 case nir_op_fsqrt:
208 case nir_op_fexp2:
209 case nir_op_flog2:
210 case nir_op_ffract:
211 case nir_op_ffloor:
212 case nir_op_fceil:
213 case nir_op_ftrunc:
214 case nir_op_fround_even:
215 case nir_op_fsin:
216 case nir_op_fcos:
217 case nir_op_f2f32:
218 case nir_op_f2f64:
219 case nir_op_u2f32:
220 case nir_op_u2f64:
221 case nir_op_i2f32:
222 case nir_op_i2f64:
223 case nir_op_pack_half_2x16:
224 case nir_op_unpack_half_2x16_split_x:
225 case nir_op_unpack_half_2x16_split_y:
226 case nir_op_fddx:
227 case nir_op_fddy:
228 case nir_op_fddx_fine:
229 case nir_op_fddy_fine:
230 case nir_op_fddx_coarse:
231 case nir_op_fddy_coarse:
232 case nir_op_fquantize2f16:
233 case nir_op_ldexp:
234 case nir_op_frexp_sig:
235 case nir_op_frexp_exp:
236 case nir_op_cube_face_index:
237 case nir_op_cube_face_coord:
238 type = RegType::vgpr;
239 break;
240 case nir_op_flt:
241 case nir_op_fge:
242 case nir_op_feq:
243 case nir_op_fne:
244 case nir_op_ilt:
245 case nir_op_ige:
246 case nir_op_ult:
247 case nir_op_uge:
248 case nir_op_ieq:
249 case nir_op_ine:
250 case nir_op_i2b1:
251 size = 2;
252 break;
253 case nir_op_f2i64:
254 case nir_op_f2u64:
255 case nir_op_b2i32:
256 case nir_op_b2f32:
257 case nir_op_f2i32:
258 case nir_op_f2u32:
259 type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr;
260 break;
261 case nir_op_bcsel:
262 if (alu_instr->dest.dest.ssa.bit_size == 1) {
263 size = 2;
264 } else {
265 if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) {
266 type = RegType::vgpr;
267 } else {
268 if (allocated[alu_instr->src[1].src.ssa->index].type() == RegType::vgpr ||
269 allocated[alu_instr->src[2].src.ssa->index].type() == RegType::vgpr) {
270 type = RegType::vgpr;
271 }
272 }
273 if (alu_instr->src[1].src.ssa->num_components == 1 && alu_instr->src[2].src.ssa->num_components == 1) {
274 assert(allocated[alu_instr->src[1].src.ssa->index].size() == allocated[alu_instr->src[2].src.ssa->index].size());
275 size = allocated[alu_instr->src[1].src.ssa->index].size();
276 }
277 }
278 break;
279 case nir_op_mov:
280 if (alu_instr->dest.dest.ssa.bit_size == 1) {
281 size = 2;
282 } else {
283 type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr;
284 }
285 break;
286 default:
287 if (alu_instr->dest.dest.ssa.bit_size == 1) {
288 size = 2;
289 } else {
290 for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
291 if (allocated[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
292 type = RegType::vgpr;
293 }
294 }
295 break;
296 }
297 allocated[alu_instr->dest.dest.ssa.index] = Temp(0, RegClass(type, size));
298 break;
299 }
300 case nir_instr_type_load_const: {
301 unsigned size = nir_instr_as_load_const(instr)->def.num_components;
302 if (nir_instr_as_load_const(instr)->def.bit_size == 64)
303 size *= 2;
304 else if (nir_instr_as_load_const(instr)->def.bit_size == 1)
305 size *= 2;
306 allocated[nir_instr_as_load_const(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size));
307 break;
308 }
309 case nir_instr_type_intrinsic: {
310 nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
311 if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest)
312 break;
313 unsigned size = intrinsic->dest.ssa.num_components;
314 if (intrinsic->dest.ssa.bit_size == 64)
315 size *= 2;
316 RegType type = RegType::sgpr;
317 switch(intrinsic->intrinsic) {
318 case nir_intrinsic_load_push_constant:
319 case nir_intrinsic_load_work_group_id:
320 case nir_intrinsic_load_num_work_groups:
321 case nir_intrinsic_load_subgroup_id:
322 case nir_intrinsic_load_num_subgroups:
323 case nir_intrinsic_load_first_vertex:
324 case nir_intrinsic_load_base_instance:
325 case nir_intrinsic_get_buffer_size:
326 case nir_intrinsic_vote_all:
327 case nir_intrinsic_vote_any:
328 case nir_intrinsic_read_first_invocation:
329 case nir_intrinsic_read_invocation:
330 case nir_intrinsic_first_invocation:
331 type = RegType::sgpr;
332 if (intrinsic->dest.ssa.bit_size == 1)
333 size = 2;
334 break;
335 case nir_intrinsic_ballot:
336 type = RegType::sgpr;
337 size = 2;
338 break;
339 case nir_intrinsic_load_sample_id:
340 case nir_intrinsic_load_sample_mask_in:
341 case nir_intrinsic_load_input:
342 case nir_intrinsic_load_vertex_id:
343 case nir_intrinsic_load_vertex_id_zero_base:
344 case nir_intrinsic_load_barycentric_sample:
345 case nir_intrinsic_load_barycentric_pixel:
346 case nir_intrinsic_load_barycentric_centroid:
347 case nir_intrinsic_load_barycentric_at_sample:
348 case nir_intrinsic_load_barycentric_at_offset:
349 case nir_intrinsic_load_interpolated_input:
350 case nir_intrinsic_load_frag_coord:
351 case nir_intrinsic_load_sample_pos:
352 case nir_intrinsic_load_layer_id:
353 case nir_intrinsic_load_local_invocation_id:
354 case nir_intrinsic_load_local_invocation_index:
355 case nir_intrinsic_load_subgroup_invocation:
356 case nir_intrinsic_write_invocation_amd:
357 case nir_intrinsic_mbcnt_amd:
358 case nir_intrinsic_load_instance_id:
359 case nir_intrinsic_ssbo_atomic_add:
360 case nir_intrinsic_ssbo_atomic_imin:
361 case nir_intrinsic_ssbo_atomic_umin:
362 case nir_intrinsic_ssbo_atomic_imax:
363 case nir_intrinsic_ssbo_atomic_umax:
364 case nir_intrinsic_ssbo_atomic_and:
365 case nir_intrinsic_ssbo_atomic_or:
366 case nir_intrinsic_ssbo_atomic_xor:
367 case nir_intrinsic_ssbo_atomic_exchange:
368 case nir_intrinsic_ssbo_atomic_comp_swap:
369 case nir_intrinsic_image_deref_atomic_add:
370 case nir_intrinsic_image_deref_atomic_umin:
371 case nir_intrinsic_image_deref_atomic_imin:
372 case nir_intrinsic_image_deref_atomic_umax:
373 case nir_intrinsic_image_deref_atomic_imax:
374 case nir_intrinsic_image_deref_atomic_and:
375 case nir_intrinsic_image_deref_atomic_or:
376 case nir_intrinsic_image_deref_atomic_xor:
377 case nir_intrinsic_image_deref_atomic_exchange:
378 case nir_intrinsic_image_deref_atomic_comp_swap:
379 case nir_intrinsic_image_deref_size:
380 case nir_intrinsic_shared_atomic_add:
381 case nir_intrinsic_shared_atomic_imin:
382 case nir_intrinsic_shared_atomic_umin:
383 case nir_intrinsic_shared_atomic_imax:
384 case nir_intrinsic_shared_atomic_umax:
385 case nir_intrinsic_shared_atomic_and:
386 case nir_intrinsic_shared_atomic_or:
387 case nir_intrinsic_shared_atomic_xor:
388 case nir_intrinsic_shared_atomic_exchange:
389 case nir_intrinsic_shared_atomic_comp_swap:
390 case nir_intrinsic_load_scratch:
391 type = RegType::vgpr;
392 break;
393 case nir_intrinsic_shuffle:
394 case nir_intrinsic_quad_broadcast:
395 case nir_intrinsic_quad_swap_horizontal:
396 case nir_intrinsic_quad_swap_vertical:
397 case nir_intrinsic_quad_swap_diagonal:
398 case nir_intrinsic_quad_swizzle_amd:
399 case nir_intrinsic_masked_swizzle_amd:
400 case nir_intrinsic_inclusive_scan:
401 case nir_intrinsic_exclusive_scan:
402 if (intrinsic->dest.ssa.bit_size == 1) {
403 size = 2;
404 type = RegType::sgpr;
405 } else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) {
406 type = RegType::sgpr;
407 } else {
408 type = RegType::vgpr;
409 }
410 break;
411 case nir_intrinsic_load_view_index:
412 type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr;
413 break;
414 case nir_intrinsic_load_front_face:
415 case nir_intrinsic_load_helper_invocation:
416 case nir_intrinsic_is_helper_invocation:
417 type = RegType::sgpr;
418 size = 2;
419 break;
420 case nir_intrinsic_reduce:
421 if (intrinsic->dest.ssa.bit_size == 1) {
422 size = 2;
423 type = RegType::sgpr;
424 } else if (nir_intrinsic_cluster_size(intrinsic) == 0 ||
425 !ctx->divergent_vals[intrinsic->dest.ssa.index]) {
426 type = RegType::sgpr;
427 } else {
428 type = RegType::vgpr;
429 }
430 break;
431 case nir_intrinsic_load_ubo:
432 case nir_intrinsic_load_ssbo:
433 case nir_intrinsic_load_global:
434 case nir_intrinsic_vulkan_resource_index:
435 type = ctx->divergent_vals[intrinsic->dest.ssa.index] ? RegType::vgpr : RegType::sgpr;
436 break;
437 /* due to copy propagation, the swizzled imov is removed if num dest components == 1 */
438 case nir_intrinsic_load_shared:
439 if (ctx->divergent_vals[intrinsic->dest.ssa.index])
440 type = RegType::vgpr;
441 else
442 type = RegType::sgpr;
443 break;
444 default:
445 for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; i++) {
446 if (allocated[intrinsic->src[i].ssa->index].type() == RegType::vgpr)
447 type = RegType::vgpr;
448 }
449 break;
450 }
451 allocated[intrinsic->dest.ssa.index] = Temp(0, RegClass(type, size));
452
453 switch(intrinsic->intrinsic) {
454 case nir_intrinsic_load_barycentric_sample:
455 case nir_intrinsic_load_barycentric_pixel:
456 case nir_intrinsic_load_barycentric_centroid:
457 case nir_intrinsic_load_barycentric_at_sample:
458 case nir_intrinsic_load_barycentric_at_offset: {
459 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic);
460 ctx->fs_vgpr_args[get_interp_input(intrinsic->intrinsic, mode)] = true;
461 break;
462 }
463 case nir_intrinsic_load_front_face:
464 ctx->fs_vgpr_args[fs_input::front_face] = true;
465 break;
466 case nir_intrinsic_load_frag_coord:
467 case nir_intrinsic_load_sample_pos: {
468 uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa);
469 for (unsigned i = 0; i < 4; i++) {
470 if (mask & (1 << i))
471 ctx->fs_vgpr_args[fs_input::frag_pos_0 + i] = true;
472
473 }
474 break;
475 }
476 case nir_intrinsic_load_sample_id:
477 ctx->fs_vgpr_args[fs_input::ancillary] = true;
478 break;
479 case nir_intrinsic_load_sample_mask_in:
480 ctx->fs_vgpr_args[fs_input::ancillary] = true;
481 ctx->fs_vgpr_args[fs_input::sample_coverage] = true;
482 break;
483 default:
484 break;
485 }
486 break;
487 }
488 case nir_instr_type_tex: {
489 nir_tex_instr* tex = nir_instr_as_tex(instr);
490 unsigned size = tex->dest.ssa.num_components;
491
492 if (tex->dest.ssa.bit_size == 64)
493 size *= 2;
494 if (tex->op == nir_texop_texture_samples)
495 assert(!ctx->divergent_vals[tex->dest.ssa.index]);
496 if (ctx->divergent_vals[tex->dest.ssa.index])
497 allocated[tex->dest.ssa.index] = Temp(0, RegClass(RegType::vgpr, size));
498 else
499 allocated[tex->dest.ssa.index] = Temp(0, RegClass(RegType::sgpr, size));
500 break;
501 }
502 case nir_instr_type_parallel_copy: {
503 nir_foreach_parallel_copy_entry(entry, nir_instr_as_parallel_copy(instr)) {
504 allocated[entry->dest.ssa.index] = allocated[entry->src.ssa->index];
505 }
506 break;
507 }
508 case nir_instr_type_ssa_undef: {
509 unsigned size = nir_instr_as_ssa_undef(instr)->def.num_components;
510 if (nir_instr_as_ssa_undef(instr)->def.bit_size == 64)
511 size *= 2;
512 allocated[nir_instr_as_ssa_undef(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size));
513 break;
514 }
515 case nir_instr_type_phi: {
516 nir_phi_instr* phi = nir_instr_as_phi(instr);
517 RegType type;
518 unsigned size = phi->dest.ssa.num_components;
519
520 if (phi->dest.ssa.bit_size == 1) {
521 assert(size == 1 && "multiple components not yet supported on boolean phis.");
522 type = RegType::sgpr;
523 size *= 2;
524 allocated[phi->dest.ssa.index] = Temp(0, RegClass(type, size));
525 break;
526 }
527
528 if (ctx->divergent_vals[phi->dest.ssa.index]) {
529 type = RegType::vgpr;
530 } else {
531 type = RegType::sgpr;
532 nir_foreach_phi_src (src, phi) {
533 if (allocated[src->src.ssa->index].type() == RegType::vgpr)
534 type = RegType::vgpr;
535 if (allocated[src->src.ssa->index].type() == RegType::none)
536 done = false;
537 }
538 }
539
540 size *= phi->dest.ssa.bit_size == 64 ? 2 : 1;
541 RegClass rc = RegClass(type, size);
542 if (rc != allocated[phi->dest.ssa.index].regClass()) {
543 done = false;
544 } else {
545 nir_foreach_phi_src(src, phi)
546 assert(allocated[src->src.ssa->index].size() == rc.size());
547 }
548 allocated[phi->dest.ssa.index] = Temp(0, rc);
549 break;
550 }
551 default:
552 break;
553 }
554 }
555 }
556 }
557
558 for (unsigned i = 0; i < impl->ssa_alloc; i++)
559 allocated[i] = Temp(ctx->program->allocateId(), allocated[i].regClass());
560
561 ctx->allocated.reset(allocated.release());
562 }
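/* Note on the loop above: register classes are computed with a simple
 * fixed-point iteration. A phi may reference values whose class has not been
 * decided yet (RegType::none) or may need to be widened to VGPR once a
 * divergent/VGPR source shows up, so `done` is cleared and the whole block
 * walk repeats until no phi changes its register class anymore. Only after
 * that do the temporaries get their final ids allocated.
 */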
563
564 struct user_sgpr_info {
565 uint8_t num_sgpr;
566 uint8_t remaining_sgprs;
567 uint8_t user_sgpr_idx;
568 bool need_ring_offsets;
569 bool indirect_all_descriptor_sets;
570 };
571
572 static void allocate_inline_push_consts(isel_context *ctx,
573 user_sgpr_info& user_sgpr_info)
574 {
575 uint8_t remaining_sgprs = user_sgpr_info.remaining_sgprs;
576
577 /* Only supported if shaders use push constants. */
578 if (ctx->program->info->min_push_constant_used == UINT8_MAX)
579 return;
580
581 /* Only supported if shaders don't have indirect push constants. */
582 if (ctx->program->info->has_indirect_push_constants)
583 return;
584
585 /* Only supported for 32-bit push constants. */
586 //TODO: it's possible that some day, the load/store vectorization could make this inaccurate
587 if (!ctx->program->info->has_only_32bit_push_constants)
588 return;
589
590 uint8_t num_push_consts =
591 (ctx->program->info->max_push_constant_used -
592 ctx->program->info->min_push_constant_used) / 4;
593
594 /* Check if the number of user SGPRs is large enough. */
595 if (num_push_consts < remaining_sgprs) {
596 ctx->program->info->num_inline_push_consts = num_push_consts;
597 } else {
598 ctx->program->info->num_inline_push_consts = remaining_sgprs;
599 }
600
601 /* Clamp to the maximum number of allowed inlined push constants. */
602 if (ctx->program->info->num_inline_push_consts > MAX_INLINE_PUSH_CONSTS)
603 ctx->program->info->num_inline_push_consts = MAX_INLINE_PUSH_CONSTS;
604
605 if (ctx->program->info->num_inline_push_consts == num_push_consts &&
606 !ctx->program->info->loads_dynamic_offsets) {
607 /* Disable the default push constants path if all constants are
608 * inlined and if shaders don't use dynamic descriptors.
609 */
610 ctx->program->info->loads_push_constants = false;
611 user_sgpr_info.num_sgpr--;
612 user_sgpr_info.remaining_sgprs++;
613 }
614
615 ctx->program->info->base_inline_push_consts =
616 ctx->program->info->min_push_constant_used / 4;
617
618 user_sgpr_info.num_sgpr += ctx->program->info->num_inline_push_consts;
619 user_sgpr_info.remaining_sgprs -= ctx->program->info->num_inline_push_consts;
620 }
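/* Rough arithmetic sketch (assuming min/max_push_constant_used are byte
 * offsets bounding the accessed range): a shader that only reads push
 * constants in bytes [8, 24) gets
 *
 *   num_push_consts         = (24 - 8) / 4 = 4   32-bit inline constants
 *   base_inline_push_consts = 8 / 4        = 2   first inlined dword index
 *
 * clamped to MAX_INLINE_PUSH_CONSTS and to the user SGPRs still available.
 * If everything fits inline and no dynamic offsets are loaded, the regular
 * push-constant pointer SGPR is dropped entirely.
 */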
621
622 static void allocate_user_sgprs(isel_context *ctx,
623 bool needs_view_index, user_sgpr_info& user_sgpr_info)
624 {
625 memset(&user_sgpr_info, 0, sizeof(struct user_sgpr_info));
626 uint32_t user_sgpr_count = 0;
627
 628    /* until we sort out scratch/global buffers, always assign ring offsets for gs/vs/es */
629 if (ctx->stage != fragment_fs &&
630 ctx->stage != compute_cs
631 /*|| ctx->is_gs_copy_shader */)
632 user_sgpr_info.need_ring_offsets = true;
633
634 if (ctx->stage == fragment_fs &&
635 ctx->program->info->ps.needs_sample_positions)
636 user_sgpr_info.need_ring_offsets = true;
637
638 /* 2 user sgprs will nearly always be allocated for scratch/rings */
639 user_sgpr_count += 2;
640
641 switch (ctx->stage) {
642 case vertex_vs:
643 /* if (!ctx->is_gs_copy_shader) */ {
644 if (ctx->program->info->vs.has_vertex_buffers)
645 user_sgpr_count++;
646 user_sgpr_count += ctx->program->info->vs.needs_draw_id ? 3 : 2;
647 }
648 break;
649 case fragment_fs:
650 //user_sgpr_count += ctx->program->info->ps.needs_sample_positions;
651 break;
652 case compute_cs:
653 if (ctx->program->info->cs.uses_grid_size)
654 user_sgpr_count += 3;
655 break;
656 default:
657 unreachable("Shader stage not implemented");
658 }
659
660 if (needs_view_index)
661 user_sgpr_count++;
662
663 if (ctx->program->info->loads_push_constants)
664 user_sgpr_count += 1; /* we use 32bit pointers */
665
666 if (ctx->program->info->so.num_outputs)
667 user_sgpr_count += 1; /* we use 32bit pointers */
668
669 uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && !(ctx->stage & hw_cs) ? 32 : 16;
670 uint32_t remaining_sgprs = available_sgprs - user_sgpr_count;
671 uint32_t num_desc_set = util_bitcount(ctx->program->info->desc_set_used_mask);
672
673 if (available_sgprs < user_sgpr_count + num_desc_set) {
674 user_sgpr_info.indirect_all_descriptor_sets = true;
675 user_sgpr_info.num_sgpr = user_sgpr_count + 1;
676 user_sgpr_info.remaining_sgprs = remaining_sgprs - 1;
677 } else {
678 user_sgpr_info.num_sgpr = user_sgpr_count + num_desc_set;
679 user_sgpr_info.remaining_sgprs = remaining_sgprs - num_desc_set;
680 }
681
682 allocate_inline_push_consts(ctx, user_sgpr_info);
683 }
684
685 #define MAX_ARGS 64
686 struct arg_info {
687 RegClass types[MAX_ARGS];
688 Temp *assign[MAX_ARGS];
689 PhysReg reg[MAX_ARGS];
690 unsigned array_params_mask;
691 uint8_t count;
692 uint8_t sgpr_count;
693 uint8_t num_sgprs_used;
694 uint8_t num_vgprs_used;
695 };
696
697 static void
698 add_arg(arg_info *info, RegClass rc, Temp *param_ptr, unsigned reg)
699 {
700 assert(info->count < MAX_ARGS);
701
702 info->assign[info->count] = param_ptr;
703 info->types[info->count] = rc;
704
705 if (rc.type() == RegType::sgpr) {
706 info->num_sgprs_used += rc.size();
707 info->sgpr_count++;
708 info->reg[info->count] = PhysReg{reg};
709 } else {
710 assert(rc.type() == RegType::vgpr);
711 info->num_vgprs_used += rc.size();
712 info->reg[info->count] = PhysReg{reg + 256};
713 }
714 info->count++;
715 }
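/* Illustrative use, mirroring the calls in add_startpgm() below: SGPR
 * arguments are given their plain register index, while VGPR arguments are
 * encoded with the usual +256 PhysReg offset, e.g.
 *
 *   add_arg(&args, s2, &ctx->program->private_segment_buffer, 0); // s[0:1]
 *   add_arg(&args, v1, &ctx->vertex_id, 0);                       // v0
 *
 * num_sgprs_used/num_vgprs_used accumulate the register counts so that
 * add_startpgm() can report the final input SGPR/VGPR totals.
 */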
716
717 static void
718 set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs)
719 {
720 ud_info->sgpr_idx = *sgpr_idx;
721 ud_info->num_sgprs = num_sgprs;
722 *sgpr_idx += num_sgprs;
723 }
724
725 static void
726 set_loc_shader(isel_context *ctx, int idx, uint8_t *sgpr_idx,
727 uint8_t num_sgprs)
728 {
729 struct radv_userdata_info *ud_info = &ctx->program->info->user_sgprs_locs.shader_data[idx];
730 assert(ud_info);
731
732 set_loc(ud_info, sgpr_idx, num_sgprs);
733 }
734
735 static void
736 set_loc_shader_ptr(isel_context *ctx, int idx, uint8_t *sgpr_idx)
737 {
738 bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS;
739
740 set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 1 : 2);
741 }
742
743 static void
744 set_loc_desc(isel_context *ctx, int idx, uint8_t *sgpr_idx)
745 {
746 struct radv_userdata_locations *locs = &ctx->program->info->user_sgprs_locs;
747 struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx];
748 assert(ud_info);
749
750 set_loc(ud_info, sgpr_idx, 1);
751 locs->descriptor_sets_enabled |= 1 << idx;
752 }
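/* The set_loc* helpers only record in radv_userdata_info which user SGPR
 * index a piece of shader data or a descriptor set occupies (and how many
 * SGPRs it spans) so the driver knows where to upload each value; the values
 * themselves are declared as arguments via add_arg(). 32-bit pointers take a
 * single SGPR, except the scratch ring offsets, which remain a full 64-bit
 * (two SGPR) pointer.
 */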
753
754 static void
755 declare_global_input_sgprs(isel_context *ctx,
756 /* bool has_previous_stage, gl_shader_stage previous_stage, */
757 user_sgpr_info *user_sgpr_info,
758 struct arg_info *args,
759 Temp *desc_sets)
760 {
761 /* 1 for each descriptor set */
762 if (!user_sgpr_info->indirect_all_descriptor_sets) {
763 uint32_t mask = ctx->program->info->desc_set_used_mask;
764 while (mask) {
765 int i = u_bit_scan(&mask);
766 add_arg(args, s1, &desc_sets[i], user_sgpr_info->user_sgpr_idx);
767 set_loc_desc(ctx, i, &user_sgpr_info->user_sgpr_idx);
768 }
769 /* NIR->LLVM might have set this to true if RADV_DEBUG=compiletime */
770 ctx->program->info->need_indirect_descriptor_sets = false;
771 } else {
772 add_arg(args, s1, desc_sets, user_sgpr_info->user_sgpr_idx);
773 set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, &user_sgpr_info->user_sgpr_idx);
774 ctx->program->info->need_indirect_descriptor_sets = true;
775 }
776
777 if (ctx->program->info->loads_push_constants) {
778 /* 1 for push constants and dynamic descriptors */
779 add_arg(args, s1, &ctx->push_constants, user_sgpr_info->user_sgpr_idx);
780 set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, &user_sgpr_info->user_sgpr_idx);
781 }
782
783 if (ctx->program->info->num_inline_push_consts) {
784 unsigned count = ctx->program->info->num_inline_push_consts;
785 for (unsigned i = 0; i < count; i++)
786 add_arg(args, s1, &ctx->inline_push_consts[i], user_sgpr_info->user_sgpr_idx + i);
787 set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, &user_sgpr_info->user_sgpr_idx, count);
788
789 ctx->num_inline_push_consts = ctx->program->info->num_inline_push_consts;
790 ctx->base_inline_push_consts = ctx->program->info->base_inline_push_consts;
791 }
792
793 if (ctx->program->info->so.num_outputs) {
794 add_arg(args, s1, &ctx->streamout_buffers, user_sgpr_info->user_sgpr_idx);
795 set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS, &user_sgpr_info->user_sgpr_idx);
796 }
797 }
798
799 static void
800 declare_vs_input_vgprs(isel_context *ctx, struct arg_info *args)
801 {
802 unsigned vgpr_idx = 0;
803 add_arg(args, v1, &ctx->vertex_id, vgpr_idx++);
804 if (ctx->options->chip_class >= GFX10) {
805 add_arg(args, v1, NULL, vgpr_idx++); /* unused */
806 add_arg(args, v1, &ctx->vs_prim_id, vgpr_idx++);
807 add_arg(args, v1, &ctx->instance_id, vgpr_idx++);
808 } else {
809 if (ctx->options->key.vs.out.as_ls) {
810 add_arg(args, v1, &ctx->rel_auto_id, vgpr_idx++);
811 add_arg(args, v1, &ctx->instance_id, vgpr_idx++);
812 } else {
813 add_arg(args, v1, &ctx->instance_id, vgpr_idx++);
814 add_arg(args, v1, &ctx->vs_prim_id, vgpr_idx++);
815 }
816 add_arg(args, v1, NULL, vgpr_idx); /* unused */
817 }
818 }
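/* The VS input VGPR order differs per generation: on GFX10 the code declares
 * vertex id, an unused slot, primitive id and instance id in v0-v3, while on
 * older chips the second and third slots depend on whether the shader runs as
 * LS (rel_auto_id before instance id) or as a plain VS (instance id before
 * primitive id), with the unused slot last.
 */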
819
820 static void
821 declare_streamout_sgprs(isel_context *ctx, struct arg_info *args, unsigned *idx)
822 {
823 /* Streamout SGPRs. */
824 if (ctx->program->info->so.num_outputs) {
825 assert(ctx->stage & hw_vs);
826
827 if (ctx->stage != tess_eval_vs) {
828 add_arg(args, s1, &ctx->streamout_config, (*idx)++);
829 } else {
830 args->assign[args->count - 1] = &ctx->streamout_config;
831 args->types[args->count - 1] = s1;
832 }
833
834 add_arg(args, s1, &ctx->streamout_write_idx, (*idx)++);
835 }
836
837 /* A streamout buffer offset is loaded if the stride is non-zero. */
838 for (unsigned i = 0; i < 4; i++) {
839 if (!ctx->program->info->so.strides[i])
840 continue;
841
842 add_arg(args, s1, &ctx->streamout_offset[i], (*idx)++);
843 }
844 }
845
846 static bool needs_view_index_sgpr(isel_context *ctx)
847 {
848 switch (ctx->stage) {
849 case vertex_vs:
850 return ctx->program->info->needs_multiview_view_index || ctx->options->key.has_multiview_view_index;
851 case tess_eval_vs:
852 return ctx->program->info->needs_multiview_view_index && ctx->options->key.has_multiview_view_index;
853 case vertex_ls:
854 case vertex_es:
855 case vertex_tess_control_hs:
856 case vertex_geometry_gs:
857 case tess_control_hs:
858 case tess_eval_es:
859 case tess_eval_geometry_gs:
860 case geometry_gs:
861 return ctx->program->info->needs_multiview_view_index;
862 default:
863 return false;
864 }
865 }
866
867 static inline bool
868 add_fs_arg(isel_context *ctx, arg_info *args, unsigned &vgpr_idx, fs_input input, unsigned value, bool enable_next = false, RegClass rc = v1)
869 {
870 if (!ctx->fs_vgpr_args[input])
871 return false;
872
873 add_arg(args, rc, &ctx->fs_inputs[input], vgpr_idx);
874 vgpr_idx += rc.size();
875
876 if (enable_next) {
877 add_arg(args, rc, &ctx->fs_inputs[input + 1], vgpr_idx);
878 vgpr_idx += rc.size();
879 }
880
881 ctx->program->config->spi_ps_input_addr |= value;
882 ctx->program->config->spi_ps_input_ena |= value;
883 return true;
884 }
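/* Sketch of how the SPI bits accumulate (values taken from the calls in
 * add_startpgm() below): a fragment shader that only reads an input
 * interpolated at the pixel center plus gl_FragCoord.w would end up with
 *
 *   spi_ps_input_ena = S_0286CC_PERSP_CENTER_ENA(1) |
 *                      S_0286CC_POS_W_FLOAT_ENA(1);
 *
 * i.e. two VGPRs for the i/j barycentrics followed by one VGPR for pos.w, in
 * the same order in which add_fs_arg() is called.
 */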
885
886 Pseudo_instruction *add_startpgm(struct isel_context *ctx)
887 {
888 user_sgpr_info user_sgpr_info;
889 bool needs_view_index = needs_view_index_sgpr(ctx);
890 allocate_user_sgprs(ctx, needs_view_index, user_sgpr_info);
891 arg_info args = {};
892
893 /* this needs to be in sgprs 0 and 1 */
894 add_arg(&args, s2, &ctx->program->private_segment_buffer, 0);
895 set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_info.user_sgpr_idx);
896
897 unsigned vgpr_idx = 0;
898 switch (ctx->stage) {
899 case vertex_vs: {
900 declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets);
901 if (ctx->program->info->vs.has_vertex_buffers) {
902 add_arg(&args, s1, &ctx->vertex_buffers, user_sgpr_info.user_sgpr_idx);
903 set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS, &user_sgpr_info.user_sgpr_idx);
904 }
905 add_arg(&args, s1, &ctx->base_vertex, user_sgpr_info.user_sgpr_idx);
906 add_arg(&args, s1, &ctx->start_instance, user_sgpr_info.user_sgpr_idx + 1);
907 if (ctx->program->info->vs.needs_draw_id) {
908 add_arg(&args, s1, &ctx->draw_id, user_sgpr_info.user_sgpr_idx + 2);
909 set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, &user_sgpr_info.user_sgpr_idx, 3);
910 } else
911 set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, &user_sgpr_info.user_sgpr_idx, 2);
912
913 if (needs_view_index) {
914 add_arg(&args, s1, &ctx->view_index, user_sgpr_info.user_sgpr_idx);
915 set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_info.user_sgpr_idx, 1);
916 }
917
918 assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr);
919 unsigned idx = user_sgpr_info.user_sgpr_idx;
920 if (ctx->options->key.vs.out.as_es)
921 add_arg(&args, s1, &ctx->es2gs_offset, idx++);
922 else
923 declare_streamout_sgprs(ctx, &args, &idx);
924
925 add_arg(&args, s1, &ctx->program->scratch_offset, idx++);
926
927 declare_vs_input_vgprs(ctx, &args);
928 break;
929 }
930 case fragment_fs: {
931 declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets);
932
933 assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr);
934 add_arg(&args, s1, &ctx->prim_mask, user_sgpr_info.user_sgpr_idx);
935
936 add_arg(&args, s1, &ctx->program->scratch_offset, user_sgpr_info.user_sgpr_idx + 1);
937
938 ctx->program->config->spi_ps_input_addr = 0;
939 ctx->program->config->spi_ps_input_ena = 0;
940
941 bool has_interp_mode = false;
942
943 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_sample_p1, S_0286CC_PERSP_SAMPLE_ENA(1), true);
944 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true);
945 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_centroid_p1, S_0286CC_PERSP_CENTROID_ENA(1), true);
946 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_pull_model, S_0286CC_PERSP_PULL_MODEL_ENA(1), false, v3);
947
948 if (!has_interp_mode && ctx->fs_vgpr_args[fs_input::frag_pos_3]) {
949 /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */
950 ctx->fs_vgpr_args[fs_input::persp_center_p1] = true;
951 has_interp_mode = add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true);
952 }
953
954 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_sample_p1, S_0286CC_LINEAR_SAMPLE_ENA(1), true);
955 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_center_p1, S_0286CC_LINEAR_CENTER_ENA(1), true);
956 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_centroid_p1, S_0286CC_LINEAR_CENTROID_ENA(1), true);
957 has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::line_stipple, S_0286CC_LINE_STIPPLE_TEX_ENA(1));
958
959 if (!has_interp_mode) {
960 /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */
961 ctx->fs_vgpr_args[fs_input::persp_center_p1] = true;
962 has_interp_mode = add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true);
963 }
964
965 add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_0, S_0286CC_POS_X_FLOAT_ENA(1));
966 add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_1, S_0286CC_POS_Y_FLOAT_ENA(1));
967 add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_2, S_0286CC_POS_Z_FLOAT_ENA(1));
968 add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_3, S_0286CC_POS_W_FLOAT_ENA(1));
969
970 add_fs_arg(ctx, &args, vgpr_idx, fs_input::front_face, S_0286CC_FRONT_FACE_ENA(1));
971 add_fs_arg(ctx, &args, vgpr_idx, fs_input::ancillary, S_0286CC_ANCILLARY_ENA(1));
972 add_fs_arg(ctx, &args, vgpr_idx, fs_input::sample_coverage, S_0286CC_SAMPLE_COVERAGE_ENA(1));
973 add_fs_arg(ctx, &args, vgpr_idx, fs_input::fixed_pt, S_0286CC_POS_FIXED_PT_ENA(1));
974
975 ASSERTED bool unset_interp_mode = !(ctx->program->config->spi_ps_input_addr & 0x7F) ||
976 (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_addr)
977 && !(ctx->program->config->spi_ps_input_addr & 0xF));
978
979 assert(has_interp_mode);
980 assert(!unset_interp_mode);
981 break;
982 }
983 case compute_cs: {
984 declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets);
985
986 if (ctx->program->info->cs.uses_grid_size) {
987 add_arg(&args, s3, &ctx->num_workgroups, user_sgpr_info.user_sgpr_idx);
988 set_loc_shader(ctx, AC_UD_CS_GRID_SIZE, &user_sgpr_info.user_sgpr_idx, 3);
989 }
990 assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr);
991 unsigned idx = user_sgpr_info.user_sgpr_idx;
992 for (unsigned i = 0; i < 3; i++) {
993 if (ctx->program->info->cs.uses_block_id[i])
994 add_arg(&args, s1, &ctx->workgroup_ids[i], idx++);
995 }
996
997 if (ctx->program->info->cs.uses_local_invocation_idx)
998 add_arg(&args, s1, &ctx->tg_size, idx++);
999 add_arg(&args, s1, &ctx->program->scratch_offset, idx++);
1000
1001 add_arg(&args, v3, &ctx->local_invocation_ids, vgpr_idx++);
1002 break;
1003 }
1004 default:
1005 unreachable("Shader stage not implemented");
1006 }
1007
1008 ctx->program->info->num_input_vgprs = 0;
1009 ctx->program->info->num_input_sgprs = args.num_sgprs_used;
1010 ctx->program->info->num_user_sgprs = user_sgpr_info.num_sgpr;
1011 ctx->program->info->num_input_vgprs = args.num_vgprs_used;
1012
1013 if (ctx->stage == fragment_fs) {
1014 /* Verify that we have a correct assumption about input VGPR count */
1015 ASSERTED unsigned input_vgpr_cnt = ac_get_fs_input_vgpr_cnt(ctx->program->config, nullptr, nullptr);
1016 assert(input_vgpr_cnt == ctx->program->info->num_input_vgprs);
1017 }
1018
1019 aco_ptr<Pseudo_instruction> startpgm{create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, args.count + 1)};
1020 for (unsigned i = 0; i < args.count; i++) {
1021 if (args.assign[i]) {
1022 *args.assign[i] = Temp{ctx->program->allocateId(), args.types[i]};
1023 startpgm->definitions[i] = Definition(*args.assign[i]);
1024 startpgm->definitions[i].setFixed(args.reg[i]);
1025 }
1026 }
1027 startpgm->definitions[args.count] = Definition{ctx->program->allocateId(), exec, s2};
1028 Pseudo_instruction *instr = startpgm.get();
1029 ctx->block->instructions.push_back(std::move(startpgm));
1030
1031 return instr;
1032 }
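/* add_startpgm() turns the collected arguments into definitions of a single
 * p_startpgm pseudo instruction, each fixed to the physical register chosen
 * above, plus one final s2 definition fixed to exec. Later passes then treat
 * these fixed definitions like ordinary temporaries.
 */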
1033
1034 int
1035 type_size(const struct glsl_type *type, bool bindless)
1036 {
1037 // TODO: don't we need type->std430_base_alignment() here?
1038 return glsl_count_attribute_slots(type, false);
1039 }
1040
1041 void
1042 shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
1043 {
1044 assert(glsl_type_is_vector_or_scalar(type));
1045
1046 uint32_t comp_size = glsl_type_is_boolean(type)
1047 ? 4 : glsl_get_bit_size(type) / 8;
1048 unsigned length = glsl_get_vector_elements(type);
 1049    *size = comp_size * length;
1050 *align = comp_size;
1051 }
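/* For example, a vec3 of 32-bit floats reports size 12 and alignment 4, and a
 * bvec2 reports size 8 and alignment 4, since booleans are stored as 32-bit
 * values in shared memory here.
 */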
1052
1053 int
1054 get_align(nir_variable_mode mode, bool is_store, unsigned bit_size, unsigned num_components)
1055 {
1056 /* TODO: ACO doesn't have good support for non-32-bit reads/writes yet */
1057 if (bit_size != 32)
1058 return -1;
1059
1060 switch (mode) {
1061 case nir_var_mem_ubo:
1062 case nir_var_mem_ssbo:
1063 //case nir_var_mem_push_const: enable with 1240!
1064 case nir_var_mem_shared:
1065 /* TODO: what are the alignment requirements for LDS? */
1066 return num_components <= 4 ? 4 : -1;
1067 default:
1068 return -1;
1069 }
1070 }
1071
1072 void
1073 setup_vs_variables(isel_context *ctx, nir_shader *nir)
1074 {
1075 nir_foreach_variable(variable, &nir->inputs)
1076 {
1077 variable->data.driver_location = variable->data.location * 4;
1078 }
1079 nir_foreach_variable(variable, &nir->outputs)
1080 {
1081 variable->data.driver_location = variable->data.location * 4;
1082 }
1083
1084 radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
1085
1086 memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
1087 sizeof(outinfo->vs_output_param_offset));
1088
1089 ctx->needs_instance_id = ctx->program->info->vs.needs_instance_id;
1090
1091 bool export_clip_dists = ctx->options->key.vs_common_out.export_clip_dists;
1092
1093 outinfo->param_exports = 0;
1094 int pos_written = 0x1;
1095 if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
1096 pos_written |= 1 << 1;
1097
1098 nir_foreach_variable(variable, &nir->outputs)
1099 {
1100 int idx = variable->data.location;
1101 unsigned slots = variable->type->count_attribute_slots(false);
1102 if (variable->data.compact) {
1103 unsigned component_count = variable->data.location_frac + variable->type->length;
1104 slots = (component_count + 3) / 4;
1105 }
1106
1107 if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
1108 ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
1109 for (unsigned i = 0; i < slots; i++) {
1110 if (outinfo->vs_output_param_offset[idx + i] == AC_EXP_PARAM_UNDEFINED)
1111 outinfo->vs_output_param_offset[idx + i] = outinfo->param_exports++;
1112 }
1113 }
1114 }
1115 if (outinfo->writes_layer &&
1116 outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
1117 /* when ctx->options->key.has_multiview_view_index = true, the layer
1118 * variable isn't declared in NIR and it's isel's job to get the layer */
1119 outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
1120 }
1121
1122 if (outinfo->export_prim_id) {
1123 assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
1124 outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
1125 }
1126
1127 ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
1128 ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
1129
1130 assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
1131
1132 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
1133 pos_written |= 1 << 2;
1134 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
1135 pos_written |= 1 << 3;
1136
1137 outinfo->pos_exports = util_bitcount(pos_written);
1138 }
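/* Worked example for the position export mask: a VS that writes gl_PointSize
 * and four clip distances (no cull distances) sets
 *
 *   pos_written = 0x1 | (1 << 1) | (1 << 2) = 0x7
 *
 * i.e. the position itself, the psize/layer/viewport word and the first
 * clip-distance vector, so outinfo->pos_exports == 3.
 */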
1139
1140 void
1141 setup_variables(isel_context *ctx, nir_shader *nir)
1142 {
1143 switch (nir->info.stage) {
1144 case MESA_SHADER_FRAGMENT: {
1145 nir_foreach_variable(variable, &nir->outputs)
1146 {
1147 int idx = variable->data.location + variable->data.index;
1148 variable->data.driver_location = idx * 4;
1149 }
1150 break;
1151 }
1152 case MESA_SHADER_COMPUTE: {
1153 ctx->program->config->lds_size = (nir->info.cs.shared_size + ctx->program->lds_alloc_granule - 1) /
1154 ctx->program->lds_alloc_granule;
1155 break;
1156 }
1157 case MESA_SHADER_VERTEX: {
1158 setup_vs_variables(ctx, nir);
1159 break;
1160 }
1161 default:
1162 unreachable("Unhandled shader stage.");
1163 }
1164 }
1165
1166 isel_context
1167 setup_isel_context(Program* program,
1168 unsigned shader_count,
1169 struct nir_shader *const *shaders,
1170 ac_shader_config* config,
1171 radv_shader_info *info,
1172 const radv_nir_compiler_options *options)
1173 {
1174 program->stage = 0;
1175 for (unsigned i = 0; i < shader_count; i++) {
1176 switch (shaders[i]->info.stage) {
1177 case MESA_SHADER_VERTEX:
1178 program->stage |= sw_vs;
1179 break;
1180 case MESA_SHADER_TESS_CTRL:
1181 program->stage |= sw_tcs;
1182 break;
1183 case MESA_SHADER_TESS_EVAL:
1184 program->stage |= sw_tes;
1185 break;
1186 case MESA_SHADER_GEOMETRY:
1187 program->stage |= sw_gs;
1188 break;
1189 case MESA_SHADER_FRAGMENT:
1190 program->stage |= sw_fs;
1191 break;
1192 case MESA_SHADER_COMPUTE:
1193 program->stage |= sw_cs;
1194 break;
1195 default:
1196 unreachable("Shader stage not implemented");
1197 }
1198 }
1199 if (program->stage == sw_vs)
1200 program->stage |= hw_vs;
1201 else if (program->stage == sw_fs)
1202 program->stage |= hw_fs;
1203 else if (program->stage == sw_cs)
1204 program->stage |= hw_cs;
1205 else
1206 unreachable("Shader stage not implemented");
1207
1208 program->config = config;
1209 program->info = info;
1210 program->chip_class = options->chip_class;
1211 program->family = options->family;
1212 program->wave_size = info->wave_size;
1213
1214 program->lds_alloc_granule = options->chip_class >= GFX7 ? 512 : 256;
1215 program->lds_limit = options->chip_class >= GFX7 ? 65536 : 32768;
1216 program->vgpr_limit = 256;
1217
1218 if (options->chip_class >= GFX10) {
1219 program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */
1220 program->sgpr_alloc_granule = 127;
1221 program->sgpr_limit = 106;
1222 } else if (program->chip_class >= GFX8) {
1223 program->physical_sgprs = 800;
1224 program->sgpr_alloc_granule = 15;
1225 if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND)
1226 program->sgpr_limit = 94; /* workaround hardware bug */
1227 else
1228 program->sgpr_limit = 102;
1229 } else {
1230 program->physical_sgprs = 512;
1231 program->sgpr_alloc_granule = 7;
1232 program->sgpr_limit = 104;
1233 }
1234 /* TODO: we don't have to allocate VCC if we don't need it */
1235 program->needs_vcc = true;
1236
1237 for (unsigned i = 0; i < MAX_SETS; ++i)
1238 program->info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
1239 for (unsigned i = 0; i < AC_UD_MAX_UD; ++i)
1240 program->info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;
1241
1242 isel_context ctx = {};
1243 ctx.program = program;
1244 ctx.options = options;
1245 ctx.stage = program->stage;
1246
1247 for (unsigned i = 0; i < fs_input::max_inputs; ++i)
1248 ctx.fs_inputs[i] = Temp(0, v1);
1249 ctx.fs_inputs[fs_input::persp_pull_model] = Temp(0, v3);
1250 for (unsigned i = 0; i < MAX_SETS; ++i)
1251 ctx.descriptor_sets[i] = Temp(0, s1);
1252 for (unsigned i = 0; i < MAX_INLINE_PUSH_CONSTS; ++i)
1253 ctx.inline_push_consts[i] = Temp(0, s1);
1254 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
1255 for (unsigned j = 0; j < 4; ++j)
1256 ctx.vs_output.outputs[i][j] = Temp(0, v1);
1257 }
1258
1259 for (unsigned i = 0; i < shader_count; i++) {
1260 nir_shader *nir = shaders[i];
1261
1262 /* align and copy constant data */
1263 while (program->constant_data.size() % 4u)
1264 program->constant_data.push_back(0);
1265 ctx.constant_data_offset = program->constant_data.size();
1266 program->constant_data.insert(program->constant_data.end(),
1267 (uint8_t*)nir->constant_data,
1268 (uint8_t*)nir->constant_data + nir->constant_data_size);
1269
1270 /* the variable setup has to be done before lower_io / CSE */
1271 if (nir->info.stage == MESA_SHADER_COMPUTE)
1272 nir_lower_vars_to_explicit_types(nir, nir_var_mem_shared, shared_var_info);
1273 setup_variables(&ctx, nir);
1274
1275 /* optimize and lower memory operations */
1276 bool lower_to_scalar = false;
1277 bool lower_pack = false;
1278 // TODO: uncomment this once !1240 is merged
1279 /*if (nir_opt_load_store_vectorize(nir,
1280 (nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo |
1281 nir_var_mem_push_const | nir_var_mem_shared),
1282 get_align)) {
1283 lower_to_scalar = true;
1284 lower_pack = true;
1285 }*/
1286 if (nir->info.stage == MESA_SHADER_COMPUTE)
1287 lower_to_scalar |= nir_lower_explicit_io(nir, nir_var_mem_shared, nir_address_format_32bit_offset);
1288 else
1289 nir_lower_io(nir, (nir_variable_mode)(nir_var_shader_in | nir_var_shader_out), type_size, (nir_lower_io_options)0);
1290 nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);
1291
1292 if (lower_to_scalar)
1293 nir_lower_alu_to_scalar(nir, NULL, NULL);
1294 if (lower_pack)
1295 nir_lower_pack(nir);
1296
1297 /* lower ALU operations */
1298 // TODO: implement logic64 in aco, it's more effective for sgprs
1299 nir_lower_int64(nir, nir->options->lower_int64_options);
1300
1301 nir_opt_idiv_const(nir, 32);
1302 nir_lower_idiv(nir, nir_lower_idiv_precise);
1303
1304 /* optimize the lowered ALU operations */
1305 bool more_algebraic = true;
1306 while (more_algebraic) {
1307 more_algebraic = false;
1308 NIR_PASS_V(nir, nir_copy_prop);
1309 NIR_PASS_V(nir, nir_opt_dce);
1310 NIR_PASS_V(nir, nir_opt_constant_folding);
1311 NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
1312 }
1313
1314 /* Do late algebraic optimization to turn add(a, neg(b)) back into
1315 * subs, then the mandatory cleanup after algebraic. Note that it may
1316 * produce fnegs, and if so then we need to keep running to squash
1317 * fneg(fneg(a)).
1318 */
1319 bool more_late_algebraic = true;
1320 while (more_late_algebraic) {
1321 more_late_algebraic = false;
1322 NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
1323 NIR_PASS_V(nir, nir_opt_constant_folding);
1324 NIR_PASS_V(nir, nir_copy_prop);
1325 NIR_PASS_V(nir, nir_opt_dce);
1326 NIR_PASS_V(nir, nir_opt_cse);
1327 }
1328
1329 /* cleanup passes */
1330 nir_lower_load_const_to_scalar(nir);
1331 nir_opt_shrink_load(nir);
1332 nir_move_options move_opts = (nir_move_options)(
1333 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | nir_move_comparisons);
1334 nir_opt_sink(nir, move_opts);
1335 nir_opt_move(nir, move_opts);
1336 nir_convert_to_lcssa(nir, true, false);
1337 nir_lower_phis_to_scalar(nir);
1338
1339 nir_function_impl *func = nir_shader_get_entrypoint(nir);
1340 nir_index_ssa_defs(func);
1341
1342 if (options->dump_preoptir) {
1343 fprintf(stderr, "NIR shader before instruction selection:\n");
1344 nir_print_shader(nir, stderr);
1345 }
1346 }
1347
1348 unsigned scratch_size = 0;
1349 for (unsigned i = 0; i < shader_count; i++)
1350 scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
1351 ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.program->wave_size, 1024);
1352
1353 ctx.block = ctx.program->create_and_insert_block();
1354 ctx.block->loop_nest_depth = 0;
1355 ctx.block->kind = block_kind_top_level;
1356
1357 return ctx;
1358 }
1359
1360 }