src/amd/compiler/aco_instruction_selection_setup.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #include <array>
  26 #include <unordered_map>
  27 #include "aco_ir.h"
  28 #include "nir.h"
  29 #include "nir_control_flow.h"
  30 #include "vulkan/radv_shader.h"
  31 #include "vulkan/radv_descriptor_set.h"
  32 #include "vulkan/radv_shader_args.h"
  33 #include "sid.h"
  34 #include "ac_exp_param.h"
  35 #include "ac_shader_util.h"
  36
  37 #include "util/u_math.h"
  38
  39 #define MAX_INLINE_PUSH_CONSTS 8
  40
  41 namespace aco {
  42
  43 struct shader_io_state {
  44    uint8_t mask[VARYING_SLOT_MAX];
  45    Temp temps[VARYING_SLOT_MAX * 4u];
  46
  47    shader_io_state() {
  48       memset(mask, 0, sizeof(mask));
  49       std::fill_n(temps, VARYING_SLOT_MAX * 4u, Temp(0, RegClass::v1));
  50    }
  51 };
  52
  53 struct isel_context {
  54    const struct radv_nir_compiler_options *options;
  55    struct radv_shader_args *args;
  56    Program *program;
  57    nir_shader *shader;
  58    uint32_t constant_data_offset;
  59    Block *block;
  60    bool *divergent_vals;
  61    std::unique_ptr<Temp[]> allocated;
  62    std::unordered_map<unsigned, std::array<Temp,NIR_MAX_VEC_COMPONENTS>> allocated_vec;
  63    Stage stage; /* Stage */
  64    bool has_gfx10_wave64_bpermute = false;
  65    struct {
  66       bool has_branch;
  67       uint16_t loop_nest_depth = 0;
  68       struct {
  69          unsigned header_idx;
  70          Block* exit;
  71          bool has_divergent_continue = false;
  72          bool has_divergent_branch = false;
  73       } parent_loop;
  74       struct {
  75          bool is_divergent = false;
  76       } parent_if;
  77       bool exec_potentially_empty_discard = false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */
  78       uint16_t exec_potentially_empty_break_depth = UINT16_MAX;
  79       /* Set to false when loop_nest_depth==exec_potentially_empty_break_depth
  80        * and parent_if.is_divergent==false. Called _break but it's also used for
  81        * loop continues. */
  82       bool exec_potentially_empty_break = false;
  83       std::unique_ptr<unsigned[]> nir_to_aco; /* NIR block index to ACO block index */
  84    } cf_info;
  85
  86    Temp arg_temps[AC_MAX_ARGS];
  87
  88    /* FS inputs */
  89    Temp persp_centroid, linear_centroid;
  90
  91    /* GS inputs */
  92    Temp gs_wave_id;
  93
  94    /* gathered information */
  95    uint64_t input_masks[MESA_SHADER_COMPUTE];
  96    uint64_t output_masks[MESA_SHADER_COMPUTE];
  97
  98    /* VS output information */
  99    bool export_clip_dists;
 100    unsigned num_clip_distances;
 101    unsigned num_cull_distances;
 102
 103    /* tessellation information */
 104    unsigned tcs_tess_lvl_out_loc;
 105    unsigned tcs_tess_lvl_in_loc;
 106    uint32_t tcs_num_inputs;
 107    uint32_t tcs_num_patches;
 108    bool tcs_in_out_eq = false;
 109
 110    /* I/O information */
 111    shader_io_state inputs;
 112    shader_io_state outputs;
 113 };
 114
 115 Temp get_arg(isel_context *ctx, struct ac_arg arg)
 116 {
 117    assert(arg.used);
 118    return ctx->arg_temps[arg.arg_index];
 119 }
 120
 121 unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp)
 122 {
 123    switch (interp) {
 124    case INTERP_MODE_SMOOTH:
 125    case INTERP_MODE_NONE:
 126       if (intrin == nir_intrinsic_load_barycentric_pixel ||
 127           intrin == nir_intrinsic_load_barycentric_at_sample ||
 128           intrin == nir_intrinsic_load_barycentric_at_offset)
 129          return S_0286CC_PERSP_CENTER_ENA(1);
 130       else if (intrin == nir_intrinsic_load_barycentric_centroid)
 131          return S_0286CC_PERSP_CENTROID_ENA(1);
 132       else if (intrin == nir_intrinsic_load_barycentric_sample)
 133          return S_0286CC_PERSP_SAMPLE_ENA(1);
 134       break;
 135    case INTERP_MODE_NOPERSPECTIVE:
 136       if (intrin == nir_intrinsic_load_barycentric_pixel)
 137          return S_0286CC_LINEAR_CENTER_ENA(1);
 138       else if (intrin == nir_intrinsic_load_barycentric_centroid)
 139          return S_0286CC_LINEAR_CENTROID_ENA(1);
 140       else if (intrin == nir_intrinsic_load_barycentric_sample)
 141          return S_0286CC_LINEAR_SAMPLE_ENA(1);
 142       break;
 143    default:
 144       break;
 145    }
 146    return 0;
 147 }
 148
 149 /* If one side of a divergent IF ends in a branch and the other doesn't, we
 150  * might have to emit the contents of the side without the branch at the merge
 151  * block instead. This is so that we can use any SGPR live-out of the side
 152  * without the branch without creating a linear phi in the invert or merge block. */
 153 bool
 154 sanitize_if(nir_function_impl *impl, bool *divergent, nir_if *nif)
 155 {
 156    if (!divergent[nif->condition.ssa->index])
 157       return false;
 158
 159    nir_block *then_block = nir_if_last_then_block(nif);
 160    nir_block *else_block = nir_if_last_else_block(nif);
 161    bool then_jump = nir_block_ends_in_jump(then_block) || nir_block_is_unreachable(then_block);
 162    bool else_jump = nir_block_ends_in_jump(else_block) || nir_block_is_unreachable(else_block);
 163    if (then_jump == else_jump)
 164       return false;
 165
 166    /* If the continue from block is empty then return as there is nothing to
 167     * move.
 168     */
 169    if (nir_cf_list_is_empty_block(else_jump ? &nif->then_list : &nif->else_list))
 170       return false;
 171
 172    /* Even though this if statement has a jump on one side, we may still have
 173     * phis afterwards.  Single-source phis can be produced by loop unrolling
 174     * or dead control-flow passes and are perfectly legal.  Run a quick phi
 175     * removal on the block after the if to clean up any such phis.
 176     */
 177    nir_opt_remove_phis_block(nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));
 178
 179    /* Finally, move the continue from branch after the if-statement. */
 180    nir_block *last_continue_from_blk = else_jump ? then_block : else_block;
 181    nir_block *first_continue_from_blk = else_jump ?
 182       nir_if_first_then_block(nif) : nir_if_first_else_block(nif);
 183
 184    nir_cf_list tmp;
 185    nir_cf_extract(&tmp, nir_before_block(first_continue_from_blk),
 186                         nir_after_block(last_continue_from_blk));
 187    nir_cf_reinsert(&tmp, nir_after_cf_node(&nif->cf_node));
 188
 189    /* nir_cf_extract() invalidates dominance metadata, but it should still be
 190     * correct because of the specific type of transformation we did. Block
 191     * indices are not valid except for block_0's, which is all we care about for
 192     * nir_block_is_unreachable(). */
 193    impl->valid_metadata =
 194       (nir_metadata)(impl->valid_metadata | nir_metadata_dominance | nir_metadata_block_index);
 195
 196    return true;
 197 }
 198
 199 bool
 200 sanitize_cf_list(nir_function_impl *impl, bool *divergent, struct exec_list *cf_list)
 201 {
 202    bool progress = false;
 203    foreach_list_typed(nir_cf_node, cf_node, node, cf_list) {
 204       switch (cf_node->type) {
 205       case nir_cf_node_block:
 206          break;
 207       case nir_cf_node_if: {
 208          nir_if *nif = nir_cf_node_as_if(cf_node);
 209          progress |= sanitize_cf_list(impl, divergent, &nif->then_list);
 210          progress |= sanitize_cf_list(impl, divergent, &nif->else_list);
 211          progress |= sanitize_if(impl, divergent, nif);
 212          break;
 213       }
 214       case nir_cf_node_loop: {
 215          nir_loop *loop = nir_cf_node_as_loop(cf_node);
 216          progress |= sanitize_cf_list(impl, divergent, &loop->body);
 217          break;
 218       }
 219       case nir_cf_node_function:
 220          unreachable("Invalid cf type");
 221       }
 222    }
 223
 224    return progress;
 225 }
 226
 227 void init_context(isel_context *ctx, nir_shader *shader)
 228 {
 229    nir_function_impl *impl = nir_shader_get_entrypoint(shader);
 230    unsigned lane_mask_size = ctx->program->lane_mask.size();
 231
 232    ctx->shader = shader;
 233    ctx->divergent_vals = nir_divergence_analysis(shader, nir_divergence_view_index_uniform);
 234
 235    /* sanitize control flow */
 236    nir_metadata_require(impl, nir_metadata_dominance);
 237    sanitize_cf_list(impl, ctx->divergent_vals, &impl->body);
 238    nir_metadata_preserve(impl, (nir_metadata)~nir_metadata_block_index);
 239
 240    /* we'll need this for isel */
 241    nir_metadata_require(impl, nir_metadata_block_index);
 242
 243    if (!(ctx->stage & sw_gs_copy) && ctx->options->dump_preoptir) {
 244       fprintf(stderr, "NIR shader before instruction selection:\n");
 245       nir_print_shader(shader, stderr);
 246    }
 247
 248    std::unique_ptr<Temp[]> allocated{new Temp[impl->ssa_alloc]()};
 249
 250    unsigned spi_ps_inputs = 0;
 251
 252    std::unique_ptr<unsigned[]> nir_to_aco{new unsigned[impl->num_blocks]()};
 253
 254    bool done = false;
 255    while (!done) {
 256       done = true;
 257       nir_foreach_block(block, impl) {
 258          nir_foreach_instr(instr, block) {
 259             switch(instr->type) {
 260             case nir_instr_type_alu: {
 261                nir_alu_instr *alu_instr = nir_instr_as_alu(instr);
 262                unsigned size =  alu_instr->dest.dest.ssa.num_components;
 263                if (alu_instr->dest.dest.ssa.bit_size == 64)
 264                   size *= 2;
 265                RegType type = RegType::sgpr;
 266                switch(alu_instr->op) {
 267                   case nir_op_fmul:
 268                   case nir_op_fadd:
 269                   case nir_op_fsub:
 270                   case nir_op_fmax:
 271                   case nir_op_fmin:
 272                   case nir_op_fmax3:
 273                   case nir_op_fmin3:
 274                   case nir_op_fmed3:
 275                   case nir_op_fneg:
 276                   case nir_op_fabs:
 277                   case nir_op_fsat:
 278                   case nir_op_fsign:
 279                   case nir_op_frcp:
 280                   case nir_op_frsq:
 281                   case nir_op_fsqrt:
 282                   case nir_op_fexp2:
 283                   case nir_op_flog2:
 284                   case nir_op_ffract:
 285                   case nir_op_ffloor:
 286                   case nir_op_fceil:
 287                   case nir_op_ftrunc:
 288                   case nir_op_fround_even:
 289                   case nir_op_fsin:
 290                   case nir_op_fcos:
 291                   case nir_op_f2f32:
 292                   case nir_op_f2f64:
 293                   case nir_op_u2f32:
 294                   case nir_op_u2f64:
 295                   case nir_op_i2f32:
 296                   case nir_op_i2f64:
 297                   case nir_op_pack_half_2x16:
 298                   case nir_op_unpack_half_2x16_split_x:
 299                   case nir_op_unpack_half_2x16_split_y:
 300                   case nir_op_fddx:
 301                   case nir_op_fddy:
 302                   case nir_op_fddx_fine:
 303                   case nir_op_fddy_fine:
 304                   case nir_op_fddx_coarse:
 305                   case nir_op_fddy_coarse:
 306                   case nir_op_fquantize2f16:
 307                   case nir_op_ldexp:
 308                   case nir_op_frexp_sig:
 309                   case nir_op_frexp_exp:
 310                   case nir_op_cube_face_index:
 311                   case nir_op_cube_face_coord:
 312                      type = RegType::vgpr;
 313                      break;
 314                   case nir_op_flt:
 315                   case nir_op_fge:
 316                   case nir_op_feq:
 317                   case nir_op_fne:
 318                   case nir_op_ilt:
 319                   case nir_op_ige:
 320                   case nir_op_ult:
 321                   case nir_op_uge:
 322                   case nir_op_ieq:
 323                   case nir_op_ine:
 324                   case nir_op_i2b1:
 325                      size = lane_mask_size;
 326                      break;
 327                   case nir_op_f2i64:
 328                   case nir_op_f2u64:
 329                   case nir_op_b2i32:
 330                   case nir_op_b2f32:
 331                   case nir_op_f2i32:
 332                   case nir_op_f2u32:
 333                      type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr;
 334                      break;
 335                   case nir_op_bcsel:
 336                      if (alu_instr->dest.dest.ssa.bit_size == 1) {
 337                         size = lane_mask_size;
 338                      } else {
 339                         if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) {
 340                            type = RegType::vgpr;
 341                         } else {
 342                            if (allocated[alu_instr->src[1].src.ssa->index].type() == RegType::vgpr ||
 343                                allocated[alu_instr->src[2].src.ssa->index].type() == RegType::vgpr) {
 344                               type = RegType::vgpr;
 345                            }
 346                         }
 347                         if (alu_instr->src[1].src.ssa->num_components == 1 && alu_instr->src[2].src.ssa->num_components == 1) {
 348                            assert(allocated[alu_instr->src[1].src.ssa->index].size() == allocated[alu_instr->src[2].src.ssa->index].size());
 349                            size = allocated[alu_instr->src[1].src.ssa->index].size();
 350                         }
 351                      }
 352                      break;
 353                   case nir_op_mov:
 354                      if (alu_instr->dest.dest.ssa.bit_size == 1) {
 355                         size = lane_mask_size;
 356                      } else {
 357                         type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr;
 358                      }
 359                      break;
 360                   default:
 361                      if (alu_instr->dest.dest.ssa.bit_size == 1) {
 362                         size = lane_mask_size;
 363                      } else {
 364                         for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
 365                            if (allocated[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
 366                               type = RegType::vgpr;
 367                         }
 368                      }
 369                      break;
 370                }
 371                allocated[alu_instr->dest.dest.ssa.index] = Temp(0, RegClass(type, size));
 372                break;
 373             }
 374             case nir_instr_type_load_const: {
 375                unsigned size = nir_instr_as_load_const(instr)->def.num_components;
 376                if (nir_instr_as_load_const(instr)->def.bit_size == 64)
 377                   size *= 2;
 378                else if (nir_instr_as_load_const(instr)->def.bit_size == 1)
 379                   size *= lane_mask_size;
 380                allocated[nir_instr_as_load_const(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size));
 381                break;
 382             }
 383             case nir_instr_type_intrinsic: {
 384                nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
 385                if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest)
 386                   break;
 387                unsigned size =  intrinsic->dest.ssa.num_components;
 388                if (intrinsic->dest.ssa.bit_size == 64)
 389                   size *= 2;
 390                RegType type = RegType::sgpr;
 391                switch(intrinsic->intrinsic) {
 392                   case nir_intrinsic_load_push_constant:
 393                   case nir_intrinsic_load_work_group_id:
 394                   case nir_intrinsic_load_num_work_groups:
 395                   case nir_intrinsic_load_subgroup_id:
 396                   case nir_intrinsic_load_num_subgroups:
 397                   case nir_intrinsic_load_first_vertex:
 398                   case nir_intrinsic_load_base_instance:
 399                   case nir_intrinsic_get_buffer_size:
 400                   case nir_intrinsic_vote_all:
 401                   case nir_intrinsic_vote_any:
 402                   case nir_intrinsic_read_first_invocation:
 403                   case nir_intrinsic_read_invocation:
 404                   case nir_intrinsic_first_invocation:
 405                      type = RegType::sgpr;
 406                      if (intrinsic->dest.ssa.bit_size == 1)
 407                         size = lane_mask_size;
 408                      break;
 409                   case nir_intrinsic_ballot:
 410                      type = RegType::sgpr;
 411                      break;
 412                   case nir_intrinsic_load_sample_id:
 413                   case nir_intrinsic_load_sample_mask_in:
 414                   case nir_intrinsic_load_input:
 415                   case nir_intrinsic_load_output:
 416                   case nir_intrinsic_load_input_vertex:
 417                   case nir_intrinsic_load_per_vertex_input:
 418                   case nir_intrinsic_load_per_vertex_output:
 419                   case nir_intrinsic_load_vertex_id:
 420                   case nir_intrinsic_load_vertex_id_zero_base:
 421                   case nir_intrinsic_load_barycentric_sample:
 422                   case nir_intrinsic_load_barycentric_pixel:
 423                   case nir_intrinsic_load_barycentric_model:
 424                   case nir_intrinsic_load_barycentric_centroid:
 425                   case nir_intrinsic_load_barycentric_at_sample:
 426                   case nir_intrinsic_load_barycentric_at_offset:
 427                   case nir_intrinsic_load_interpolated_input:
 428                   case nir_intrinsic_load_frag_coord:
 429                   case nir_intrinsic_load_sample_pos:
 430                   case nir_intrinsic_load_layer_id:
 431                   case nir_intrinsic_load_local_invocation_id:
 432                   case nir_intrinsic_load_local_invocation_index:
 433                   case nir_intrinsic_load_subgroup_invocation:
 434                   case nir_intrinsic_load_tess_coord:
 435                   case nir_intrinsic_write_invocation_amd:
 436                   case nir_intrinsic_mbcnt_amd:
 437                   case nir_intrinsic_load_instance_id:
 438                   case nir_intrinsic_ssbo_atomic_add:
 439                   case nir_intrinsic_ssbo_atomic_imin:
 440                   case nir_intrinsic_ssbo_atomic_umin:
 441                   case nir_intrinsic_ssbo_atomic_imax:
 442                   case nir_intrinsic_ssbo_atomic_umax:
 443                   case nir_intrinsic_ssbo_atomic_and:
 444                   case nir_intrinsic_ssbo_atomic_or:
 445                   case nir_intrinsic_ssbo_atomic_xor:
 446                   case nir_intrinsic_ssbo_atomic_exchange:
 447                   case nir_intrinsic_ssbo_atomic_comp_swap:
 448                   case nir_intrinsic_global_atomic_add:
 449                   case nir_intrinsic_global_atomic_imin:
 450                   case nir_intrinsic_global_atomic_umin:
 451                   case nir_intrinsic_global_atomic_imax:
 452                   case nir_intrinsic_global_atomic_umax:
 453                   case nir_intrinsic_global_atomic_and:
 454                   case nir_intrinsic_global_atomic_or:
 455                   case nir_intrinsic_global_atomic_xor:
 456                   case nir_intrinsic_global_atomic_exchange:
 457                   case nir_intrinsic_global_atomic_comp_swap:
 458                   case nir_intrinsic_image_deref_atomic_add:
 459                   case nir_intrinsic_image_deref_atomic_umin:
 460                   case nir_intrinsic_image_deref_atomic_imin:
 461                   case nir_intrinsic_image_deref_atomic_umax:
 462                   case nir_intrinsic_image_deref_atomic_imax:
 463                   case nir_intrinsic_image_deref_atomic_and:
 464                   case nir_intrinsic_image_deref_atomic_or:
 465                   case nir_intrinsic_image_deref_atomic_xor:
 466                   case nir_intrinsic_image_deref_atomic_exchange:
 467                   case nir_intrinsic_image_deref_atomic_comp_swap:
 468                   case nir_intrinsic_image_deref_size:
 469                   case nir_intrinsic_shared_atomic_add:
 470                   case nir_intrinsic_shared_atomic_imin:
 471                   case nir_intrinsic_shared_atomic_umin:
 472                   case nir_intrinsic_shared_atomic_imax:
 473                   case nir_intrinsic_shared_atomic_umax:
 474                   case nir_intrinsic_shared_atomic_and:
 475                   case nir_intrinsic_shared_atomic_or:
 476                   case nir_intrinsic_shared_atomic_xor:
 477                   case nir_intrinsic_shared_atomic_exchange:
 478                   case nir_intrinsic_shared_atomic_comp_swap:
 479                   case nir_intrinsic_load_scratch:
 480                   case nir_intrinsic_load_invocation_id:
 481                   case nir_intrinsic_load_primitive_id:
 482                      type = RegType::vgpr;
 483                      break;
 484                   case nir_intrinsic_shuffle:
 485                   case nir_intrinsic_quad_broadcast:
 486                   case nir_intrinsic_quad_swap_horizontal:
 487                   case nir_intrinsic_quad_swap_vertical:
 488                   case nir_intrinsic_quad_swap_diagonal:
 489                   case nir_intrinsic_quad_swizzle_amd:
 490                   case nir_intrinsic_masked_swizzle_amd:
 491                   case nir_intrinsic_inclusive_scan:
 492                   case nir_intrinsic_exclusive_scan:
 493                      if (intrinsic->dest.ssa.bit_size == 1) {
 494                         size = lane_mask_size;
 495                         type = RegType::sgpr;
 496                      } else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) {
 497                         type = RegType::sgpr;
 498                      } else {
 499                         type = RegType::vgpr;
 500                      }
 501                      break;
 502                   case nir_intrinsic_load_view_index:
 503                      type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr;
 504                      break;
 505                   case nir_intrinsic_load_front_face:
 506                   case nir_intrinsic_load_helper_invocation:
 507                   case nir_intrinsic_is_helper_invocation:
 508                      type = RegType::sgpr;
 509                      size = lane_mask_size;
 510                      break;
 511                   case nir_intrinsic_reduce:
 512                      if (intrinsic->dest.ssa.bit_size == 1) {
 513                         size = lane_mask_size;
 514                         type = RegType::sgpr;
 515                      } else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) {
 516                         type = RegType::sgpr;
 517                      } else {
 518                         type = RegType::vgpr;
 519                      }
 520                      break;
 521                   case nir_intrinsic_load_ubo:
 522                   case nir_intrinsic_load_ssbo:
 523                   case nir_intrinsic_load_global:
 524                   case nir_intrinsic_vulkan_resource_index:
 525                      type = ctx->divergent_vals[intrinsic->dest.ssa.index] ? RegType::vgpr : RegType::sgpr;
 526                      break;
 527                   /* due to copy propagation, the swizzled imov is removed if num dest components == 1 */
 528                   case nir_intrinsic_load_shared:
 529                      if (ctx->divergent_vals[intrinsic->dest.ssa.index])
 530                         type = RegType::vgpr;
 531                      else
 532                         type = RegType::sgpr;
 533                      break;
 534                   default:
 535                      for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; i++) {
 536                         if (allocated[intrinsic->src[i].ssa->index].type() == RegType::vgpr)
 537                            type = RegType::vgpr;
 538                      }
 539                      break;
 540                }
 541                allocated[intrinsic->dest.ssa.index] = Temp(0, RegClass(type, size));
 542
 543                switch(intrinsic->intrinsic) {
 544                   case nir_intrinsic_load_barycentric_sample:
 545                   case nir_intrinsic_load_barycentric_pixel:
 546                   case nir_intrinsic_load_barycentric_centroid:
 547                   case nir_intrinsic_load_barycentric_at_sample:
 548                   case nir_intrinsic_load_barycentric_at_offset: {
 549                      glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic);
 550                      spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode);
 551                      break;
 552                   }
 553                   case nir_intrinsic_load_barycentric_model:
 554                      spi_ps_inputs |= S_0286CC_PERSP_PULL_MODEL_ENA(1);
 555                      break;
 556                   case nir_intrinsic_load_front_face:
 557                      spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1);
 558                      break;
 559                   case nir_intrinsic_load_frag_coord:
 560                   case nir_intrinsic_load_sample_pos: {
 561                      uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa);
 562                      for (unsigned i = 0; i < 4; i++) {
 563                         if (mask & (1 << i))
 564                            spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i;
 565
 566                      }
 567                      break;
 568                   }
 569                   case nir_intrinsic_load_sample_id:
 570                      spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
 571                      break;
 572                   case nir_intrinsic_load_sample_mask_in:
 573                      spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
 574                      spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1);
 575                      break;
 576                   default:
 577                      break;
 578                }
 579                break;
 580             }
 581             case nir_instr_type_tex: {
 582                nir_tex_instr* tex = nir_instr_as_tex(instr);
 583                unsigned size = tex->dest.ssa.num_components;
 584
 585                if (tex->dest.ssa.bit_size == 64)
 586                   size *= 2;
 587                if (tex->op == nir_texop_texture_samples)
 588                   assert(!ctx->divergent_vals[tex->dest.ssa.index]);
 589                if (ctx->divergent_vals[tex->dest.ssa.index])
 590                   allocated[tex->dest.ssa.index] = Temp(0, RegClass(RegType::vgpr, size));
 591                else
 592                   allocated[tex->dest.ssa.index] = Temp(0, RegClass(RegType::sgpr, size));
 593                break;
 594             }
 595             case nir_instr_type_parallel_copy: {
 596                nir_foreach_parallel_copy_entry(entry, nir_instr_as_parallel_copy(instr)) {
 597                   allocated[entry->dest.ssa.index] = allocated[entry->src.ssa->index];
 598                }
 599                break;
 600             }
 601             case nir_instr_type_ssa_undef: {
 602                unsigned size = nir_instr_as_ssa_undef(instr)->def.num_components;
 603                if (nir_instr_as_ssa_undef(instr)->def.bit_size == 64)
 604                   size *= 2;
 605                else if (nir_instr_as_ssa_undef(instr)->def.bit_size == 1)
 606                   size *= lane_mask_size;
 607                allocated[nir_instr_as_ssa_undef(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size));
 608                break;
 609             }
 610             case nir_instr_type_phi: {
 611                nir_phi_instr* phi = nir_instr_as_phi(instr);
 612                RegType type;
 613                unsigned size = phi->dest.ssa.num_components;
 614
 615                if (phi->dest.ssa.bit_size == 1) {
 616                   assert(size == 1 && "multiple components not yet supported on boolean phis.");
 617                   type = RegType::sgpr;
 618                   size *= lane_mask_size;
 619                   allocated[phi->dest.ssa.index] = Temp(0, RegClass(type, size));
 620                   break;
 621                }
 622
 623                if (ctx->divergent_vals[phi->dest.ssa.index]) {
 624                   type = RegType::vgpr;
 625                } else {
 626                   type = RegType::sgpr;
 627                   nir_foreach_phi_src (src, phi) {
 628                      if (allocated[src->src.ssa->index].type() == RegType::vgpr)
 629                         type = RegType::vgpr;
 630                      if (allocated[src->src.ssa->index].type() == RegType::none)
 631                         done = false;
 632                   }
 633                }
 634
 635                size *= phi->dest.ssa.bit_size == 64 ? 2 : 1;
 636                RegClass rc = RegClass(type, size);
 637                if (rc != allocated[phi->dest.ssa.index].regClass()) {
 638                   done = false;
 639                } else {
 640                   nir_foreach_phi_src(src, phi)
 641                      assert(allocated[src->src.ssa->index].size() == rc.size());
 642                }
 643                allocated[phi->dest.ssa.index] = Temp(0, rc);
 644                break;
 645             }
 646             default:
 647                break;
 648             }
 649          }
 650       }
 651    }
 652
 653    if (G_0286CC_POS_W_FLOAT_ENA(spi_ps_inputs)) {
 654       /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */
 655       spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1);
 656    }
 657
 658    if (!(spi_ps_inputs & 0x7F)) {
 659       /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */
 660       spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1);
 661    }
 662
 663    ctx->program->config->spi_ps_input_ena = spi_ps_inputs;
 664    ctx->program->config->spi_ps_input_addr = spi_ps_inputs;
 665
 666    for (unsigned i = 0; i < impl->ssa_alloc; i++)
 667       allocated[i] = Temp(ctx->program->allocateId(), allocated[i].regClass());
 668
 669    ctx->allocated.reset(allocated.release());
 670    ctx->cf_info.nir_to_aco.reset(nir_to_aco.release());
 671 }
 672
 673 Pseudo_instruction *add_startpgm(struct isel_context *ctx)
 674 {
 675    unsigned arg_count = ctx->args->ac.arg_count;
 676    if (ctx->stage == fragment_fs) {
 677       /* LLVM optimizes away unused FS inputs and computes spi_ps_input_addr
 678        * itself and then communicates the results back via the ELF binary.
 679        * Mirror what LLVM does by re-mapping the VGPR arguments here.
 680        *
 681        * TODO: If we made the FS input scanning code into a separate pass that
 682        * could run before argument setup, then this wouldn't be necessary
 683        * anymore.
 684        */
 685       struct ac_shader_args *args = &ctx->args->ac;
 686       arg_count = 0;
 687       for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->arg_count; i++) {
 688          if (args->args[i].file != AC_ARG_VGPR) {
 689             arg_count++;
 690             continue;
 691          }
 692
 693          if (!(ctx->program->config->spi_ps_input_addr & (1 << vgpr_arg))) {
 694             args->args[i].skip = true;
 695          } else {
 696             args->args[i].offset = vgpr_reg;
 697             vgpr_reg += args->args[i].size;
 698             arg_count++;
 699          }
 700          vgpr_arg++;
 701       }
 702    }
 703
 704    aco_ptr<Pseudo_instruction> startpgm{create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count + 1)};
 705    for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) {
 706       if (ctx->args->ac.args[i].skip)
 707          continue;
 708
 709       enum ac_arg_regfile file = ctx->args->ac.args[i].file;
 710       unsigned size = ctx->args->ac.args[i].size;
 711       unsigned reg = ctx->args->ac.args[i].offset;
 712       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
 713       Temp dst = Temp{ctx->program->allocateId(), type};
 714       ctx->arg_temps[i] = dst;
 715       startpgm->definitions[arg] = Definition(dst);
 716       startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
 717       arg++;
 718    }
 719    startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, ctx->program->lane_mask};
 720    Pseudo_instruction *instr = startpgm.get();
 721    ctx->block->instructions.push_back(std::move(startpgm));
 722
 723    /* Stash these in the program so that they can be accessed later when
 724     * handling spilling.
 725     */
 726    ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
 727    ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
 728
 729    return instr;
 730 }
 731
 732 int
 733 type_size(const struct glsl_type *type, bool bindless)
 734 {
 735    // TODO: don't we need type->std430_base_alignment() here?
 736    return glsl_count_attribute_slots(type, false);
 737 }
 738
 739 void
 740 shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
 741 {
 742    assert(glsl_type_is_vector_or_scalar(type));
 743
 744    uint32_t comp_size = glsl_type_is_boolean(type)
 745       ? 4 : glsl_get_bit_size(type) / 8;
 746    unsigned length = glsl_get_vector_elements(type);
 747    *size = comp_size * length,
 748    *align = comp_size;
 749 }
 750
 751 static bool
 752 mem_vectorize_callback(unsigned align, unsigned bit_size,
 753                        unsigned num_components, unsigned high_offset,
 754                        nir_intrinsic_instr *low, nir_intrinsic_instr *high)
 755 {
 756    if ((bit_size != 32 && bit_size != 64) || num_components > 4)
 757       return false;
 758
 759    /* >128 bit loads are split except with SMEM */
 760    if (bit_size * num_components > 128)
 761       return false;
 762
 763    switch (low->intrinsic) {
 764    case nir_intrinsic_load_ubo:
 765    case nir_intrinsic_load_ssbo:
 766    case nir_intrinsic_store_ssbo:
 767    case nir_intrinsic_load_push_constant:
 768       return align % 4 == 0;
 769    case nir_intrinsic_load_deref:
 770    case nir_intrinsic_store_deref:
 771       assert(nir_src_as_deref(low->src[0])->mode == nir_var_mem_shared);
 772       /* fallthrough */
 773    case nir_intrinsic_load_shared:
 774    case nir_intrinsic_store_shared:
 775       if (bit_size * num_components > 64) /* 96 and 128 bit loads require 128 bit alignment and are split otherwise */
 776          return align % 16 == 0;
 777       else
 778          return align % 4 == 0;
 779    default:
 780       return false;
 781    }
 782    return false;
 783 }
 784
 785 void
 786 setup_vs_output_info(isel_context *ctx, nir_shader *nir,
 787                      bool export_prim_id, bool export_clip_dists,
 788                      radv_vs_output_info *outinfo)
 789 {
 790    memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
 791           sizeof(outinfo->vs_output_param_offset));
 792
 793    outinfo->param_exports = 0;
 794    int pos_written = 0x1;
 795    if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
 796       pos_written |= 1 << 1;
 797
 798    uint64_t mask = ctx->output_masks[nir->info.stage];
 799    while (mask) {
 800       int idx = u_bit_scan64(&mask);
 801       if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
 802           ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
 803          if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
 804             outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
 805       }
 806    }
 807    if (outinfo->writes_layer &&
 808        outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
 809       /* when ctx->options->key.has_multiview_view_index = true, the layer
 810        * variable isn't declared in NIR and it's isel's job to get the layer */
 811       outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
 812    }
 813
 814    if (export_prim_id) {
 815       assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
 816       outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
 817    }
 818
 819    ctx->export_clip_dists = export_clip_dists;
 820    ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
 821    ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
 822
 823    assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
 824
 825    if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
 826       pos_written |= 1 << 2;
 827    if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
 828       pos_written |= 1 << 3;
 829
 830    outinfo->pos_exports = util_bitcount(pos_written);
 831 }
 832
 833 void
 834 setup_vs_variables(isel_context *ctx, nir_shader *nir)
 835 {
 836    nir_foreach_variable(variable, &nir->inputs)
 837    {
 838       variable->data.driver_location = variable->data.location * 4;
 839    }
 840    nir_foreach_variable(variable, &nir->outputs)
 841    {
 842       if (ctx->stage == vertex_geometry_gs)
 843          variable->data.driver_location = util_bitcount64(ctx->output_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4;
 844       else if (ctx->stage == vertex_es ||
 845                ctx->stage == vertex_ls ||
 846                ctx->stage == vertex_tess_control_hs)
 847          // TODO: make this more compact
 848          variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4;
 849       else if (ctx->stage == vertex_vs)
 850          variable->data.driver_location = variable->data.location * 4;
 851       else
 852          unreachable("Unsupported VS stage");
 853    }
 854
 855    if (ctx->stage == vertex_vs) {
 856       radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
 857       setup_vs_output_info(ctx, nir, outinfo->export_prim_id,
 858                            ctx->options->key.vs_common_out.export_clip_dists, outinfo);
 859    } else if (ctx->stage == vertex_geometry_gs || ctx->stage == vertex_es) {
 860       /* TODO: radv_nir_shader_info_pass() already sets this but it's larger
 861        * than it needs to be in order to set it better, we have to improve
 862        * radv_nir_shader_info_pass() because gfx9_get_gs_info() uses
 863        * esgs_itemsize and has to be done before compilation
 864        */
 865       /* radv_es_output_info *outinfo = &ctx->program->info->vs.es_info;
 866       outinfo->esgs_itemsize = util_bitcount64(ctx->output_masks[nir->info.stage]) * 16u; */
 867    }
 868 }
 869
 870 void setup_gs_variables(isel_context *ctx, nir_shader *nir)
 871 {
 872    if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
 873       nir_foreach_variable(variable, &nir->inputs) {
 874          variable->data.driver_location = util_bitcount64(ctx->input_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4;
 875       }
 876    } else if (ctx->stage == geometry_gs) {
 877       //TODO: make this more compact
 878       nir_foreach_variable(variable, &nir->inputs) {
 879          variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot)variable->data.location) * 4;
 880       }
 881    } else {
 882       unreachable("Unsupported GS stage.");
 883    }
 884
 885    nir_foreach_variable(variable, &nir->outputs) {
 886       variable->data.driver_location = variable->data.location * 4;
 887    }
 888
 889    if (ctx->stage == vertex_geometry_gs)
 890       ctx->program->info->gs.es_type = MESA_SHADER_VERTEX;
 891    else if (ctx->stage == tess_eval_geometry_gs)
 892       ctx->program->info->gs.es_type = MESA_SHADER_TESS_EVAL;
 893 }
 894
 895 void
 896 setup_tcs_variables(isel_context *ctx, nir_shader *nir)
 897 {
 898    switch (ctx->stage) {
 899    case tess_control_hs:
 900       ctx->tcs_num_inputs = ctx->args->options->key.tcs.num_inputs;
 901       break;
 902    case vertex_tess_control_hs:
 903       ctx->tcs_num_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written);
 904       break;
 905    default:
 906       unreachable("Unsupported TCS shader stage");
 907    }
 908
 909    /* When the number of TCS input and output vertices are the same (typically 3):
 910     * - There is an equal amount of LS and HS invocations
 911     * - In case of merged LSHS shaders, the LS and HS halves of the shader
 912     *   always process the exact same vertex. We can use this knowledge to optimize them.
 913     */
 914    ctx->tcs_in_out_eq =
 915       ctx->stage == vertex_tess_control_hs &&
 916       ctx->args->options->key.tcs.input_vertices == nir->info.tess.tcs_vertices_out;
 917
 918    ctx->tcs_num_patches = get_tcs_num_patches(
 919                              ctx->args->options->key.tcs.input_vertices,
 920                              nir->info.tess.tcs_vertices_out,
 921                              ctx->tcs_num_inputs,
 922                              ctx->args->shader_info->tcs.outputs_written,
 923                              ctx->args->shader_info->tcs.patch_outputs_written,
 924                              ctx->args->options->tess_offchip_block_dw_size,
 925                              ctx->args->options->chip_class,
 926                              ctx->args->options->family);
 927    unsigned lds_size = calculate_tess_lds_size(
 928                              ctx->args->options->key.tcs.input_vertices,
 929                              nir->info.tess.tcs_vertices_out,
 930                              ctx->tcs_num_inputs,
 931                              ctx->tcs_num_patches,
 932                              ctx->args->shader_info->tcs.outputs_written,
 933                              ctx->args->shader_info->tcs.patch_outputs_written);
 934
 935    ctx->args->shader_info->tcs.num_patches = ctx->tcs_num_patches;
 936    ctx->args->shader_info->tcs.lds_size = lds_size;
 937    ctx->program->config->lds_size = (lds_size + ctx->program->lds_alloc_granule - 1) /
 938                                     ctx->program->lds_alloc_granule;
 939
 940    nir_foreach_variable(variable, &nir->inputs) {
 941       variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4;
 942    }
 943
 944    nir_foreach_variable(variable, &nir->outputs) {
 945       variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4;
 946    }
 947
 948    ctx->tcs_tess_lvl_out_loc = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER) * 16u;
 949    ctx->tcs_tess_lvl_in_loc = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER) * 16u;
 950 }
 951
 952 void
 953 setup_tes_variables(isel_context *ctx, nir_shader *nir)
 954 {
 955    ctx->tcs_num_patches = ctx->args->options->key.tes.num_patches;
 956
 957    nir_foreach_variable(variable, &nir->inputs) {
 958       variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4;
 959    }
 960
 961    nir_foreach_variable(variable, &nir->outputs) {
 962       if (ctx->stage == tess_eval_vs)
 963          variable->data.driver_location = variable->data.location * 4;
 964       else if (ctx->stage == tess_eval_es)
 965          variable->data.driver_location = shader_io_get_unique_index((gl_varying_slot) variable->data.location) * 4;
 966       else if (ctx->stage == tess_eval_geometry_gs)
 967          variable->data.driver_location = util_bitcount64(ctx->output_masks[nir->info.stage] & ((1ull << variable->data.location) - 1ull)) * 4;
 968       else
 969          unreachable("Unsupported TES shader stage");
 970    }
 971
 972    if (ctx->stage == tess_eval_vs) {
 973       radv_vs_output_info *outinfo = &ctx->program->info->tes.outinfo;
 974       setup_vs_output_info(ctx, nir, outinfo->export_prim_id,
 975                            ctx->options->key.vs_common_out.export_clip_dists, outinfo);
 976    }
 977 }
 978
 979 void
 980 setup_variables(isel_context *ctx, nir_shader *nir)
 981 {
 982    switch (nir->info.stage) {
 983    case MESA_SHADER_FRAGMENT: {
 984       nir_foreach_variable(variable, &nir->outputs)
 985       {
 986          int idx = variable->data.location + variable->data.index;
 987          variable->data.driver_location = idx * 4;
 988       }
 989       break;
 990    }
 991    case MESA_SHADER_COMPUTE: {
 992       ctx->program->config->lds_size = (nir->info.cs.shared_size + ctx->program->lds_alloc_granule - 1) /
 993                                        ctx->program->lds_alloc_granule;
 994       break;
 995    }
 996    case MESA_SHADER_VERTEX: {
 997       setup_vs_variables(ctx, nir);
 998       break;
 999    }
1000    case MESA_SHADER_GEOMETRY: {
1001       setup_gs_variables(ctx, nir);
1002       break;
1003    }
1004    case MESA_SHADER_TESS_CTRL: {
1005       setup_tcs_variables(ctx, nir);
1006       break;
1007    }
1008    case MESA_SHADER_TESS_EVAL: {
1009       setup_tes_variables(ctx, nir);
1010       break;
1011    }
1012    default:
1013       unreachable("Unhandled shader stage.");
1014    }
1015 }
1016
1017 void
1018 get_io_masks(isel_context *ctx, unsigned shader_count, struct nir_shader *const *shaders)
1019 {
1020    for (unsigned i = 0; i < shader_count; i++) {
1021       nir_shader *nir = shaders[i];
1022       if (nir->info.stage == MESA_SHADER_COMPUTE)
1023          continue;
1024
1025       uint64_t output_mask = 0;
1026       nir_foreach_variable(variable, &nir->outputs) {
1027          const glsl_type *type = variable->type;
1028          if (nir_is_per_vertex_io(variable, nir->info.stage))
1029             type = type->fields.array;
1030          unsigned slots = type->count_attribute_slots(false);
1031          if (variable->data.compact) {
1032             unsigned component_count = variable->data.location_frac + type->length;
1033             slots = (component_count + 3) / 4;
1034          }
1035          output_mask |= ((1ull << slots) - 1) << variable->data.location;
1036       }
1037
1038       uint64_t input_mask = 0;
1039       nir_foreach_variable(variable, &nir->inputs) {
1040          const glsl_type *type = variable->type;
1041          if (nir_is_per_vertex_io(variable, nir->info.stage))
1042             type = type->fields.array;
1043          unsigned slots = type->count_attribute_slots(false);
1044          if (variable->data.compact) {
1045             unsigned component_count = variable->data.location_frac + type->length;
1046             slots = (component_count + 3) / 4;
1047          }
1048          input_mask |= ((1ull << slots) - 1) << variable->data.location;
1049       }
1050
1051       ctx->output_masks[nir->info.stage] |= output_mask;
1052       if (i + 1 < shader_count)
1053          ctx->input_masks[shaders[i + 1]->info.stage] |= output_mask;
1054
1055       ctx->input_masks[nir->info.stage] |= input_mask;
1056       if (i)
1057          ctx->output_masks[shaders[i - 1]->info.stage] |= input_mask;
1058    }
1059 }
1060
1061 void
1062 setup_nir(isel_context *ctx, nir_shader *nir)
1063 {
1064    Program *program = ctx->program;
1065
1066    /* align and copy constant data */
1067    while (program->constant_data.size() % 4u)
1068       program->constant_data.push_back(0);
1069    ctx->constant_data_offset = program->constant_data.size();
1070    program->constant_data.insert(program->constant_data.end(),
1071                                  (uint8_t*)nir->constant_data,
1072                                  (uint8_t*)nir->constant_data + nir->constant_data_size);
1073
1074    /* the variable setup has to be done before lower_io / CSE */
1075    setup_variables(ctx, nir);
1076
1077    /* optimize and lower memory operations */
1078    bool lower_to_scalar = false;
1079    bool lower_pack = false;
1080    if (nir_opt_load_store_vectorize(nir,
1081                                     (nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo |
1082                                                         nir_var_mem_push_const | nir_var_mem_shared),
1083                                     mem_vectorize_callback)) {
1084       lower_to_scalar = true;
1085       lower_pack = true;
1086    }
1087    if (nir->info.stage != MESA_SHADER_COMPUTE)
1088       nir_lower_io(nir, (nir_variable_mode)(nir_var_shader_in | nir_var_shader_out), type_size, (nir_lower_io_options)0);
1089    nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);
1090
1091    if (lower_to_scalar)
1092       nir_lower_alu_to_scalar(nir, NULL, NULL);
1093    if (lower_pack)
1094       nir_lower_pack(nir);
1095
1096    /* lower ALU operations */
1097    // TODO: implement logic64 in aco, it's more effective for sgprs
1098    nir_lower_int64(nir, nir->options->lower_int64_options);
1099
1100    nir_opt_idiv_const(nir, 32);
1101    nir_lower_idiv(nir, nir_lower_idiv_precise);
1102
1103    /* optimize the lowered ALU operations */
1104    bool more_algebraic = true;
1105    while (more_algebraic) {
1106       more_algebraic = false;
1107       NIR_PASS_V(nir, nir_copy_prop);
1108       NIR_PASS_V(nir, nir_opt_dce);
1109       NIR_PASS_V(nir, nir_opt_constant_folding);
1110       NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
1111    }
1112
1113    /* Do late algebraic optimization to turn add(a, neg(b)) back into
1114     * subs, then the mandatory cleanup after algebraic.  Note that it may
1115     * produce fnegs, and if so then we need to keep running to squash
1116     * fneg(fneg(a)).
1117     */
1118    bool more_late_algebraic = true;
1119    while (more_late_algebraic) {
1120       more_late_algebraic = false;
1121       NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
1122       NIR_PASS_V(nir, nir_opt_constant_folding);
1123       NIR_PASS_V(nir, nir_copy_prop);
1124       NIR_PASS_V(nir, nir_opt_dce);
1125       NIR_PASS_V(nir, nir_opt_cse);
1126    }
1127
1128    /* cleanup passes */
1129    nir_lower_load_const_to_scalar(nir);
1130    nir_opt_shrink_load(nir);
1131    nir_move_options move_opts = (nir_move_options)(
1132       nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
1133       nir_move_comparisons | nir_move_copies);
1134    nir_opt_sink(nir, move_opts);
1135    nir_opt_move(nir, move_opts);
1136    nir_convert_to_lcssa(nir, true, false);
1137    nir_lower_phis_to_scalar(nir);
1138
1139    nir_function_impl *func = nir_shader_get_entrypoint(nir);
1140    nir_index_ssa_defs(func);
1141 }
1142
1143 isel_context
1144 setup_isel_context(Program* program,
1145                    unsigned shader_count,
1146                    struct nir_shader *const *shaders,
1147                    ac_shader_config* config,
1148                    struct radv_shader_args *args,
1149                    bool is_gs_copy_shader)
1150 {
1151    program->stage = 0;
1152    for (unsigned i = 0; i < shader_count; i++) {
1153       switch (shaders[i]->info.stage) {
1154       case MESA_SHADER_VERTEX:
1155          program->stage |= sw_vs;
1156          break;
1157       case MESA_SHADER_TESS_CTRL:
1158          program->stage |= sw_tcs;
1159          break;
1160       case MESA_SHADER_TESS_EVAL:
1161          program->stage |= sw_tes;
1162          break;
1163       case MESA_SHADER_GEOMETRY:
1164          program->stage |= is_gs_copy_shader ? sw_gs_copy : sw_gs;
1165          break;
1166       case MESA_SHADER_FRAGMENT:
1167          program->stage |= sw_fs;
1168          break;
1169       case MESA_SHADER_COMPUTE:
1170          program->stage |= sw_cs;
1171          break;
1172       default:
1173          unreachable("Shader stage not implemented");
1174       }
1175    }
1176    bool gfx9_plus = args->options->chip_class >= GFX9;
1177    bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10;
1178    if (program->stage == sw_vs && args->shader_info->vs.as_es)
1179       program->stage |= hw_es;
1180    else if (program->stage == sw_vs && !args->shader_info->vs.as_ls)
1181       program->stage |= hw_vs;
1182    else if (program->stage == sw_gs)
1183       program->stage |= hw_gs;
1184    else if (program->stage == sw_fs)
1185       program->stage |= hw_fs;
1186    else if (program->stage == sw_cs)
1187       program->stage |= hw_cs;
1188    else if (program->stage == sw_gs_copy)
1189       program->stage |= hw_vs;
1190    else if (program->stage == (sw_vs | sw_gs) && gfx9_plus && !ngg)
1191       program->stage |= hw_gs;
1192    else if (program->stage == sw_vs && args->shader_info->vs.as_ls)
1193       program->stage |= hw_ls; /* GFX6-8: VS is a Local Shader, when tessellation is used */
1194    else if (program->stage == sw_tcs)
1195       program->stage |= hw_hs; /* GFX6-8: TCS is a Hull Shader */
1196    else if (program->stage == (sw_vs | sw_tcs))
1197       program->stage |= hw_hs; /* GFX9-10: VS+TCS merged into a Hull Shader */
1198    else if (program->stage == sw_tes && !args->shader_info->tes.as_es && !ngg)
1199       program->stage |= hw_vs; /* GFX6-9: TES without GS uses the HW VS stage (and GFX10/legacy) */
1200    else if (program->stage == sw_tes && args->shader_info->tes.as_es && !ngg)
1201       program->stage |= hw_es; /* GFX6-8: TES is an Export Shader */
1202    else if (program->stage == (sw_tes | sw_gs) && gfx9_plus && !ngg)
1203       program->stage |= hw_gs; /* GFX9: TES+GS merged into a GS (and GFX10/legacy) */
1204    else
1205       unreachable("Shader stage not implemented");
1206
1207    program->config = config;
1208    program->info = args->shader_info;
1209    program->chip_class = args->options->chip_class;
1210    program->family = args->options->family;
1211    program->wave_size = args->shader_info->wave_size;
1212    program->lane_mask = program->wave_size == 32 ? s1 : s2;
1213
1214    program->lds_alloc_granule = args->options->chip_class >= GFX7 ? 512 : 256;
1215    program->lds_limit = args->options->chip_class >= GFX7 ? 65536 : 32768;
1216    /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
1217    program->has_16bank_lds = args->options->family == CHIP_KABINI || args->options->family == CHIP_STONEY;
1218
1219    program->vgpr_limit = 256;
1220    program->vgpr_alloc_granule = 3;
1221
1222    if (args->options->chip_class >= GFX10) {
1223       program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */
1224       program->sgpr_alloc_granule = 127;
1225       program->sgpr_limit = 106;
1226       program->vgpr_alloc_granule = program->wave_size == 32 ? 7 : 3;
1227    } else if (program->chip_class >= GFX8) {
1228       program->physical_sgprs = 800;
1229       program->sgpr_alloc_granule = 15;
1230       if (args->options->family == CHIP_TONGA || args->options->family == CHIP_ICELAND)
1231          program->sgpr_limit = 94; /* workaround hardware bug */
1232       else
1233          program->sgpr_limit = 102;
1234    } else {
1235       program->physical_sgprs = 512;
1236       program->sgpr_alloc_granule = 7;
1237       program->sgpr_limit = 104;
1238    }
1239
1240    calc_min_waves(program);
1241    program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
1242    program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
1243
1244    isel_context ctx = {};
1245    ctx.program = program;
1246    ctx.args = args;
1247    ctx.options = args->options;
1248    ctx.stage = program->stage;
1249
1250    get_io_masks(&ctx, shader_count, shaders);
1251
1252    unsigned scratch_size = 0;
1253    if (program->stage == gs_copy_vs) {
1254       assert(shader_count == 1);
1255       setup_vs_output_info(&ctx, shaders[0], false, true, &args->shader_info->vs.outinfo);
1256    } else {
1257       for (unsigned i = 0; i < shader_count; i++) {
1258          nir_shader *nir = shaders[i];
1259          setup_nir(&ctx, nir);
1260       }
1261
1262       for (unsigned i = 0; i < shader_count; i++)
1263          scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
1264    }
1265
1266    ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.program->wave_size, 1024);
1267
1268    ctx.block = ctx.program->create_and_insert_block();
1269    ctx.block->loop_nest_depth = 0;
1270    ctx.block->kind = block_kind_top_level;
1271
1272    return ctx;
1273 }
1274
1275 }