src/amd/compiler/aco_assembler.cpp

   1 #include <vector>
   2 #include <algorithm>
   3
   4 #include "aco_ir.h"
   5 #include "aco_builder.h"
   6 #include "common/sid.h"
   7 #include "ac_shader_util.h"
   8 #include "util/u_math.h"
   9
  10 namespace aco {
  11
  12 struct asm_context {
  13    Program *program;
  14    enum chip_class chip_class;
  15    std::vector<std::pair<int, SOPP_instruction*>> branches;
  16    std::vector<unsigned> constaddrs;
  17    const int16_t* opcode;
  18    // TODO: keep track of branch instructions referring blocks
  19    // and, when emitting the block, correct the offset in instr
  20    asm_context(Program* program) : program(program), chip_class(program->chip_class) {
  21       if (chip_class <= GFX7)
  22          opcode = &instr_info.opcode_gfx7[0];
  23       else if (chip_class <= GFX9)
  24          opcode = &instr_info.opcode_gfx9[0];
  25       else if (chip_class >= GFX10)
  26          opcode = &instr_info.opcode_gfx10[0];
  27    }
  28
  29    int subvector_begin_pos = -1;
  30 };
  31
  32 static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg)
  33 {
  34    if (sel & sdwa_isra) {
  35       unsigned size = sdwa_rasize & sel;
  36       if (size == 1)
  37          return reg.byte();
  38       else /* size == 2 */
  39          return sdwa_isword | (reg.byte() >> 1);
  40    }
  41    return sel & sdwa_asuint;
  42 }
  43
  44 void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
  45 {
  46    /* lower remaining pseudo-instructions */
  47    if (instr->opcode == aco_opcode::p_constaddr) {
  48       unsigned dest = instr->definitions[0].physReg();
  49       unsigned offset = instr->operands[0].constantValue();
  50
  51       /* s_getpc_b64 dest[0:1] */
  52       uint32_t encoding = (0b101111101 << 23);
  53       uint32_t opcode = ctx.opcode[(int)aco_opcode::s_getpc_b64];
  54       if (opcode >= 55 && ctx.chip_class <= GFX9) {
  55          assert(ctx.chip_class == GFX9 && opcode < 60);
  56          opcode = opcode - 4;
  57       }
  58       encoding |= dest << 16;
  59       encoding |= opcode << 8;
  60       out.push_back(encoding);
  61
  62       /* s_add_u32 dest[0], dest[0], ... */
  63       encoding = (0b10 << 30);
  64       encoding |= ctx.opcode[(int)aco_opcode::s_add_u32] << 23;
  65       encoding |= dest << 16;
  66       encoding |= dest;
  67       encoding |= 255 << 8;
  68       out.push_back(encoding);
  69       ctx.constaddrs.push_back(out.size());
  70       out.push_back(offset);
  71
  72       /* s_addc_u32 dest[1], dest[1], 0 */
  73       encoding = (0b10 << 30);
  74       encoding |= ctx.opcode[(int)aco_opcode::s_addc_u32] << 23;
  75       encoding |= (dest + 1) << 16;
  76       encoding |= dest + 1;
  77       encoding |= 128 << 8;
  78       out.push_back(encoding);
  79       return;
  80    }
  81
  82    uint32_t opcode = ctx.opcode[(int)instr->opcode];
  83    if (opcode == (uint32_t)-1) {
  84       char *out;
  85       size_t outsize;
  86       FILE *memf = open_memstream(&out, &outsize);
  87
  88       fprintf(memf, "Unsupported opcode: ");
  89       aco_print_instr(instr, memf);
  90       fclose(memf);
  91
  92       aco_err(ctx.program, out);
  93       free(out);
  94
  95       abort();
  96    }
  97
  98    switch (instr->format) {
  99    case Format::SOP2: {
 100       uint32_t encoding = (0b10 << 30);
 101       encoding |= opcode << 23;
 102       encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
 103       encoding |= instr->operands.size() >= 2 ? instr->operands[1].physReg() << 8 : 0;
 104       encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
 105       out.push_back(encoding);
 106       break;
 107    }
 108    case Format::SOPK: {
 109       SOPK_instruction *sopk = static_cast<SOPK_instruction*>(instr);
 110
 111       if (instr->opcode == aco_opcode::s_subvector_loop_begin) {
 112          assert(ctx.chip_class >= GFX10);
 113          assert(ctx.subvector_begin_pos == -1);
 114          ctx.subvector_begin_pos = out.size();
 115       } else if (instr->opcode == aco_opcode::s_subvector_loop_end) {
 116          assert(ctx.chip_class >= GFX10);
 117          assert(ctx.subvector_begin_pos != -1);
 118          /* Adjust s_subvector_loop_begin instruction to the address after the end  */
 119          out[ctx.subvector_begin_pos] |= (out.size() - ctx.subvector_begin_pos);
 120          /* Adjust s_subvector_loop_end instruction to the address after the beginning  */
 121          sopk->imm = (uint16_t)(ctx.subvector_begin_pos - (int)out.size());
 122          ctx.subvector_begin_pos = -1;
 123       }
 124
 125       uint32_t encoding = (0b1011 << 28);
 126       encoding |= opcode << 23;
 127       encoding |=
 128          !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ?
 129          instr->definitions[0].physReg() << 16 :
 130          !instr->operands.empty() && instr->operands[0].physReg() <= 127 ?
 131          instr->operands[0].physReg() << 16 : 0;
 132       encoding |= sopk->imm;
 133       out.push_back(encoding);
 134       break;
 135    }
 136    case Format::SOP1: {
 137       uint32_t encoding = (0b101111101 << 23);
 138       if (opcode >= 55 && ctx.chip_class <= GFX9) {
 139          assert(ctx.chip_class == GFX9 && opcode < 60);
 140          opcode = opcode - 4;
 141       }
 142       encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
 143       encoding |= opcode << 8;
 144       encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
 145       out.push_back(encoding);
 146       break;
 147    }
 148    case Format::SOPC: {
 149       uint32_t encoding = (0b101111110 << 23);
 150       encoding |= opcode << 16;
 151       encoding |= instr->operands.size() == 2 ? instr->operands[1].physReg() << 8 : 0;
 152       encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
 153       out.push_back(encoding);
 154       break;
 155    }
 156    case Format::SOPP: {
 157       SOPP_instruction* sopp = static_cast<SOPP_instruction*>(instr);
 158       uint32_t encoding = (0b101111111 << 23);
 159       encoding |= opcode << 16;
 160       encoding |= (uint16_t) sopp->imm;
 161       if (sopp->block != -1) {
 162          sopp->pass_flags = 0;
 163          ctx.branches.emplace_back(out.size(), sopp);
 164       }
 165       out.push_back(encoding);
 166       break;
 167    }
 168    case Format::SMEM: {
 169       SMEM_instruction* smem = static_cast<SMEM_instruction*>(instr);
 170       bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
 171       bool is_load = !instr->definitions.empty();
 172       uint32_t encoding = 0;
 173
 174       if (ctx.chip_class <= GFX7) {
 175          encoding = (0b11000 << 27);
 176          encoding |= opcode << 22;
 177          encoding |= instr->definitions.size() ? instr->definitions[0].physReg() << 15 : 0;
 178          encoding |= instr->operands.size() ? (instr->operands[0].physReg() >> 1) << 9 : 0;
 179          if (instr->operands.size() >= 2) {
 180             if (!instr->operands[1].isConstant() || instr->operands[1].constantValue() >= 1024) {
 181                encoding |= instr->operands[1].physReg().reg();
 182             } else {
 183                encoding |= instr->operands[1].constantValue() >> 2;
 184                encoding |= 1 << 8;
 185             }
 186          }
 187          out.push_back(encoding);
 188          /* SMRD instructions can take a literal on GFX6 & GFX7 */
 189          if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && instr->operands[1].constantValue() >= 1024)
 190             out.push_back(instr->operands[1].constantValue() >> 2);
 191          return;
 192       }
 193
 194       if (ctx.chip_class <= GFX9) {
 195          encoding = (0b110000 << 26);
 196          assert(!smem->dlc); /* Device-level coherent is not supported on GFX9 and lower */
 197          encoding |= smem->nv ? 1 << 15 : 0;
 198       } else {
 199          encoding = (0b111101 << 26);
 200          assert(!smem->nv); /* Non-volatile is not supported on GFX10 */
 201          encoding |= smem->dlc ? 1 << 14 : 0;
 202       }
 203
 204       encoding |= opcode << 18;
 205       encoding |= smem->glc ? 1 << 16 : 0;
 206
 207       if (ctx.chip_class <= GFX9) {
 208          if (instr->operands.size() >= 2)
 209             encoding |= instr->operands[1].isConstant() ? 1 << 17 : 0; /* IMM - immediate enable */
 210       }
 211       if (ctx.chip_class == GFX9) {
 212          encoding |= soe ? 1 << 14 : 0;
 213       }
 214
 215       if (is_load || instr->operands.size() >= 3) { /* SDATA */
 216          encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg()) << 6;
 217       }
 218       if (instr->operands.size() >= 1) { /* SBASE */
 219          encoding |= instr->operands[0].physReg() >> 1;
 220       }
 221
 222       out.push_back(encoding);
 223       encoding = 0;
 224
 225       int32_t offset = 0;
 226       uint32_t soffset = ctx.chip_class >= GFX10
 227                          ? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */
 228                          : 0;        /* On GFX9, it is disabled by the SOE bit (and it's not present on GFX8 and below) */
 229       if (instr->operands.size() >= 2) {
 230          const Operand &op_off1 = instr->operands[1];
 231          if (ctx.chip_class <= GFX9) {
 232             offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg();
 233          } else {
 234             /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an SGPR */
 235             if (op_off1.isConstant()) {
 236                offset = op_off1.constantValue();
 237             } else {
 238                soffset = op_off1.physReg();
 239                assert(!soe); /* There is no place to put the other SGPR offset, if any */
 240             }
 241          }
 242
 243          if (soe) {
 244             const Operand &op_off2 = instr->operands.back();
 245             assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant and an SGPR at the same time */
 246             assert(!op_off2.isConstant());
 247             soffset = op_off2.physReg();
 248          }
 249       }
 250       encoding |= offset;
 251       encoding |= soffset << 25;
 252
 253       out.push_back(encoding);
 254       return;
 255    }
 256    case Format::VOP2: {
 257       uint32_t encoding = 0;
 258       encoding |= opcode << 25;
 259       encoding |= (0xFF & instr->definitions[0].physReg()) << 17;
 260       encoding |= (0xFF & instr->operands[1].physReg()) << 9;
 261       encoding |= instr->operands[0].physReg();
 262       out.push_back(encoding);
 263       break;
 264    }
 265    case Format::VOP1: {
 266       uint32_t encoding = (0b0111111 << 25);
 267       if (!instr->definitions.empty())
 268          encoding |= (0xFF & instr->definitions[0].physReg()) << 17;
 269       encoding |= opcode << 9;
 270       if (!instr->operands.empty())
 271          encoding |= instr->operands[0].physReg();
 272       out.push_back(encoding);
 273       break;
 274    }
 275    case Format::VOPC: {
 276       uint32_t encoding = (0b0111110 << 25);
 277       encoding |= opcode << 17;
 278       encoding |= (0xFF & instr->operands[1].physReg()) << 9;
 279       encoding |= instr->operands[0].physReg();
 280       out.push_back(encoding);
 281       break;
 282    }
 283    case Format::VINTRP: {
 284       Interp_instruction* interp = static_cast<Interp_instruction*>(instr);
 285       uint32_t encoding = 0;
 286
 287       if (instr->opcode == aco_opcode::v_interp_p1ll_f16 ||
 288           instr->opcode == aco_opcode::v_interp_p1lv_f16 ||
 289           instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
 290           instr->opcode == aco_opcode::v_interp_p2_f16) {
 291          if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
 292             encoding = (0b110100 << 26);
 293          } else if (ctx.chip_class >= GFX10) {
 294             encoding = (0b110101 << 26);
 295          } else {
 296             unreachable("Unknown chip_class.");
 297          }
 298
 299          encoding |= opcode << 16;
 300          encoding |= (0xFF & instr->definitions[0].physReg());
 301          out.push_back(encoding);
 302
 303          encoding = 0;
 304          encoding |= interp->attribute;
 305          encoding |= interp->component << 6;
 306          encoding |= instr->operands[0].physReg() << 9;
 307          if (instr->opcode == aco_opcode::v_interp_p2_f16 ||
 308              instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
 309              instr->opcode == aco_opcode::v_interp_p1lv_f16) {
 310             encoding |= instr->operands[2].physReg() << 18;
 311          }
 312          out.push_back(encoding);
 313       } else {
 314          if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
 315             encoding = (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */
 316          } else {
 317             encoding = (0b110010 << 26);
 318          }
 319
 320          assert(encoding);
 321          encoding |= (0xFF & instr->definitions[0].physReg()) << 18;
 322          encoding |= opcode << 16;
 323          encoding |= interp->attribute << 10;
 324          encoding |= interp->component << 8;
 325          if (instr->opcode == aco_opcode::v_interp_mov_f32)
 326             encoding |= (0x3 & instr->operands[0].constantValue());
 327          else
 328             encoding |= (0xFF & instr->operands[0].physReg());
 329          out.push_back(encoding);
 330       }
 331       break;
 332    }
 333    case Format::DS: {
 334       DS_instruction* ds = static_cast<DS_instruction*>(instr);
 335       uint32_t encoding = (0b110110 << 26);
 336       if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
 337          encoding |= opcode << 17;
 338          encoding |= (ds->gds ? 1 : 0) << 16;
 339       } else {
 340          encoding |= opcode << 18;
 341          encoding |= (ds->gds ? 1 : 0) << 17;
 342       }
 343       encoding |= ((0xFF & ds->offset1) << 8);
 344       encoding |= (0xFFFF & ds->offset0);
 345       out.push_back(encoding);
 346       encoding = 0;
 347       unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0;
 348       encoding |= (0xFF & reg) << 24;
 349       reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0)  ? instr->operands[2].physReg() : 0;
 350       encoding |= (0xFF & reg) << 16;
 351       reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) ? instr->operands[1].physReg() : 0;
 352       encoding |= (0xFF & reg) << 8;
 353       encoding |= (0xFF & instr->operands[0].physReg());
 354       out.push_back(encoding);
 355       break;
 356    }
 357    case Format::MUBUF: {
 358       MUBUF_instruction* mubuf = static_cast<MUBUF_instruction*>(instr);
 359       uint32_t encoding = (0b111000 << 26);
 360       encoding |= opcode << 18;
 361       encoding |= (mubuf->lds ? 1 : 0) << 16;
 362       encoding |= (mubuf->glc ? 1 : 0) << 14;
 363       encoding |= (mubuf->idxen ? 1 : 0) << 13;
 364       assert(!mubuf->addr64 || ctx.chip_class <= GFX7);
 365       if (ctx.chip_class == GFX6 || ctx.chip_class == GFX7)
 366          encoding |= (mubuf->addr64 ? 1 : 0) << 15;
 367       encoding |= (mubuf->offen ? 1 : 0) << 12;
 368       if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
 369          assert(!mubuf->dlc); /* Device-level coherent is not supported on GFX9 and lower */
 370          encoding |= (mubuf->slc ? 1 : 0) << 17;
 371       } else if (ctx.chip_class >= GFX10) {
 372          encoding |= (mubuf->dlc ? 1 : 0) << 15;
 373       }
 374       encoding |= 0x0FFF & mubuf->offset;
 375       out.push_back(encoding);
 376       encoding = 0;
 377       if (ctx.chip_class <= GFX7 || ctx.chip_class >= GFX10) {
 378          encoding |= (mubuf->slc ? 1 : 0) << 22;
 379       }
 380       encoding |= instr->operands[2].physReg() << 24;
 381       encoding |= (mubuf->tfe ? 1 : 0) << 23;
 382       encoding |= (instr->operands[0].physReg() >> 2) << 16;
 383       unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg();
 384       encoding |= (0xFF & reg) << 8;
 385       encoding |= (0xFF & instr->operands[1].physReg());
 386       out.push_back(encoding);
 387       break;
 388    }
 389    case Format::MTBUF: {
 390       MTBUF_instruction* mtbuf = static_cast<MTBUF_instruction*>(instr);
 391
 392       uint32_t img_format = ac_get_tbuffer_format(ctx.chip_class, mtbuf->dfmt, mtbuf->nfmt);
 393       uint32_t encoding = (0b111010 << 26);
 394       assert(img_format <= 0x7F);
 395       assert(!mtbuf->dlc || ctx.chip_class >= GFX10);
 396       encoding |= (mtbuf->dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */
 397       encoding |= (mtbuf->glc ? 1 : 0) << 14;
 398       encoding |= (mtbuf->idxen ? 1 : 0) << 13;
 399       encoding |= (mtbuf->offen ? 1 : 0) << 12;
 400       encoding |= 0x0FFF & mtbuf->offset;
 401       encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */
 402
 403       if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
 404          encoding |= opcode << 15;
 405       } else {
 406          encoding |= (opcode & 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */
 407       }
 408
 409       out.push_back(encoding);
 410       encoding = 0;
 411
 412       encoding |= instr->operands[2].physReg() << 24;
 413       encoding |= (mtbuf->tfe ? 1 : 0) << 23;
 414       encoding |= (mtbuf->slc ? 1 : 0) << 22;
 415       encoding |= (instr->operands[0].physReg() >> 2) << 16;
 416       unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg();
 417       encoding |= (0xFF & reg) << 8;
 418       encoding |= (0xFF & instr->operands[1].physReg());
 419
 420       if (ctx.chip_class >= GFX10) {
 421          encoding |= (((opcode & 0x08) >> 3) << 21); /* MSB of 4-bit OPCODE */
 422       }
 423
 424       out.push_back(encoding);
 425       break;
 426    }
 427    case Format::MIMG: {
 428       MIMG_instruction* mimg = static_cast<MIMG_instruction*>(instr);
 429       uint32_t encoding = (0b111100 << 26);
 430       encoding |= mimg->slc ? 1 << 25 : 0;
 431       encoding |= opcode << 18;
 432       encoding |= mimg->lwe ? 1 << 17 : 0;
 433       encoding |= mimg->tfe ? 1 << 16 : 0;
 434       encoding |= mimg->glc ? 1 << 13 : 0;
 435       encoding |= mimg->unrm ? 1 << 12 : 0;
 436       if (ctx.chip_class <= GFX9) {
 437          assert(!mimg->dlc); /* Device-level coherent is not supported on GFX9 and lower */
 438          assert(!mimg->r128);
 439          encoding |= mimg->a16 ? 1 << 15 : 0;
 440          encoding |= mimg->da ? 1 << 14 : 0;
 441       } else {
 442          encoding |= mimg->r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
 443          encoding |= mimg->dim << 3; /* GFX10: dimensionality instead of declare array */
 444          encoding |= mimg->dlc ? 1 << 7 : 0;
 445       }
 446       encoding |= (0xF & mimg->dmask) << 8;
 447       out.push_back(encoding);
 448       encoding = (0xFF & instr->operands[2].physReg()); /* VADDR */
 449       if (!instr->definitions.empty()) {
 450          encoding |= (0xFF & instr->definitions[0].physReg()) << 8; /* VDATA */
 451       } else if (instr->operands[1].regClass().type() == RegType::vgpr) {
 452          encoding |= (0xFF & instr->operands[1].physReg()) << 8; /* VDATA */
 453       }
 454       encoding |= (0x1F & (instr->operands[0].physReg() >> 2)) << 16; /* T# (resource) */
 455       if (instr->operands[1].regClass().type() == RegType::sgpr)
 456          encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 21; /* sampler */
 457
 458       assert(!mimg->d16 || ctx.chip_class >= GFX9);
 459       encoding |= mimg->d16 ? 1 << 15 : 0;
 460       if (ctx.chip_class >= GFX10) {
 461          encoding |= mimg->a16 ? 1 << 14 : 0; /* GFX10: A16 still exists, but is in a different place */
 462       }
 463
 464       out.push_back(encoding);
 465       break;
 466    }
 467    case Format::FLAT:
 468    case Format::SCRATCH:
 469    case Format::GLOBAL: {
 470       FLAT_instruction *flat = static_cast<FLAT_instruction*>(instr);
 471       uint32_t encoding = (0b110111 << 26);
 472       encoding |= opcode << 18;
 473       if (ctx.chip_class <= GFX9) {
 474          assert(flat->offset <= 0x1fff);
 475          encoding |= flat->offset & 0x1fff;
 476       } else if (instr->format == Format::FLAT) {
 477          /* GFX10 has a 12-bit immediate OFFSET field,
 478           * but it has a hw bug: it ignores the offset, called FlatSegmentOffsetBug
 479           */
 480          assert(flat->offset == 0);
 481       } else {
 482          assert(flat->offset <= 0xfff);
 483          encoding |= flat->offset & 0xfff;
 484       }
 485       if (instr->format == Format::SCRATCH)
 486          encoding |= 1 << 14;
 487       else if (instr->format == Format::GLOBAL)
 488          encoding |= 2 << 14;
 489       encoding |= flat->lds ? 1 << 13 : 0;
 490       encoding |= flat->glc ? 1 << 16 : 0;
 491       encoding |= flat->slc ? 1 << 17 : 0;
 492       if (ctx.chip_class >= GFX10) {
 493          assert(!flat->nv);
 494          encoding |= flat->dlc ? 1 << 12 : 0;
 495       } else {
 496          assert(!flat->dlc);
 497       }
 498       out.push_back(encoding);
 499       encoding = (0xFF & instr->operands[0].physReg());
 500       if (!instr->definitions.empty())
 501          encoding |= (0xFF & instr->definitions[0].physReg()) << 24;
 502       if (instr->operands.size() >= 3)
 503          encoding |= (0xFF & instr->operands[2].physReg()) << 8;
 504       if (!instr->operands[1].isUndefined()) {
 505          assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F);
 506          assert(instr->format != Format::FLAT);
 507          encoding |= instr->operands[1].physReg() << 16;
 508       } else if (instr->format != Format::FLAT || ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */
 509          if (ctx.chip_class <= GFX9)
 510             encoding |= 0x7F << 16;
 511          else
 512             encoding |= sgpr_null << 16;
 513       }
 514       encoding |= flat->nv ? 1 << 23 : 0;
 515       out.push_back(encoding);
 516       break;
 517    }
 518    case Format::EXP: {
 519       Export_instruction* exp = static_cast<Export_instruction*>(instr);
 520       uint32_t encoding;
 521       if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
 522          encoding = (0b110001 << 26);
 523       } else {
 524          encoding = (0b111110 << 26);
 525       }
 526
 527       encoding |= exp->valid_mask ? 0b1 << 12 : 0;
 528       encoding |= exp->done ? 0b1 << 11 : 0;
 529       encoding |= exp->compressed ? 0b1 << 10 : 0;
 530       encoding |= exp->dest << 4;
 531       encoding |= exp->enabled_mask;
 532       out.push_back(encoding);
 533       encoding = 0xFF & exp->operands[0].physReg();
 534       encoding |= (0xFF & exp->operands[1].physReg()) << 8;
 535       encoding |= (0xFF & exp->operands[2].physReg()) << 16;
 536       encoding |= (0xFF & exp->operands[3].physReg()) << 24;
 537       out.push_back(encoding);
 538       break;
 539    }
 540    case Format::PSEUDO:
 541    case Format::PSEUDO_BARRIER:
 542       if (instr->opcode != aco_opcode::p_unit_test)
 543          unreachable("Pseudo instructions should be lowered before assembly.");
 544       break;
 545    default:
 546       if ((uint16_t) instr->format & (uint16_t) Format::VOP3A) {
 547          VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr);
 548
 549          if ((uint16_t) instr->format & (uint16_t) Format::VOP2) {
 550             opcode = opcode + 0x100;
 551          } else if ((uint16_t) instr->format & (uint16_t) Format::VOP1) {
 552             if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9)
 553                opcode = opcode + 0x140;
 554             else
 555                opcode = opcode + 0x180;
 556          } else if ((uint16_t) instr->format & (uint16_t) Format::VOPC) {
 557             opcode = opcode + 0x0;
 558          } else if ((uint16_t) instr->format & (uint16_t) Format::VINTRP) {
 559             opcode = opcode + 0x270;
 560          }
 561
 562          uint32_t encoding;
 563          if (ctx.chip_class <= GFX9) {
 564             encoding = (0b110100 << 26);
 565          } else if (ctx.chip_class >= GFX10) {
 566             encoding = (0b110101 << 26);
 567          } else {
 568             unreachable("Unknown chip_class.");
 569          }
 570
 571          if (ctx.chip_class <= GFX7) {
 572             encoding |= opcode << 17;
 573             encoding |= (vop3->clamp ? 1 : 0) << 11;
 574          } else {
 575             encoding |= opcode << 16;
 576             encoding |= (vop3->clamp ? 1 : 0) << 15;
 577          }
 578          encoding |= vop3->opsel << 11;
 579          for (unsigned i = 0; i < 3; i++)
 580             encoding |= vop3->abs[i] << (8+i);
 581          if (instr->definitions.size() == 2)
 582             encoding |= instr->definitions[1].physReg() << 8;
 583          encoding |= (0xFF & instr->definitions[0].physReg());
 584          out.push_back(encoding);
 585          encoding = 0;
 586          if (instr->opcode == aco_opcode::v_interp_mov_f32) {
 587             encoding = 0x3 & instr->operands[0].constantValue();
 588          } else {
 589             for (unsigned i = 0; i < instr->operands.size(); i++)
 590                encoding |= instr->operands[i].physReg() << (i * 9);
 591          }
 592          encoding |= vop3->omod << 27;
 593          for (unsigned i = 0; i < 3; i++)
 594             encoding |= vop3->neg[i] << (29+i);
 595          out.push_back(encoding);
 596
 597       } else if (instr->format == Format::VOP3P) {
 598          VOP3P_instruction* vop3 = static_cast<VOP3P_instruction*>(instr);
 599
 600          uint32_t encoding;
 601          if (ctx.chip_class == GFX9) {
 602             encoding = (0b110100111 << 23);
 603          } else if (ctx.chip_class >= GFX10) {
 604             encoding = (0b110011 << 26);
 605          } else {
 606             unreachable("Unknown chip_class.");
 607          }
 608
 609          encoding |= opcode << 16;
 610          encoding |= (vop3->clamp ? 1 : 0) << 15;
 611          encoding |= vop3->opsel_lo << 11;
 612          encoding |= (vop3->opsel_hi & 0x4) ? 1 : 0 << 14;
 613          for (unsigned i = 0; i < 3; i++)
 614             encoding |= vop3->neg_hi[i] << (8+i);
 615          encoding |= (0xFF & instr->definitions[0].physReg());
 616          out.push_back(encoding);
 617          encoding = 0;
 618          for (unsigned i = 0; i < instr->operands.size(); i++)
 619             encoding |= instr->operands[i].physReg() << (i * 9);
 620          encoding |= vop3->opsel_hi & 0x3 << 27;
 621          for (unsigned i = 0; i < 3; i++)
 622             encoding |= vop3->neg_lo[i] << (29+i);
 623          out.push_back(encoding);
 624
 625       } else if (instr->isDPP()){
 626          assert(ctx.chip_class >= GFX8);
 627          /* first emit the instruction without the DPP operand */
 628          Operand dpp_op = instr->operands[0];
 629          instr->operands[0] = Operand(PhysReg{250}, v1);
 630          instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::DPP);
 631          emit_instruction(ctx, out, instr);
 632          DPP_instruction* dpp = static_cast<DPP_instruction*>(instr);
 633          uint32_t encoding = (0xF & dpp->row_mask) << 28;
 634          encoding |= (0xF & dpp->bank_mask) << 24;
 635          encoding |= dpp->abs[1] << 23;
 636          encoding |= dpp->neg[1] << 22;
 637          encoding |= dpp->abs[0] << 21;
 638          encoding |= dpp->neg[0] << 20;
 639          if (ctx.chip_class >= GFX10)
 640             encoding |= 1 << 18; /* set Fetch Inactive to match GFX9 behaviour */
 641          encoding |= dpp->bound_ctrl << 19;
 642          encoding |= dpp->dpp_ctrl << 8;
 643          encoding |= (0xFF) & dpp_op.physReg();
 644          out.push_back(encoding);
 645          return;
 646       } else if (instr->isSDWA()) {
 647          /* first emit the instruction without the SDWA operand */
 648          Operand sdwa_op = instr->operands[0];
 649          instr->operands[0] = Operand(PhysReg{249}, v1);
 650          instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::SDWA);
 651          emit_instruction(ctx, out, instr);
 652
 653          SDWA_instruction* sdwa = static_cast<SDWA_instruction*>(instr);
 654          uint32_t encoding = 0;
 655
 656          if ((uint16_t)instr->format & (uint16_t)Format::VOPC) {
 657             if (instr->definitions[0].physReg() != vcc) {
 658                encoding |= instr->definitions[0].physReg() << 8;
 659                encoding |= 1 << 15;
 660             }
 661             encoding |= (sdwa->clamp ? 1 : 0) << 13;
 662          } else {
 663             encoding |= get_sdwa_sel(sdwa->dst_sel, instr->definitions[0].physReg()) << 8;
 664             uint32_t dst_u = sdwa->dst_sel & sdwa_sext ? 1 : 0;
 665             if (sdwa->dst_preserve || (sdwa->dst_sel & sdwa_isra))
 666                dst_u = 2;
 667             encoding |= dst_u << 11;
 668             encoding |= (sdwa->clamp ? 1 : 0) << 13;
 669             encoding |= sdwa->omod << 14;
 670          }
 671
 672          encoding |= get_sdwa_sel(sdwa->sel[0], sdwa_op.physReg()) << 16;
 673          encoding |= sdwa->sel[0] & sdwa_sext ? 1 << 19 : 0;
 674          encoding |= sdwa->abs[0] << 21;
 675          encoding |= sdwa->neg[0] << 20;
 676
 677          if (instr->operands.size() >= 2) {
 678             encoding |= get_sdwa_sel(sdwa->sel[1], instr->operands[1].physReg()) << 24;
 679             encoding |= sdwa->sel[1] & sdwa_sext ? 1 << 27 : 0;
 680             encoding |= sdwa->abs[1] << 29;
 681             encoding |= sdwa->neg[1] << 28;
 682          }
 683
 684          encoding |= 0xFF & sdwa_op.physReg();
 685          encoding |= (sdwa_op.physReg() < 256) << 23;
 686          if (instr->operands.size() >= 2)
 687             encoding |= (instr->operands[1].physReg() < 256) << 31;
 688          out.push_back(encoding);
 689       } else {
 690          unreachable("unimplemented instruction format");
 691       }
 692       break;
 693    }
 694
 695    /* append literal dword */
 696    for (const Operand& op : instr->operands) {
 697       if (op.isLiteral()) {
 698          out.push_back(op.constantValue());
 699          break;
 700       }
 701    }
 702 }
 703
 704 void emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
 705 {
 706    for (aco_ptr<Instruction>& instr : block.instructions) {
 707 #if 0
 708       int start_idx = out.size();
 709       std::cerr << "Encoding:\t" << std::endl;
 710       aco_print_instr(&*instr, stderr);
 711       std::cerr << std::endl;
 712 #endif
 713       emit_instruction(ctx, out, instr.get());
 714 #if 0
 715       for (int i = start_idx; i < out.size(); i++)
 716          std::cerr << "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex << out[i] << std::endl;
 717 #endif
 718    }
 719 }
 720
 721 void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
 722 {
 723    bool exported = false;
 724    for (Block& block : program->blocks) {
 725       if (!(block.kind & block_kind_export_end))
 726          continue;
 727       std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin();
 728       while ( it != block.instructions.rend())
 729       {
 730          if ((*it)->format == Format::EXP) {
 731             Export_instruction* exp = static_cast<Export_instruction*>((*it).get());
 732             if (program->stage & (hw_vs | hw_ngg_gs)) {
 733                if (exp->dest >= V_008DFC_SQ_EXP_POS && exp->dest <= (V_008DFC_SQ_EXP_POS + 3)) {
 734                   exp->done = true;
 735                   exported = true;
 736                   break;
 737                }
 738             } else {
 739                exp->done = true;
 740                exp->valid_mask = true;
 741                exported = true;
 742                break;
 743             }
 744          } else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec)
 745             break;
 746          ++it;
 747       }
 748    }
 749
 750    if (!exported) {
 751       /* Abort in order to avoid a GPU hang. */
 752       aco_err(program, "Missing export in %s shader:", (program->stage & hw_vs) ? "vertex" : "fragment");
 753       aco_print_program(program, stderr);
 754       abort();
 755    }
 756 }
 757
 758 static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,
 759                         unsigned insert_count, const uint32_t *insert_data)
 760 {
 761    out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count);
 762
 763    /* Update the offset of each affected block */
 764    for (Block& block : ctx.program->blocks) {
 765       if (block.offset >= insert_before)
 766          block.offset += insert_count;
 767    }
 768
 769    /* Find first branch after the inserted code */
 770    auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [insert_before](const auto &branch) -> bool {
 771       return (unsigned)branch.first >= insert_before;
 772    });
 773
 774    /* Update the locations of branches */
 775    for (; branch_it != ctx.branches.end(); ++branch_it)
 776       branch_it->first += insert_count;
 777
 778    /* Find first constant address after the inserted code */
 779    auto caddr_it = std::find_if(ctx.constaddrs.begin(), ctx.constaddrs.end(), [insert_before](const int &caddr_pos) -> bool {
 780       return (unsigned)caddr_pos >= insert_before;
 781    });
 782
 783    /* Update the locations of constant addresses */
 784    for (; caddr_it != ctx.constaddrs.end(); ++caddr_it)
 785       (*caddr_it) += insert_count;
 786 }
 787
 788 static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
 789 {
 790    /* Branches with an offset of 0x3f are buggy on GFX10, we workaround by inserting NOPs if needed. */
 791    bool gfx10_3f_bug = false;
 792
 793    do {
 794       auto buggy_branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [&ctx](const auto &branch) -> bool {
 795          return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == 0x3f;
 796       });
 797
 798       gfx10_3f_bug = buggy_branch_it != ctx.branches.end();
 799
 800       if (gfx10_3f_bug) {
 801          /* Insert an s_nop after the branch */
 802          constexpr uint32_t s_nop_0 = 0xbf800000u;
 803          insert_code(ctx, out, buggy_branch_it->first + 1, 1, &s_nop_0);
 804       }
 805    } while (gfx10_3f_bug);
 806 }
 807
 808 void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, std::vector<uint32_t>& out)
 809 {
 810    Builder bld(ctx.program);
 811
 812    Definition def_tmp_lo(branch->definitions[0].physReg(), s1);
 813    Operand op_tmp_lo(branch->definitions[0].physReg(), s1);
 814    Definition def_tmp_hi(branch->definitions[0].physReg().advance(4), s1);
 815    Operand op_tmp_hi(branch->definitions[0].physReg().advance(4), s1);
 816
 817    aco_ptr<Instruction> instr;
 818
 819    if (branch->opcode != aco_opcode::s_branch) {
 820       /* for conditional branches, skip the long jump if the condition is false */
 821       aco_opcode inv;
 822       switch (branch->opcode) {
 823       case aco_opcode::s_cbranch_scc0:
 824          inv = aco_opcode::s_cbranch_scc1;
 825          break;
 826       case aco_opcode::s_cbranch_scc1:
 827          inv = aco_opcode::s_cbranch_scc0;
 828          break;
 829       case aco_opcode::s_cbranch_vccz:
 830          inv = aco_opcode::s_cbranch_vccnz;
 831          break;
 832       case aco_opcode::s_cbranch_vccnz:
 833          inv = aco_opcode::s_cbranch_vccz;
 834          break;
 835       case aco_opcode::s_cbranch_execz:
 836          inv = aco_opcode::s_cbranch_execnz;
 837          break;
 838       case aco_opcode::s_cbranch_execnz:
 839          inv = aco_opcode::s_cbranch_execz;
 840          break;
 841       default:
 842          unreachable("Unhandled long jump.");
 843       }
 844       instr.reset(bld.sopp(inv, -1, 7));
 845       emit_instruction(ctx, out, instr.get());
 846    }
 847
 848    /* create the new PC and stash SCC in the LSB */
 849    instr.reset(bld.sop1(aco_opcode::s_getpc_b64, branch->definitions[0]).instr);
 850    emit_instruction(ctx, out, instr.get());
 851
 852    instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand(0u)).instr);
 853    instr->operands[1].setFixed(PhysReg{255}); /* this operand has to be a literal */
 854    emit_instruction(ctx, out, instr.get());
 855    branch->pass_flags = out.size();
 856
 857    instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u)).instr);
 858    emit_instruction(ctx, out, instr.get());
 859
 860    /* restore SCC and clear the LSB of the new PC */
 861    instr.reset(bld.sopc(aco_opcode::s_bitcmp1_b32, def_tmp_lo, op_tmp_lo, Operand(0u)).instr);
 862    emit_instruction(ctx, out, instr.get());
 863    instr.reset(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand(0u)).instr);
 864    emit_instruction(ctx, out, instr.get());
 865
 866    /* create the s_setpc_b64 to jump */
 867    instr.reset(bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);
 868    emit_instruction(ctx, out, instr.get());
 869 }
 870
 871 void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
 872 {
 873    bool repeat = false;
 874    do {
 875       repeat = false;
 876
 877       if (ctx.chip_class == GFX10)
 878          fix_branches_gfx10(ctx, out);
 879
 880       for (std::pair<int, SOPP_instruction*> &branch : ctx.branches) {
 881          int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;
 882          if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) {
 883             std::vector<uint32_t> long_jump;
 884             bool backwards = ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;
 885             emit_long_jump(ctx, branch.second, backwards, long_jump);
 886
 887             out[branch.first] = long_jump[0];
 888             insert_code(ctx, out, branch.first + 1, long_jump.size() - 1, long_jump.data() + 1);
 889
 890             repeat = true;
 891             break;
 892          }
 893
 894          if (branch.second->pass_flags) {
 895             int after_getpc = branch.first + branch.second->pass_flags - 2;
 896             offset = (int)ctx.program->blocks[branch.second->block].offset - after_getpc;
 897             out[branch.first + branch.second->pass_flags - 1] = offset * 4;
 898          } else {
 899             out[branch.first] &= 0xffff0000u;
 900             out[branch.first] |= (uint16_t) offset;
 901          }
 902       }
 903    } while (repeat);
 904 }
 905
 906 void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
 907 {
 908    for (unsigned addr : ctx.constaddrs)
 909       out[addr] += (out.size() - addr + 1u) * 4u;
 910 }
 911
 912 unsigned emit_program(Program* program,
 913                       std::vector<uint32_t>& code)
 914 {
 915    asm_context ctx(program);
 916
 917    if (program->stage & (hw_vs | hw_fs | hw_ngg_gs))
 918       fix_exports(ctx, code, program);
 919
 920    for (Block& block : program->blocks) {
 921       block.offset = code.size();
 922       emit_block(ctx, code, block);
 923    }
 924
 925    fix_branches(ctx, code);
 926
 927    unsigned exec_size = code.size() * sizeof(uint32_t);
 928
 929    if (program->chip_class >= GFX10) {
 930       /* Pad output with s_code_end so instruction prefetching doesn't cause
 931        * page faults */
 932       unsigned final_size = align(code.size() + 3 * 16, 16);
 933       while (code.size() < final_size)
 934          code.push_back(0xbf9f0000u);
 935    }
 936
 937    fix_constaddrs(ctx, code);
 938
 939    while (program->constant_data.size() % 4u)
 940       program->constant_data.push_back(0);
 941    /* Copy constant data */
 942    code.insert(code.end(), (uint32_t*)program->constant_data.data(),
 943                (uint32_t*)(program->constant_data.data() + program->constant_data.size()));
 944
 945    return exec_size;
 946 }
 947
 948 }